From 05ac12bb0e90baff74141ddaa535b77176c49d35 Mon Sep 17 00:00:00 2001 From: Charles Connell Date: Sat, 4 Jan 2014 21:34:10 -0500 Subject: [PATCH] Access search index through singleton object, more selective index updates --- karmaworld/apps/notes/models.py | 26 +++-- karmaworld/apps/notes/search.py | 183 ++++++++++++++++++++++---------- karmaworld/apps/notes/views.py | 12 +-- 3 files changed, 150 insertions(+), 71 deletions(-) diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py index 100bd3e..686ec82 100644 --- a/karmaworld/apps/notes/models.py +++ b/karmaworld/apps/notes/models.py @@ -8,7 +8,7 @@ """ import datetime from django.db.models import SET_NULL -from django.db.models.signals import post_save, post_delete +from django.db.models.signals import post_save, post_delete, pre_save from django.dispatch import receiver import os import urllib @@ -24,7 +24,7 @@ from taggit.managers import TaggableManager from karmaworld.apps.courses.models import Course from karmaworld.apps.licenses.models import License -import karmaworld.apps.notes.search as search +from karmaworld.apps.notes.search import SearchIndex fs = FileSystemStorage(location=settings.MEDIA_ROOT) @@ -229,19 +229,28 @@ def update_note_counts(note_instance): note_instance.course.update_note_count() note_instance.course.school.update_note_count() +@receiver(pre_save, sender=Note, weak=False) +def note_pre_save_receiver(sender, **kwargs): + """Stick an instance of the pre-save value of + the given Note instance in the instance itself. 
+ This will be looked at in post_save.""" + if not 'instance' in kwargs: + return + + kwargs['instance'].old_instance = Note.objects.get(id=kwargs['instance'].id) + @receiver(post_save, sender=Note, weak=False) def note_save_receiver(sender, **kwargs): if not 'instance' in kwargs: return note = kwargs['instance'] - # Update course and school counts of how - # many notes they have + index = SearchIndex() if kwargs['created']: update_note_counts(note) - - # Add or update document in search index - search.add_document(note) + index.add_note(note) + else: + index.update_note(note, note.old_instance) @receiver(post_delete, sender=Note, weak=False) @@ -255,4 +264,5 @@ def note_delete_receiver(sender, **kwargs): update_note_counts(kwargs['instance']) # Remove document from search index - search.remove_document(note) + index = SearchIndex() + index.remove_note(note) diff --git a/karmaworld/apps/notes/search.py b/karmaworld/apps/notes/search.py index ad62c6b..d578f5d 100644 --- a/karmaworld/apps/notes/search.py +++ b/karmaworld/apps/notes/search.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding:utf8 -*- # Copyright (C) 2013 FinalsClub Foundation + import calendar import time @@ -14,70 +15,138 @@ PAGE_SIZE = 10 logging.basicConfig() logger = logging.getLogger(__name__) -api_client = itc.ApiClient(secret.PRIVATE_URL) -if not api_client.get_index(secret.INDEX).exists(): - api_client.create_index(secret.INDEX, {'public_search': False}) - -index = api_client.get_index(secret.INDEX) - -while not index.has_started(): - time.sleep(0.5) - -# Default scoring function -# Results are sorted by combination of "relevance" -# and number of thanks they have received. -# "Relevance" is a black box provided by IndexDen. -index.add_function(0, 'relevance * log(doc.var[0])') class SearchResult(object): + """The result of making a query into IndexDen. 
+ @param ordered_ids A list of the note IDs found, in order they + should be displayed + @param snippet_dict A dictionary mapping note IDs to snippets + to show in search results + @param has_more A boolean indicating if the user should + request more results by increasing + the page number of the query.""" def __init__(self, ordered_ids, snippet_dict, has_more): self.ordered_ids = ordered_ids self.snippet_dict = snippet_dict self.has_more = has_more -def note_to_dict(note): - d = { - 'name': note.name, - 'text': note.text - } - - if note.tags.exists(): - d['tags'] = ' '.join([str(tag) for tag in note.tags.all()]) - - if note.course: - d['course_id'] = note.course.id - - if note.uploaded_at: - d['timestamp'] = calendar.timegm(note.uploaded_at.timetuple()) - - return d - -def add_document(note): - if note.text: - logger.info("Indexing {n}".format(n=note)) - index.add_document(note.id, note_to_dict(note), variables={0: note.thanks}) - else: - logger.warn("Note {n} has no text, will not add to IndexDen".format(n=note)) - -def remove_document(note): - index.delete_document(note.id) - -def search(query, course_id=None, page=0): - """Returns note IDs matching the given query, - filtered by course ID if given""" - if course_id: - real_query = '("%s" OR name:"%s") AND course_id:%s' % (query, query, course_id) - else: - real_query = '"%s" OR name:"%s"' % (query, query) - - raw_results = index.search(real_query, snippet_fields=['text'], - length=PAGE_SIZE, start=(page*PAGE_SIZE)) - - ordered_ids = [int(r['docid']) for r in raw_results['results']] - snippet_dict = {int(r['docid']): r['snippet_text'] for r in raw_results['results']} - # Are there more results to show the user if they want? - has_more = True if int(raw_results['matches']) > ((page+1) * PAGE_SIZE) else False +class Singleton(type): + """Set this as the metaclass of another + class to ensure that it will only have one instance. 
+ Borrowed from + http://stackoverflow.com/questions/6760685/creating-a-singleton-in-python""" + + _instances = {} + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class SearchIndex(object): + """A singleton class used to interface with the IndexDen + search index.""" - return SearchResult(ordered_ids, snippet_dict, has_more) + __metaclass__ = Singleton + + def __init__(self): + api_client = itc.ApiClient(secret.PRIVATE_URL) + if not api_client.get_index(secret.INDEX).exists(): + api_client.create_index(secret.INDEX, {'public_search': False}) + + self.index = api_client.get_index(secret.INDEX) + + while not self.index.has_started(): + time.sleep(0.5) + + # Default scoring function + # Results are sorted by combination of "relevance" + # and number of thanks they have received. + # "Relevance" is a black box provided by IndexDen. + self.index.add_function(0, 'relevance * log(doc.var[0])') + + @staticmethod + def _tags_to_str(tags): + return ' '.join([str(tag) for tag in tags.all()]) + + @staticmethod + def _note_to_dict(note): + d = { + 'name': note.name, + 'text': note.text + } + + if note.tags.exists(): + d['tags'] = SearchIndex._tags_to_str(note.tags) + + if note.course: + d['course_id'] = note.course.id + + if note.uploaded_at: + d['timestamp'] = calendar.timegm(note.uploaded_at.timetuple()) + + return d + + def add_note(self, note): + """Add a note to the index. If the note is + already in the index, it will be overwritten.""" + if note.text: + logger.info("Indexing {n}".format(n=note)) + self.index.add_document(note.id, SearchIndex._note_to_dict(note), variables={0: note.thanks}) + else: + logger.info("Note {n} has no text, will not add to IndexDen".format(n=note)) + + def update_note(self, new_note, old_note): + """Update a note. Will only truly update the search + index if it needs to. 
Compares the fields in new_note with + old_note to see what has changed.""" + if not new_note.text: + logger.info("Note {n} has no text, will not add to IndexDen".format(n=new_note)) + return + + # If the indexable fields have changed, + # send the document to IndexDen again + if new_note.text != old_note.text or \ + new_note.name != old_note.name or \ + SearchIndex._tags_to_str(new_note.tags) != SearchIndex._tags_to_str(old_note.tags) or \ + new_note.course != old_note.course or \ + new_note.uploaded_at != old_note.uploaded_at: + logger.info("Indexing {n}".format(n=new_note)) + self.index.add_document(new_note.id, SearchIndex._note_to_dict(new_note), variables={0: new_note.thanks}) + + # If only the thanks count has changed, we can + # just send that + elif new_note.thanks != old_note.thanks: + logger.info("Indexing thanks variable for {n}".format(n=new_note)) + self.index.update_variables(new_note.id, variables={0: new_note.thanks}) + + # Otherwise we don't need to do anything + else: + logger.info("Note {n} has not changed sufficiently, will not update IndexDen".format(n=new_note)) + + def remove_note(self, note): + """Remove a note from the search index.""" + + logger.info("Removing from index: {n}".format(n=note)) + self.index.delete_document(note.id) + + def search(self, query, course_id=None, page=0): + """Returns an instance of SearchResult for your query.""" + + if course_id: + real_query = '("%s" OR name:"%s") AND course_id:%s' % (query, query, course_id) + else: + real_query = '"%s" OR name:"%s"' % (query, query) + + raw_results = self.index.search(real_query, snippet_fields=['text'], + length=PAGE_SIZE, start=(page*PAGE_SIZE)) + + ordered_ids = [int(r['docid']) for r in raw_results['results']] + snippet_dict = {int(r['docid']): r['snippet_text'] for r in raw_results['results']} + + # Are there more results to show the user if they want? 
+ has_more = True if int(raw_results['matches']) > ((page+1) * PAGE_SIZE) else False + + return SearchResult(ordered_ids, snippet_dict, has_more) diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py index 43f42a1..502356f 100644 --- a/karmaworld/apps/notes/views.py +++ b/karmaworld/apps/notes/views.py @@ -1,22 +1,20 @@ #!/usr/bin/env python # -*- coding:utf8 -*- # Copyright (C) 2012 FinalsClub Foundation + import json from django.core.exceptions import ObjectDoesNotExist from karmaworld.apps.courses.models import Course -from karmaworld.apps.notes import search +from karmaworld.apps.notes.search import SearchIndex import os from django.conf import settings -from django.contrib.sites.models import Site from django.http import HttpResponse, HttpResponseBadRequest, HttpResponseNotFound from django.views.generic import DetailView, ListView from django.views.generic import FormView from django.views.generic import View -from django.views.generic import TemplateView from django.views.generic.detail import SingleObjectMixin -from django.shortcuts import get_object_or_404, render_to_response from karmaworld.apps.notes.models import Note from karmaworld.apps.notes.forms import NoteForm @@ -173,12 +171,14 @@ class NoteSearchView(ListView): else: page = 0 + index = SearchIndex() + if 'course_id' in self.request.GET: - raw_results = search.search(self.request.GET['query'], + raw_results = index.search(self.request.GET['query'], self.request.GET['course_id'], page=page) else: - raw_results = search.search(self.request.GET['query'], + raw_results = index.search(self.request.GET['query'], page=page) instances = Note.objects.in_bulk(raw_results.ordered_ids) -- 2.25.1