Access search index through singleton object, more selective index updates
author Charles Connell <charles@connells.org>
Sun, 5 Jan 2014 02:34:10 +0000 (21:34 -0500)
committer Charles Connell <charles@connells.org>
Sun, 5 Jan 2014 02:34:10 +0000 (21:34 -0500)
karmaworld/apps/notes/models.py
karmaworld/apps/notes/search.py
karmaworld/apps/notes/views.py

index 100bd3e2708e35668414f1f563496cb4fcc02e23..686ec8290feb2c56fa1cf59f813c4a85b6cf55bd 100644 (file)
@@ -8,7 +8,7 @@
 """
 import datetime
 from django.db.models import SET_NULL
-from django.db.models.signals import post_save, post_delete
+from django.db.models.signals import post_save, post_delete, pre_save
 from django.dispatch import receiver
 import os
 import urllib
@@ -24,7 +24,7 @@ from taggit.managers import TaggableManager
 
 from karmaworld.apps.courses.models import Course
 from karmaworld.apps.licenses.models import License
-import karmaworld.apps.notes.search as search
+from karmaworld.apps.notes.search import SearchIndex
 
 fs = FileSystemStorage(location=settings.MEDIA_ROOT)
 
@@ -229,19 +229,28 @@ def update_note_counts(note_instance):
         note_instance.course.update_note_count()
         note_instance.course.school.update_note_count()
 
+@receiver(pre_save, sender=Note, weak=False)
+def note_pre_save_receiver(sender, **kwargs):
+    """Attach the stored, pre-save copy of the Note to the instance as
+    `old_instance` (None if it has no row yet) for use in post_save."""
+    if 'instance' not in kwargs:
+        return
+    note = kwargs['instance']
+    # filter() instead of get(): a brand-new note must not raise DoesNotExist
+    note.old_instance = next(iter(Note.objects.filter(id=note.id)[:1]), None)
+
 @receiver(post_save, sender=Note, weak=False)
 def note_save_receiver(sender, **kwargs):
     if not 'instance' in kwargs:
         return
     note = kwargs['instance']
 
-    # Update course and school counts of how
-    # many notes they have
+    index = SearchIndex()
     if kwargs['created']:
         update_note_counts(note)
-
-    # Add or update document in search index
-    search.add_document(note)
+        index.add_note(note)
+    else:
+        index.update_note(note, note.old_instance)
 
 
 @receiver(post_delete, sender=Note, weak=False)
@@ -255,4 +264,5 @@ def note_delete_receiver(sender, **kwargs):
     update_note_counts(kwargs['instance'])
 
     # Remove document from search index
-    search.remove_document(note)
+    index = SearchIndex()
+    index.remove_note(note)
index ad62c6b1b850ccd2439add8bc0481c78f01aa0dc..d578f5db9c5c95a54b99e1a0374e9de3f9428aa5 100644 (file)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding:utf8 -*-
 # Copyright (C) 2013  FinalsClub Foundation
+
 import calendar
 import time
 
@@ -14,70 +15,138 @@ PAGE_SIZE = 10
 logging.basicConfig()
 logger = logging.getLogger(__name__)
 
-api_client = itc.ApiClient(secret.PRIVATE_URL)
-if not api_client.get_index(secret.INDEX).exists():
-    api_client.create_index(secret.INDEX, {'public_search': False})
-
-index = api_client.get_index(secret.INDEX)
-
-while not index.has_started():
-    time.sleep(0.5)
-
-# Default scoring function
-# Results are sorted by combination of "relevance"
-# and number of thanks they have received.
-# "Relevance" is a black box provided by IndexDen.
-index.add_function(0, 'relevance * log(doc.var[0])')
 
 class SearchResult(object):
+    """The result of making a query into IndexDen.
+    @param ordered_ids A list of the note IDs found, in order they
+                       should be displayed
+    @param snippet_dict A dictionary mapping note IDs to snippets
+                        to show in search results
+    @param has_more A boolean indicating if the user should
+                    request more results by increasing
+                    the page number of the query."""
 
     def __init__(self, ordered_ids, snippet_dict, has_more):
         self.ordered_ids = ordered_ids
         self.snippet_dict = snippet_dict
         self.has_more = has_more
 
-def note_to_dict(note):
-    d = {
-        'name': note.name,
-        'text': note.text
-    }
-
-    if note.tags.exists():
-        d['tags'] = ' '.join([str(tag) for tag in note.tags.all()])
-
-    if note.course:
-        d['course_id'] = note.course.id
-
-    if note.uploaded_at:
-        d['timestamp'] = calendar.timegm(note.uploaded_at.timetuple())
-
-    return d
-
-def add_document(note):
-    if note.text:
-        logger.info("Indexing {n}".format(n=note))
-        index.add_document(note.id, note_to_dict(note), variables={0: note.thanks})
-    else:
-        logger.warn("Note {n} has no text, will not add to IndexDen".format(n=note))
-
-def remove_document(note):
-    index.delete_document(note.id)
-
-def search(query, course_id=None, page=0):
-    """Returns note IDs matching the given query,
-    filtered by course ID if given"""
-    if course_id:
-        real_query = '("%s" OR name:"%s") AND course_id:%s' % (query, query, course_id)
-    else:
-        real_query = '"%s" OR name:"%s"' % (query, query)
-
-    raw_results = index.search(real_query, snippet_fields=['text'],
-                               length=PAGE_SIZE, start=(page*PAGE_SIZE))
-
-    ordered_ids = [int(r['docid']) for r in raw_results['results']]
-    snippet_dict = {int(r['docid']): r['snippet_text'] for r in raw_results['results']}
 
-    # Are there more results to show the user if they want?
-    has_more = True if int(raw_results['matches']) > ((page+1) * PAGE_SIZE) else False
+class Singleton(type):
+    """Set this as the metaclass of another
+    class to ensure that it will only have one instance.
+    Borrowed from
+    http://stackoverflow.com/questions/6760685/creating-a-singleton-in-python"""
+
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+
+class SearchIndex(object):
+    """A singleton class used to interface with the IndexDen
+    search index."""
 
-    return SearchResult(ordered_ids, snippet_dict, has_more)
+    __metaclass__ = Singleton
+
+    def __init__(self):
+        api_client = itc.ApiClient(secret.PRIVATE_URL)
+        if not api_client.get_index(secret.INDEX).exists():
+            api_client.create_index(secret.INDEX, {'public_search': False})
+
+        self.index = api_client.get_index(secret.INDEX)
+
+        while not self.index.has_started():
+            time.sleep(0.5)
+
+        # Default scoring function
+        # Results are sorted by combination of "relevance"
+        # and number of thanks they have received.
+        # "Relevance" is a black box provided by IndexDen.
+        self.index.add_function(0, 'relevance * log(doc.var[0])')
+
+    @staticmethod
+    def _tags_to_str(tags):
+        return ' '.join([str(tag) for tag in tags.all()])
+
+    @staticmethod
+    def _note_to_dict(note):
+        d = {
+            'name': note.name,
+            'text': note.text
+        }
+
+        if note.tags.exists():
+            d['tags'] = SearchIndex._tags_to_str(note.tags)
+
+        if note.course:
+            d['course_id'] = note.course.id
+
+        if note.uploaded_at:
+            d['timestamp'] = calendar.timegm(note.uploaded_at.timetuple())
+
+        return d
+
+    def add_note(self, note):
+        """Add a note to the index. If the note is
+        already in the index, it will be overwritten."""
+        if note.text:
+            logger.info("Indexing {n}".format(n=note))
+            self.index.add_document(note.id, SearchIndex._note_to_dict(note), variables={0: note.thanks})
+        else:
+            logger.info("Note {n} has no text, will not add to IndexDen".format(n=note))
+
+    def update_note(self, new_note, old_note):
+        """Update a note. Will only truly update the search
+        index if it needs to. Compares the fields in new_note with
+        old_note to see what has changed."""
+        if not new_note.text:
+            logger.info("Note {n} has no text, will not add to IndexDen".format(n=new_note))
+            return
+
+        # If the indexable fields have changed,
+        # send the document to IndexDen again
+        if new_note.text != old_note.text or \
+            new_note.name != old_note.name or \
+            SearchIndex._tags_to_str(new_note.tags) != SearchIndex._tags_to_str(old_note.tags) or \
+            new_note.course != old_note.course or \
+            new_note.uploaded_at != old_note.uploaded_at:
+            logger.info("Indexing {n}".format(n=new_note))
+            self.index.add_document(new_note.id, SearchIndex._note_to_dict(new_note), variables={0: new_note.thanks})
+
+        # If only the thanks count has changed, we can
+        # just send that
+        elif new_note.thanks != old_note.thanks:
+            logger.info("Indexing thanks variable for {n}".format(n=new_note))
+            self.index.update_variables(new_note.id, variables={0: new_note.thanks})
+
+        # Otherwise we don't need to do anything
+        else:
+            logger.info("Note {n} has not changed sufficiently, will not update IndexDen".format(n=new_note))
+
+    def remove_note(self, note):
+        """Remove a note from the search index."""
+
+        logger.info("Removing from index: {n}".format(n=note))
+        self.index.delete_document(note.id)
+
+    def search(self, query, course_id=None, page=0):
+        """Return a SearchResult for `query`, optionally limited to the
+        course `course_id`; `page` is the zero-based page of PAGE_SIZE hits."""
+        # The query is wrapped in double quotes below, so a quote typed by
+        # the user would unbalance or rewrite the expression; drop them.
+        query = query.replace('"', ' ')
+        if course_id:
+            real_query = '("%s" OR name:"%s") AND course_id:%s' % (query, query, course_id)
+        else:
+            real_query = '"%s" OR name:"%s"' % (query, query)
+
+        raw_results = self.index.search(real_query, snippet_fields=['text'],
+                                        length=PAGE_SIZE, start=(page*PAGE_SIZE))
+        ordered_ids = [int(r['docid']) for r in raw_results['results']]
+        snippet_dict = {int(r['docid']): r['snippet_text'] for r in raw_results['results']}
+        # More results remain when the match total runs past this page.
+        has_more = int(raw_results['matches']) > ((page+1) * PAGE_SIZE)
+        return SearchResult(ordered_ids, snippet_dict, has_more)
index 43f42a1e683a499a5020fe9d15002a90410fdb71..502356f2c3c4dc8201639f17702272203bc09995 100644 (file)
@@ -1,22 +1,20 @@
 #!/usr/bin/env python
 # -*- coding:utf8 -*-
 # Copyright (C) 2012  FinalsClub Foundation
+
 import json
 from django.core.exceptions import ObjectDoesNotExist
 from karmaworld.apps.courses.models import Course
-from karmaworld.apps.notes import search
+from karmaworld.apps.notes.search import SearchIndex
 
 import os
 
 from django.conf import settings
-from django.contrib.sites.models import Site
 from django.http import HttpResponse, HttpResponseBadRequest, HttpResponseNotFound
 from django.views.generic import DetailView, ListView
 from django.views.generic import FormView
 from django.views.generic import View
-from django.views.generic import TemplateView
 from django.views.generic.detail import SingleObjectMixin
-from django.shortcuts import get_object_or_404, render_to_response
 
 from karmaworld.apps.notes.models import Note
 from karmaworld.apps.notes.forms import NoteForm
@@ -173,12 +171,14 @@ class NoteSearchView(ListView):
         else:
             page = 0
 
+        index = SearchIndex()
+
         if 'course_id' in self.request.GET:
-            raw_results = search.search(self.request.GET['query'],
+            raw_results = index.search(self.request.GET['query'],
                                               self.request.GET['course_id'],
                                               page=page)
         else:
-            raw_results = search.search(self.request.GET['query'],
+            raw_results = index.search(self.request.GET['query'],
                                         page=page)
 
         instances = Note.objects.in_bulk(raw_results.ordered_ids)