Guess keywords in uploaded notes
author     Charles Connell <charles@connells.org>
Thu, 1 May 2014 18:59:36 +0000 (14:59 -0400)
committer  Charles Connell <charles@connells.org>
Thu, 1 May 2014 18:59:36 +0000 (14:59 -0400)
fabfile.py
karmaworld/apps/notes/gdrive.py
karmaworld/apps/notes/views.py
karmaworld/apps/quizzes/find_keywords.py [new file with mode: 0644]
reqs/common.txt

index 7bee315b90ce1c9dc12423a519b307479c8189b5..460dfa65ed153a28aa9dc09070f1f987bf9dae64 100644 (file)
@@ -287,6 +287,16 @@ def import_usde():
     virtenv_exec('{0}/manage.py import_usde_csv {1}'.format(env.code_root, env.usde_csv))
     virtenv_exec('{0}/manage.py sanitize_usde_schools'.format(env.code_root))
 
+@task
+def nltk_download():
+    """
+    Initialize the NLTK data used by find_keywords: POS tagger, punkt tokenizer, stopwords.
+    """
+    virtenv_exec('python -c "import nltk\n'
+                 'nltk.download(\'maxent_treebank_pos_tagger\')\n'
+                 'nltk.download(\'punkt\')\n'
+                 'nltk.download(\'stopwords\')"')
+
 @task
 def first_deploy():
     """
@@ -300,6 +310,7 @@ def first_deploy():
     syncdb()
     compress_static()
     collect_static()
+    nltk_download()
     fetch_usde()
     import_usde()
     flush_memcache()
@@ -317,6 +328,7 @@ def deploy():
     syncdb()
     compress_static()
     collect_static()
+    nltk_download()
     flush_memcache()
     restart_supervisord()
 ########## END COMMANDS
index b5977222760ed5b4e491bbe9d4f585050ff8016d..4f4df55fa3a58463e732ed82de52381a394dc453 100644 (file)
@@ -9,6 +9,8 @@ from django.conf import settings
 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
 from karmaworld.apps.notes.models import UserUploadMapping
 from karmaworld.apps.notes.models import NoteMarkdown
+from karmaworld.apps.quizzes.find_keywords import find_keywords
+from karmaworld.apps.quizzes.models import Keyword
 from karmaworld.apps.users.models import NoteKarmaEvent
 import os
 import subprocess
@@ -238,6 +240,10 @@ def convert_raw_document(raw_document, user=None):
         note_markdown = NoteMarkdown(note=note, markdown=markdown)
         note_markdown.save()
 
+    # Guess some keywords from the note text
+    keywords = find_keywords(note.text)
+    for word in keywords:
+        Keyword.objects.create(word=word, note=note)
 
     # If we know the user who uploaded this,
     # associate them with the note
index 97e88439fadec702f357aefb87724994b266d7aa..a0b3d8ac97198d770c4650713a984499a16eb2d3 100644 (file)
@@ -11,6 +11,7 @@ from django.core.exceptions import ValidationError
 from django.forms.formsets import formset_factory
 from karmaworld.apps.courses.models import Course
 from karmaworld.apps.notes.search import SearchIndex
+from karmaworld.apps.quizzes.find_keywords import find_keywords
 from karmaworld.apps.quizzes.forms import KeywordForm
 from karmaworld.apps.quizzes.models import Keyword
 from karmaworld.apps.users.models import NoteKarmaEvent
diff --git a/karmaworld/apps/quizzes/find_keywords.py b/karmaworld/apps/quizzes/find_keywords.py
new file mode 100644 (file)
index 0000000..ea6f340
--- /dev/null
@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2014  FinalsClub Foundation
+from __future__ import division
+from collections import defaultdict
+import nltk
+import itertools
+from operator import itemgetter
+from pygraph.classes.digraph import digraph
+from pygraph.algorithms.pagerank import pagerank
+from pygraph.classes.exceptions import AdditionError
+
+
+def _filter(tagged, tags=('NN', 'JJ', 'NNP')):
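+    """
+    Keep only nouns, adjectives, and proper nouns; drop English
+    stopwords; strip periods (e.g. from abbreviations).
+    """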
+    pos_filtered = [item[0] for item in tagged if item[1] in tags]
+    stopwords_filtered = [word.lower() for word in pos_filtered if not word.lower() in nltk.corpus.stopwords.words('english')]
+    remove_punc = [item.replace('.', '') for item in stopwords_filtered]
+    return remove_punc
+
+
+def _normalize(words):
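+    """Lowercase every word and strip periods, mirroring _filter."""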
+    lower = [word.lower() for word in words]
+    remove_punc = [item.replace('.', '') for item in lower]
+    return remove_punc
+
+
+def _unique_everseen(iterable, key=None):
+    "List unique elements, preserving order. Remember all elements ever seen."
+    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
+    # unique_everseen('ABBCcAD', str.lower) --> A B C D
+    seen = set()
+    seen_add = seen.add
+    if key is None:
+        for element in itertools.ifilterfalse(seen.__contains__, iterable):
+            seen_add(element)
+            yield element
+    else:
+        for element in iterable:
+            k = key(element)
+            if k not in seen:
+                seen_add(k)
+                yield element
+
+
+def _common_ngrams(normalized_words, top_ordered_keywords, n):
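+    """
+    Return n-grams built solely from top keywords that occur contiguously
+    in the text. An n-gram is kept when, for at least one of its words,
+    more than half of that word's occurrences fall inside the n-gram.
+    """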
+    ngrams_in_top_keywords = set()
+    common_ngrams = []
+
+    for ngram_words in itertools.product(top_ordered_keywords, repeat=n):
+        target_ngram = list(ngram_words)
+        for i in range(len(normalized_words)):
+            ngram = normalized_words[i:i+n]
+            if target_ngram == ngram:
+                ngrams_in_top_keywords.add(tuple(target_ngram))
+
+    for words in ngrams_in_top_keywords:
+        words_usage_in_ngram = 0
+        individual_word_usage = defaultdict(lambda: 0)
+        for i in range(len(normalized_words)):
+            for word in words:
+                if normalized_words[i] == word:
+                    individual_word_usage[word] += 1
+            if normalized_words[i:i+n] == list(words):
+                words_usage_in_ngram += 1
+
+        for word in words:
+            ratio = words_usage_in_ngram / individual_word_usage[word]
+            if ratio > 0.5:
+                common_ngrams.append(words)
+                break
+
+    return common_ngrams
+
+
+def find_keywords(document, word_count=10):
+    """
+    Credit to https://gist.github.com/voidfiles/1646117
+    and http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
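+
+    A usage sketch (keywords returned depend on the installed NLTK data):
+
+        keywords = find_keywords(note.text, word_count=10)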
+    """
+    sentences = nltk.sent_tokenize(document)
+    candidate_words = []
+    all_words = []
+    for sentence in sentences:
+        words = nltk.word_tokenize(sentence)
+        all_words.extend(words)
+        tagged_words = nltk.pos_tag(words)
+        filtered_words = _filter(tagged_words)
+        candidate_words.extend(filtered_words)
+
+    unique_word_set = _unique_everseen(candidate_words)
+
+    gr = digraph()
+    gr.add_nodes(list(unique_word_set))
+
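+    # Slide a two-word window across the candidate words, adding an edge
+    # between co-occurring words to build the TextRank graph.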
+    window_start = 0
+    window_end = 2
+
+    while 1:
+        window_words = candidate_words[window_start:window_end]
+        if len(window_words) == 2:
+            try:
+                gr.add_edge((window_words[0], window_words[1]))
+            except AdditionError:
+                pass
+        else:
+            break
+
+        window_start += 1
+        window_end += 1
+
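+    # Score each word with PageRank over the graph, highest score first.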
+    calculated_page_rank = pagerank(gr)
+    di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1), reverse=True)
+    all_ordered_keywords = [w[0] for w in di]
+    top_ordered_keywords = all_ordered_keywords[:word_count]
+
+    normalized_words = _normalize(all_words)
+
+    common_bigrams = _common_ngrams(normalized_words, top_ordered_keywords, 2)
+    common_trigrams = _common_ngrams(normalized_words, top_ordered_keywords, 3)
+    for words in common_bigrams + common_trigrams:
+        for word in words:
+            # A word can appear in more than one common n-gram (or twice
+            # in the same one), so guard against removing it twice.
+            if word in top_ordered_keywords:
+                top_ordered_keywords.remove(word)
+        top_ordered_keywords.insert(0, ' '.join(words))
+
+    return top_ordered_keywords
index f6059aefaf1dbf8e5a15ca7508996ecf47de67f7..c03a7fb593eb379e4188c04c25d62cf44ea80894 100644 (file)
@@ -27,3 +27,6 @@ git+https://github.com/btbonval/django-ajax-selects-cascade.git
 psycopg2
 git+https://github.com/Soaa-/django-nested-inlines.git
 pyth
+http://python-graph.googlecode.com/files/python-graph-core-1.8.2.tar.gz
+nltk
+numpy