From: Charles Connell
Date: Thu, 1 May 2014 18:59:36 +0000 (-0400)
Subject: Guess keywords in uploaded notes
X-Git-Tag: release-20150131~101
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=a36f9d9429976ed2551b8e7a9dcc9483629db6b5;p=oweals%2Fkarmaworld.git

Guess keywords in uploaded notes
---

diff --git a/fabfile.py b/fabfile.py
index 7bee315..460dfa6 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -287,6 +287,14 @@ def import_usde():
     virtenv_exec('{0}/manage.py import_usde_csv {1}'.format(env.code_root, env.usde_csv))
     virtenv_exec('{0}/manage.py sanitize_usde_schools'.format(env.code_root))
 
+@task
+def nltk_download():
+    """
+    Initialize corpora used by NLTK
+    """
+    virtenv_exec('python -c "import nltk\n'
+                 'nltk.download(\'maxent_treebank_pos_tagger\')"')
+
 @task
 def first_deploy():
     """
@@ -300,6 +308,7 @@ def first_deploy():
     syncdb()
     compress_static()
     collect_static()
+    nltk_download()
     fetch_usde()
     import_usde()
     flush_memcache()
@@ -317,6 +326,7 @@ def deploy():
     syncdb()
     compress_static()
     collect_static()
+    nltk_download()
     flush_memcache()
     restart_supervisord()
 ########## END COMMANDS
diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py
index b597722..4f4df55 100644
--- a/karmaworld/apps/notes/gdrive.py
+++ b/karmaworld/apps/notes/gdrive.py
@@ -9,6 +9,8 @@ from django.conf import settings
 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
 from karmaworld.apps.notes.models import UserUploadMapping
 from karmaworld.apps.notes.models import NoteMarkdown
+from karmaworld.apps.quizzes.find_keywords import find_keywords
+from karmaworld.apps.quizzes.models import Keyword
 from karmaworld.apps.users.models import NoteKarmaEvent
 import os
 import subprocess
@@ -238,6 +240,10 @@ def convert_raw_document(raw_document, user=None):
         note_markdown = NoteMarkdown(note=note, markdown=markdown)
         note_markdown.save()
 
+    # Guess some keywords from the note text
+    keywords = find_keywords(note.text)
+    for word in keywords:
+        Keyword.objects.create(word=word, note=note)
 
     # If we know the user who uploaded this,
     # associate them with the note
diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py
index 97e8843..a0b3d8a 100644
--- a/karmaworld/apps/notes/views.py
+++ b/karmaworld/apps/notes/views.py
@@ -11,6 +11,7 @@ from django.core.exceptions import ValidationError
 from django.forms.formsets import formset_factory
 from karmaworld.apps.courses.models import Course
 from karmaworld.apps.notes.search import SearchIndex
+from karmaworld.apps.quizzes.find_keywords import find_keywords
 from karmaworld.apps.quizzes.forms import KeywordForm
 from karmaworld.apps.quizzes.models import Keyword
 from karmaworld.apps.users.models import NoteKarmaEvent
diff --git a/karmaworld/apps/quizzes/find_keywords.py b/karmaworld/apps/quizzes/find_keywords.py
new file mode 100644
index 0000000..ea6f340
--- /dev/null
+++ b/karmaworld/apps/quizzes/find_keywords.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2014 FinalsClub Foundation
+from __future__ import division
+from collections import defaultdict
+import nltk
+import itertools
+from operator import itemgetter
+from pygraph.classes.digraph import digraph
+from pygraph.algorithms.pagerank import pagerank
+from pygraph.classes.exceptions import AdditionError
+
+
+def _filter(tagged, tags=('NN', 'JJ', 'NNP')):
+    pos_filtered = [item[0] for item in tagged if item[1] in tags]
+    stopwords_filtered = [word.lower() for word in pos_filtered if word.lower() not in nltk.corpus.stopwords.words('english')]
+    remove_punc = [item.replace('.', '') for item in stopwords_filtered]
+    return remove_punc
+
+
+def _normalize(words):
+    lower = [word.lower() for word in words]
+    remove_punc = [item.replace('.', '') for item in lower]
+    return remove_punc
+
+
+def _unique_everseen(iterable, key=None):
+    "List unique elements, preserving order. Remember all elements ever seen."
+    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
+    # unique_everseen('ABBCcAD', str.lower) --> A B C D
+    seen = set()
+    seen_add = seen.add
+    if key is None:
+        for element in itertools.ifilterfalse(seen.__contains__, iterable):
+            seen_add(element)
+            yield element
+    else:
+        for element in iterable:
+            k = key(element)
+            if k not in seen:
+                seen_add(k)
+                yield element
+
+
+def _common_ngrams(normalized_words, top_ordered_keywords, n):
+    ngrams_in_top_keywords = set()
+    common_ngrams = []
+
+    for ngram_words in itertools.product(top_ordered_keywords, repeat=n):
+        target_ngram = list(ngram_words)
+        for i in range(len(normalized_words)):
+            ngram = normalized_words[i:i+n]
+            if target_ngram == ngram:
+                ngrams_in_top_keywords.add(tuple(target_ngram))
+
+    for words in ngrams_in_top_keywords:
+        words_usage_in_ngram = 0
+        individual_word_usage = defaultdict(lambda: 0)
+        for i in range(len(normalized_words)):
+            for word in words:
+                if normalized_words[i] == word:
+                    individual_word_usage[word] += 1
+            if normalized_words[i:i+n] == list(words):
+                words_usage_in_ngram += 1
+
+        for word in words:
+            ratio = words_usage_in_ngram / individual_word_usage[word]
+            if ratio > 0.5:
+                common_ngrams.append(words)
+                break
+
+    return common_ngrams
+
+
+def find_keywords(document, word_count=10):
+    """
+    Credit to https://gist.github.com/voidfiles/1646117
+    and http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf
+    """
+    sentences = nltk.sent_tokenize(document)
+    candidate_words = []
+    all_words = []
+    for sentence in sentences:
+        words = nltk.word_tokenize(sentence)
+        all_words.extend(words)
+        tagged_words = nltk.pos_tag(words)
+        filtered_words = _filter(tagged_words)
+        candidate_words.extend(filtered_words)
+
+    unique_word_set = _unique_everseen(candidate_words)
+
+    gr = digraph()
+    gr.add_nodes(list(unique_word_set))
+
+    window_start = 0
+    window_end = 2
+
+    while True:
+        window_words = candidate_words[window_start:window_end]
+        if len(window_words) == 2:
+            try:
+                gr.add_edge((window_words[0], window_words[1]))
+            except AdditionError:
+                pass
+        else:
+            break
+
+        window_start += 1
+        window_end += 1
+
+    calculated_page_rank = pagerank(gr)
+    di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1), reverse=True)
+    all_ordered_keywords = [w[0] for w in di]
+    top_ordered_keywords = all_ordered_keywords[:word_count]
+
+    normalized_words = _normalize(all_words)
+
+    common_bigrams = _common_ngrams(normalized_words, top_ordered_keywords, 2)
+    common_trigrams = _common_ngrams(normalized_words, top_ordered_keywords, 3)
+    for words in common_bigrams + common_trigrams:
+        for word in words:
+            top_ordered_keywords.remove(word)
+        top_ordered_keywords.insert(0, ' '.join(words))
+
+    return top_ordered_keywords
diff --git a/reqs/common.txt b/reqs/common.txt
index f6059ae..c03a7fb 100644
--- a/reqs/common.txt
+++ b/reqs/common.txt
@@ -27,3 +27,6 @@ git+https://github.com/btbonval/django-ajax-selects-cascade.git
 psycopg2
 git+https://github.com/Soaa-/django-nested-inlines.git
 pyth
+http://python-graph.googlecode.com/files/python-graph-core-1.8.2.tar.gz
+nltk
+numpy
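
Below is a minimal usage sketch of the new keyword guesser; it is not part of
the patch above. It assumes the karmaworld package is importable, that the
nltk, numpy, and python-graph requirements are installed, and that the NLTK
punkt, stopwords, and maxent_treebank_pos_tagger data are present locally --
note the nltk_download task above fetches only the tagger, so the other two
corpora must already be available. The sample text is made up.

    # Minimal sketch under the assumptions above (Python 2, matching the
    # codebase). find_keywords is driven the same way convert_raw_document
    # drives it in gdrive.py, just on a hand-written string.
    from karmaworld.apps.quizzes.find_keywords import find_keywords

    text = ("Graph-based ranking algorithms decide the importance of a "
            "vertex within a graph. The rank of a vertex is computed "
            "recursively from the ranks of its neighboring vertices.")

    # Prints up to word_count keywords, best-ranked first. Bigrams and
    # trigrams that usually occur together in the text are merged into
    # single multi-word keywords and moved to the front of the list.
    for keyword in find_keywords(text, word_count=5):
        print keyword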