From 0d7f998b96038b4d8390cc11ccad7dbdf6e46489 Mon Sep 17 00:00:00 2001 From: Bryan Date: Tue, 10 Mar 2015 01:52:39 -0400 Subject: [PATCH] OCW importer updated for #359. not fully tested. --- .../management/commands/import_ocw_json.py | 73 +++++++++++-------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/karmaworld/apps/notes/management/commands/import_ocw_json.py b/karmaworld/apps/notes/management/commands/import_ocw_json.py index 33a906c..67105da 100644 --- a/karmaworld/apps/notes/management/commands/import_ocw_json.py +++ b/karmaworld/apps/notes/management/commands/import_ocw_json.py @@ -7,6 +7,8 @@ import os.path import requests from karmaworld.apps.notes.models import Note +from karmaworld.apps.notes.models import NoteMarkdown as NoteContent +from karmaworld.apps.notes.models import Document from karmaworld.apps.notes.gdrive import convert_raw_document from karmaworld.apps.courses.models import Course from karmaworld.apps.courses.models import School @@ -32,29 +34,25 @@ class Command(BaseCommand): def handle(self, *args, **kwargs): if len(args) != 1: - raise ArgumentError("Expected one argument, got none: please specify a directory to parse.") + raise TypeError("Expected one argument, got none: please specify a directory to parse.") # Convert given path to an absolute path, not relative. path = os.path.abspath(args[0]) if not os.path.isdir(path): - raise ArgumentError("First argument should be a directory to parse.") + raise TypeError("First argument should be a directory to parse.") # for now, assume the school is MIT and find by its US DepEd ID. # TODO for later, do something more clever dbschool = School.objects.filter(usde_id=121415)[0] - # for now, assume license is the default OCW license: CC-BY-NC 3 + # for now, assume license is the default OCW license: CC-BY-NC # TODO for later, do something more clever. dblicense = License.objects.get_or_create( name='cc-by-nc-3.0', - html='Creative Commons License' + html='Creative Commons License' )[0] - # build Filepicker upload URL - # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line - fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY) - # find all *.json files in the given directory def is_json_file(filename): return filename[-5:].lower() == '.json' @@ -66,6 +64,7 @@ class Command(BaseCommand): # parse each json file and process it for courses and notes. for filename in json_files: + # each file is assumed to contain courses for a single department with open(filename, 'r') as jsondata: # parse JSON into python parsed = json.load(jsondata) @@ -76,20 +75,24 @@ class Command(BaseCommand): 'school': dbschool, 'url': parsed['departmentLink'], } - dbdept = Department.objects.get_or_create(**dept_info)[0] + # Defer department creation only until there is a valid course. + # keeping it at this scope is a bit like caching. + dbdept = None # process courses for course in parsed['courses']: + if 'noteLinks' not in course or not course['noteLinks']: + print "No Notes in course." + continue + + # only create department if necessary at this time + if dbdept is None: + dbdept = Department.objects.get_or_create(**dept_info)[0] + # Assume first hit is always right. Solving the identity # problem by name alone will always be a fool's errand. dbprof = Professor.objects.get_or_create(name=course['professor'])[0] - # Associate the professor with the department. - # (no need to track the result) - ProfessorAffiliation.objects.get_or_create( - professor=dbprof, - department=dbdept) - # Extract the course info course_info = { 'name': course['courseTitle'], @@ -97,20 +100,12 @@ class Command(BaseCommand): } # Create or Find the Course object. dbcourse = Course.objects.get_or_create(**course_info)[0] - dbcourse.professor = dbprof + dbcourse.professor.add(dbprof) dbcourse.instructor_name = course['professor'] dbcourse.school = dbschool dbcourse.save() print "Course is in the database: {0}".format(dbcourse.name) - ProfessorTaught.objects.get_or_create( - professor=dbprof, - course=dbcourse) - - if 'noteLinks' not in course or not course['noteLinks']: - print "No Notes in course." - continue - # process notes for each course for note in course['noteLinks']: # Check to see if the Note is already uploaded. @@ -121,14 +116,20 @@ class Command(BaseCommand): continue if len(dbnote) == 1: dbnote = dbnote[0] - if dbnote.text and len(dbnote.text) or \ - dbnote.html and len(dbnote.html): + # should only be 1 entry, but get() errors loudly + # when none are found. filter is easier to work with + dbcontent = NoteContent.objects.filter(note_id=dbnote.id) + if dbnote.text and len(dbnote.text) and \ + len(dbcontent) and \ + len(dbcontent[0].html): print "Already there, moving on: {0}".format(url) continue else: # Partially completed note. Remove it and try # again. dbnote.tags.set() # clear tags + for content in dbcontent: + dbcontent.delete() # delete any note content dbnote.delete() # delete note print "Found and removed incomplete note {0}.".format(url) @@ -136,11 +137,18 @@ class Command(BaseCommand): # in RawDocument. rd_test = RawDocument.objects.filter(upstream_link=url) if not len(rd_test): - # https://developers.inkfilepicker.com/docs/web/#inkblob-store + # https://developers.filepicker.io/docs/web/rest/#blob-store print "Uploading link {0} to FP.".format(url) - ulresp = requests.post(fpurl, data={ - 'url': url, - }) + ulresp = requests.post( + 'https://www.filepicker.io/api/store/S3', + params={ + 'key': FILEPICKER_API_KEY, + 'policy': Document.fp_policy, + 'signature': Document.fp_signature, + }, + data={ + 'url': url, + }) try: ulresp.raise_for_status() except Exception, e: @@ -170,7 +178,7 @@ class Command(BaseCommand): # Do tags separately dbnote.tags.add('mit-ocw','karma') - print "Converting document and saving note to S3." + print "Converting document and saving note text." while True: try: convert_raw_document(dbnote) @@ -201,3 +209,6 @@ class Command(BaseCommand): break print "Notes for {0} are done.".format(dbcourse.name) + # Informational output for debugging + if dbdept is None: + print "No courses with notes found for {name}.".format(**dept_info) -- 2.25.1