OCW importer updated for #359. not fully tested.
[oweals/karmaworld.git] / karmaworld / apps / notes / management / commands / import_ocw_json.py
index 33a906c0c10552baa366c8452d9815cc3b265190..67105dae10d0c7c9f5f8e3549d925d2bbb9e88c3 100644 (file)
@@ -7,6 +7,8 @@ import os.path
 import requests
 
 from karmaworld.apps.notes.models import Note
+from karmaworld.apps.notes.models import NoteMarkdown as NoteContent
+from karmaworld.apps.notes.models import Document
 from karmaworld.apps.notes.gdrive import convert_raw_document
 from karmaworld.apps.courses.models import Course
 from karmaworld.apps.courses.models import School
@@ -32,29 +34,25 @@ class Command(BaseCommand):
 
     def handle(self, *args, **kwargs):
         if len(args) != 1:
-            raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
+            raise TypeError("Expected one argument, got none: please specify a directory to parse.")
 
         # Convert given path to an absolute path, not relative.
         path = os.path.abspath(args[0])
 
         if not os.path.isdir(path):
-            raise ArgumentError("First argument should be a directory to parse.")
+            raise TypeError("First argument should be a directory to parse.")
 
         # for now, assume the school is MIT and find by its US DepEd ID.
         # TODO for later, do something more clever
         dbschool = School.objects.filter(usde_id=121415)[0]
 
-        # for now, assume license is the default OCW license: CC-BY-NC 3
+        # for now, assume license is the default OCW license: CC-BY-NC
         # TODO for later, do something more clever.
         dblicense = License.objects.get_or_create(
           name='cc-by-nc-3.0',
-          html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
+          html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/3.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/3.0/88x31.png" /></a>'
         )[0]
 
-        # build Filepicker upload URL
-        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
-        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
-
         # find all *.json files in the given directory
         def is_json_file(filename):
             return filename[-5:].lower() == '.json'
@@ -66,6 +64,7 @@ class Command(BaseCommand):
 
         # parse each json file and process it for courses and notes.
         for filename in json_files:
+            # each file is assumed to contain courses for a single department
             with open(filename, 'r') as jsondata:
                 # parse JSON into python
                 parsed = json.load(jsondata)
@@ -76,20 +75,24 @@ class Command(BaseCommand):
                     'school': dbschool,
                     'url': parsed['departmentLink'],
                 }
-                dbdept = Department.objects.get_or_create(**dept_info)[0]
+                # Defer department creation only until there is a valid course.
+                # keeping it at this scope is a bit like caching.
+                dbdept = None
 
                 # process courses
                 for course in parsed['courses']:
+                    if 'noteLinks' not in course or not course['noteLinks']:
+                        print "No Notes in course."
+                        continue
+
+                    # only create department if necessary at this time
+                    if dbdept is None:
+                        dbdept = Department.objects.get_or_create(**dept_info)[0]
+
                     # Assume first hit is always right. Solving the identity
                     # problem by name alone will always be a fool's errand.
                     dbprof = Professor.objects.get_or_create(name=course['professor'])[0]
 
-                    # Associate the professor with the department.
-                    # (no need to track the result)
-                    ProfessorAffiliation.objects.get_or_create(
-                        professor=dbprof,
-                        department=dbdept)
-
                     # Extract the course info
                     course_info = {
                       'name': course['courseTitle'],
@@ -97,20 +100,12 @@ class Command(BaseCommand):
                     }
                     # Create or Find the Course object.
                     dbcourse = Course.objects.get_or_create(**course_info)[0]
-                    dbcourse.professor = dbprof
+                    dbcourse.professor.add(dbprof)
                     dbcourse.instructor_name = course['professor']
                     dbcourse.school = dbschool
                     dbcourse.save()
                     print "Course is in the database: {0}".format(dbcourse.name)
 
-                    ProfessorTaught.objects.get_or_create(
-                        professor=dbprof,
-                        course=dbcourse)
-
-                    if 'noteLinks' not in course or not course['noteLinks']:
-                        print "No Notes in course."
-                        continue
-
                     # process notes for each course
                     for note in course['noteLinks']:
                         # Check to see if the Note is already uploaded.
@@ -121,14 +116,20 @@ class Command(BaseCommand):
                             continue
                         if len(dbnote) == 1:
                             dbnote = dbnote[0]
-                            if dbnote.text and len(dbnote.text) or \
-                               dbnote.html and len(dbnote.html):
+                            # should only be 1 entry, but get() errors loudly
+                            # when none are found. filter is easier to work with
+                            dbcontent = NoteContent.objects.filter(note_id=dbnote.id)
+                            if dbnote.text and len(dbnote.text) and \
+                               len(dbcontent) and \
+                               len(dbcontent[0].html):
                                 print "Already there, moving on: {0}".format(url)
                                 continue
                             else:
                                 # Partially completed note. Remove it and try
                                 # again.
                                 dbnote.tags.set() # clear tags
+                                for content in dbcontent:
+                                    dbcontent.delete() # delete any note content
                                 dbnote.delete() # delete note
                                 print "Found and removed incomplete note {0}.".format(url)
 
@@ -136,11 +137,18 @@ class Command(BaseCommand):
                         # in RawDocument.
                         rd_test = RawDocument.objects.filter(upstream_link=url)
                         if not len(rd_test):
-                            # https://developers.inkfilepicker.com/docs/web/#inkblob-store
+                            # https://developers.filepicker.io/docs/web/rest/#blob-store
                             print "Uploading link {0} to FP.".format(url)
-                            ulresp = requests.post(fpurl, data={
-                              'url': url,
-                            })
+                            ulresp = requests.post(
+                              'https://www.filepicker.io/api/store/S3',
+                              params={
+                                'key': FILEPICKER_API_KEY,
+                                'policy': Document.fp_policy,
+                                'signature': Document.fp_signature,
+                              },
+                              data={
+                                'url': url,
+                              })
                             try:
                                 ulresp.raise_for_status()
                             except Exception, e:
@@ -170,7 +178,7 @@ class Command(BaseCommand):
                         # Do tags separately
                         dbnote.tags.add('mit-ocw','karma')
 
-                        print "Converting document and saving note to S3."
+                        print "Converting document and saving note text."
                         while True:
                             try:
                                 convert_raw_document(dbnote)
@@ -201,3 +209,6 @@ class Command(BaseCommand):
                                 break
 
                     print "Notes for {0} are done.".format(dbcourse.name)
+                # Informational output for debugging
+                if dbdept is None:
+                    print "No courses with notes found for {name}.".format(**dept_info)