OCW importer updated for #359. Not fully tested.

[oweals/karmaworld.git] / karmaworld / apps / notes / management / commands / import_ocw_json.py
diff --git a/karmaworld/apps/notes/management/commands/import_ocw_json.py b/karmaworld/apps/notes/management/commands/import_ocw_json.py

index 33a906c0c10552baa366c8452d9815cc3b265190..67105dae10d0c7c9f5f8e3549d925d2bbb9e88c3 100644 (file)
--- a/karmaworld/apps/notes/management/commands/import_ocw_json.py
+++ b/karmaworld/apps/notes/management/commands/import_ocw_json.py
@@ -7,6 +7,8 @@ import os.path
  import requests
  
  from karmaworld.apps.notes.models import Note
+from karmaworld.apps.notes.models import NoteMarkdown as NoteContent
+from karmaworld.apps.notes.models import Document
  from karmaworld.apps.notes.gdrive import convert_raw_document
  from karmaworld.apps.courses.models import Course
  from karmaworld.apps.courses.models import School
@@ -32,29 +34,25 @@ class Command(BaseCommand):
  
      def handle(self, *args, **kwargs):
          if len(args) != 1:
-            raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
+            raise TypeError("Expected one argument, got none: please specify a directory to parse.")
  
          # Convert given path to an absolute path, not relative.
          path = os.path.abspath(args[0])
  
          if not os.path.isdir(path):
-            raise ArgumentError("First argument should be a directory to parse.")
+            raise TypeError("First argument should be a directory to parse.")
  
          # for now, assume the school is MIT and find by its US DepEd ID.
          # TODO for later, do something more clever
          dbschool = School.objects.filter(usde_id=121415)[0]
  
-        # for now, assume license is the default OCW license: CC-BY-NC 3
+        # for now, assume license is the default OCW license: CC-BY-NC
          # TODO for later, do something more clever.
          dblicense = License.objects.get_or_create(
            name='cc-by-nc-3.0',
-          html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
+          html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/3.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/3.0/88x31.png" /></a>'
          )[0]
  
-        # build Filepicker upload URL
-        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
-        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
-
          # find all *.json files in the given directory
          def is_json_file(filename):
              return filename[-5:].lower() == '.json'
@@ -66,6 +64,7 @@ class Command(BaseCommand):
  
          # parse each json file and process it for courses and notes.
          for filename in json_files:
+            # each file is assumed to contain courses for a single department
              with open(filename, 'r') as jsondata:
                  # parse JSON into python
                  parsed = json.load(jsondata)
@@ -76,20 +75,24 @@ class Command(BaseCommand):
                      'school': dbschool,
                      'url': parsed['departmentLink'],
                  }
-                dbdept = Department.objects.get_or_create(**dept_info)[0]
+                # Defer department creation until a course with notes is found.
+                # Keeping it at this scope acts like a cache across courses.
+                dbdept = None
  
                  # process courses
                  for course in parsed['courses']:
+                    if 'noteLinks' not in course or not course['noteLinks']:
+                        print "No Notes in course."
+                        continue
+
+                    # only create department if necessary at this time
+                    if dbdept is None:
+                        dbdept = Department.objects.get_or_create(**dept_info)[0]
+
                      # Assume first hit is always right. Solving the identity
                      # problem by name alone will always be a fool's errand.
                      dbprof = Professor.objects.get_or_create(name=course['professor'])[0]
  
-                    # Associate the professor with the department.
-                    # (no need to track the result)
-                    ProfessorAffiliation.objects.get_or_create(
-                        professor=dbprof,
-                        department=dbdept)
-
                      # Extract the course info
                      course_info = {
                        'name': course['courseTitle'],
@@ -97,20 +100,12 @@ class Command(BaseCommand):
                      }
                      # Create or Find the Course object.
                      dbcourse = Course.objects.get_or_create(**course_info)[0]
-                    dbcourse.professor = dbprof
+                    dbcourse.professor.add(dbprof)
                      dbcourse.instructor_name = course['professor']
                      dbcourse.school = dbschool
                      dbcourse.save()
                      print "Course is in the database: {0}".format(dbcourse.name)
  
-                    ProfessorTaught.objects.get_or_create(
-                        professor=dbprof,
-                        course=dbcourse)
-
-                    if 'noteLinks' not in course or not course['noteLinks']:
-                        print "No Notes in course."
-                        continue
-
                      # process notes for each course
                      for note in course['noteLinks']:
                          # Check to see if the Note is already uploaded.
@@ -121,14 +116,20 @@ class Command(BaseCommand):
                              continue
                          if len(dbnote) == 1:
                              dbnote = dbnote[0]
-                            if dbnote.text and len(dbnote.text) or \
-                               dbnote.html and len(dbnote.html):
+                            # should only be 1 entry, but get() errors loudly
+                            # when none are found. filter is easier to work with
+                            dbcontent = NoteContent.objects.filter(note_id=dbnote.id)
+                            if dbnote.text and len(dbnote.text) and \
+                               len(dbcontent) and \
+                               len(dbcontent[0].html):
                                  print "Already there, moving on: {0}".format(url)
                                  continue
                              else:
                                  # Partially completed note. Remove it and try
                                  # again.
                                  dbnote.tags.set() # clear tags
+                                for content in dbcontent:
+                                    dbcontent.delete() # delete any note content
                                  dbnote.delete() # delete note
                                  print "Found and removed incomplete note {0}.".format(url)
  
@@ -136,11 +137,18 @@ class Command(BaseCommand):
                          # in RawDocument.
                          rd_test = RawDocument.objects.filter(upstream_link=url)
                          if not len(rd_test):
-                            # https://developers.inkfilepicker.com/docs/web/#inkblob-store
+                            # https://developers.filepicker.io/docs/web/rest/#blob-store
                              print "Uploading link {0} to FP.".format(url)
-                            ulresp = requests.post(fpurl, data={
-                              'url': url,
-                            })
+                            ulresp = requests.post(
+                              'https://www.filepicker.io/api/store/S3',
+                              params={
+                                'key': FILEPICKER_API_KEY,
+                                'policy': Document.fp_policy,
+                                'signature': Document.fp_signature,
+                              },
+                              data={
+                                'url': url,
+                              })
                              try:
                                  ulresp.raise_for_status()
                              except Exception, e:
@@ -170,7 +178,7 @@ class Command(BaseCommand):
                          # Do tags separately
                          dbnote.tags.add('mit-ocw','karma')
  
-                        print "Converting document and saving note to S3."
+                        print "Converting document and saving note text."
                          while True:
                              try:
                                  convert_raw_document(dbnote)
@@ -201,3 +209,6 @@ class Command(BaseCommand):
                                  break
  
                      print "Notes for {0} are done.".format(dbcourse.name)
+                # Informational output for debugging
+                if dbdept is None:
+                    print "No courses with notes found for {name}.".format(**dept_info)