karmaworld/apps/notes/management/commands/import_ocw_json.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4
import json
import os.path

import requests
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError

from karmaworld.apps.notes.models import Note
from karmaworld.apps.notes.gdrive import convert_raw_document
from karmaworld.apps.courses.models import Course
from karmaworld.apps.courses.models import School
from karmaworld.apps.courses.models import Professor
from karmaworld.apps.courses.models import ProfessorAffiliation
from karmaworld.apps.courses.models import ProfessorTaught
from karmaworld.apps.courses.models import Department
from karmaworld.apps.licenses.models import License
from karmaworld.apps.document_upload.models import RawDocument
  18
# Filepicker API key, read once when this module is imported. Indexing
# os.environ raises KeyError if FILEPICKER_API_KEY is unset, so the command
# module cannot even be loaded without it.
FILEPICKER_API_KEY = os.environ['FILEPICKER_API_KEY']
  20
  21 class Command(BaseCommand):
  22     args = 'directory containing json files from mit-ocw-scraper'
  23     help = """
  24            This command will systematically parse all *.json files in the given
  25            directory and load them into the database as course notes, uploaded
  26            through Filepicker.
  27
  28            It is assumed the json files are generated by (or follow the same
  29            format as) mit-ocw-scraper:
  30            https://github.com/AndrewMagliozzi/mit-ocw-scraper
  31            """
  32
  33     def handle(self, *args, **kwargs):
  34         if len(args) != 1:
  35             raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
  36
  37         # Convert given path to an absolute path, not relative.
  38         path = os.path.abspath(args[0])
  39
  40         if not os.path.isdir(path):
  41             raise ArgumentError("First argument should be a directory to parse.")
  42
  43         # for now, assume the school is MIT and find by its US DepEd ID.
  44         # TODO for later, do something more clever
  45         dbschool = School.objects.filter(usde_id=121415)[0]
  46
  47         # for now, assume license is the default OCW license: CC-BY-NC 3
  48         # TODO for later, do something more clever.
  49         dblicense = License.objects.get_or_create(
  50           name='cc-by-nc-3.0',
  51           html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
  52         )[0]
  53
  54         # build Filepicker upload URL
  55         # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
  56         fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
  57
  58         # find all *.json files in the given directory
  59         def is_json_file(filename):
  60             return filename[-5:].lower() == '.json'
  61         json_files = filter(is_json_file, os.listdir(path))
  62         # prepend filenames with absolute paths
  63         def full_path_to_file(filename):
  64             return os.path.sep.join((path, filename))
  65         json_files = map(full_path_to_file, json_files)
  66
  67         # parse each json file and process it for courses and notes.
  68         for filename in json_files:
  69             with open(filename, 'r') as jsondata:
  70                 # parse JSON into python
  71                 parsed = json.load(jsondata)
  72
  73                 # find the department or create one.
  74                 dept_info = {
  75                     'name': parsed['subject'],
  76                     'school': dbschool,
  77                     'url': parsed['departmentLink'],
  78                 }
  79                 dbdept = Department.objects.get_or_create(**dept_info)[0]
  80
  81                 # process courses
  82                 for course in parsed['courses']:
  83                     # Assume first hit is always right. Solving the identity
  84                     # problem by name alone will always be a fool's errand.
  85                     dbprof = Professor.objects.get_or_create(name=course['professor'])[0]
  86
  87                     # Associate the professor with the department.
  88                     # (no need to track the result)
  89                     ProfessorAffiliation.objects.get_or_create(
  90                         professor=dbprof,
  91                         department=dbdept)
  92
  93                     # Extract the course info
  94                     course_info = {
  95                       'name': course['courseTitle'],
  96                       'department': dbdept,
  97                     }
  98                     # Create or Find the Course object.
  99                     dbcourse = Course.objects.get_or_create(**course_info)[0]
 100                     dbcourse.professor = dbprof
 101                     dbcourse.instructor_name = course['professor']
 102                     dbcourse.school = dbschool
 103                     dbcourse.save()
 104                     print "Course is in the database: {0}".format(dbcourse.name)
 105
 106                     ProfessorTaught.objects.get_or_create(
 107                         professor=dbprof,
 108                         course=dbcourse)
 109
 110                     if 'noteLinks' not in course or not course['noteLinks']:
 111                         print "No Notes in course."
 112                         continue
 113
 114                     # process notes for each course
 115                     for note in course['noteLinks']:
 116                         # Check to see if the Note is already uploaded.
 117                         url = note['link']
 118                         dbnote = Note.objects.filter(upstream_link=url)
 119                         if len(dbnote) > 2:
 120                             print "WARNING Skipping Note: Too many notes for {0}".format(url)
 121                             continue
 122                         if len(dbnote) == 1:
 123                             dbnote = dbnote[0]
 124                             if dbnote.text and len(dbnote.text) or \
 125                                dbnote.html and len(dbnote.html):
 126                                 print "Already there, moving on: {0}".format(url)
 127                                 continue
 128                             else:
 129                                 # Partially completed note. Remove it and try
 130                                 # again.
 131                                 dbnote.tags.set() # clear tags
 132                                 dbnote.delete() # delete note
 133                                 print "Found and removed incomplete note {0}.".format(url)
 134
 135                         # Upload URL of note to Filepicker if it is not already
 136                         # in RawDocument.
 137                         rd_test = RawDocument.objects.filter(upstream_link=url)
 138                         if not len(rd_test):
 139                             # https://developers.inkfilepicker.com/docs/web/#inkblob-store
 140                             print "Uploading link {0} to FP.".format(url)
 141                             ulresp = requests.post(fpurl, data={
 142                               'url': url,
 143                             })
 144                             try:
 145                                 ulresp.raise_for_status()
 146                             except Exception, e:
 147                                 print "Failed to upload note: " + str(e)
 148                                 print "Skipping."
 149                                 continue
 150                             # Filepicker returns JSON, so use that
 151                             uljson = ulresp.json()
 152
 153                             print "Saving raw document to database."
 154                             # Extract the note info
 155                             dbnote = RawDocument()
 156                             dbnote.course = dbcourse
 157                             dbnote.name = note['fileName']
 158                             dbnote.license = dblicense
 159                             dbnote.upstream_link = url
 160                             dbnote.fp_file = uljson['url']
 161                             dbnote.mimetype = uljson['type']
 162                             dbnote.is_processed = True # hack to bypass celery
 163                             # Create the RawDocument object.
 164                             dbnote.save()
 165                         else:
 166                             # Find the right RawDocument
 167                             print "Already uploaded link {0} to FP.".format(url)
 168                             dbnote = rd_test[0]
 169
 170                         # Do tags separately
 171                         dbnote.tags.add('mit-ocw','karma')
 172
 173                         print "Converting document and saving note to S3."
 174                         while True:
 175                             try:
 176                                 convert_raw_document(dbnote)
 177                             except ValueError, e:
 178                                 # only catch one specific error
 179                                 if not str(e).startswith('PDF file could not be'):
 180                                     raise e
 181                                 # write the link to file.
 182                                 with open('pdferrors.log', 'a') as pdferrs:
 183                                     pdferrs.write(url + '\n')
 184                                 # delete the partial Note created in convert_raw_doc
 185                                 dbnote = Note.objects.filter(upstream_link=url)[0]
 186                                 dbnote.tags.set()
 187                                 dbnote.delete()
 188                                 print "This note errored, so it is removed :("
 189                                 break
 190                             except Exception, e:
 191                                 if '403' in str(e):
 192                                     print "Failed: " + str(e)
 193                                     print "Trying again."
 194                                     continue
 195                                 else:
 196                                     print "Failed: " + str(e)
 197                                     print "Aborting."
 198                                     break
 199                             else:
 200                                 print "This note is done."
 201                                 break
 202
 203                     print "Notes for {0} are done.".format(dbcourse.name)