karmaworld/apps/notes/management/commands/import_ocw_json.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4
   5 import json
   6 import os.path
   7 import requests
   8
   9 from apps.notes.models import Note
  10 from apps.notes.gdrive import convert_raw_document
  11 from apps.courses.models import Course
  12 from apps.courses.models import School
  13 from apps.courses.models import Professor
  14 from apps.courses.models import Department
  15 from apps.courses.models import ProfessorTaught
  16 from apps.courses.models import ProfessorAffiliation
  17 from apps.licenses.models import License
  18 from apps.document_upload.models import RawDocument
  19 from django.core.management.base import BaseCommand
  20 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
  21
  22 class Command(BaseCommand):
  23     args = 'directory containing json files from mit-ocw-scraper'
  24     help = """
  25            This command will systematically parse all *.json files in the given
  26            directory and load them into the database as course notes, uploaded
  27            through Filepicker.
  28
  29            It is assumed the json files are generated by (or follow the same
  30            format as) mit-ocw-scraper:
  31            https://github.com/AndrewMagliozzi/mit-ocw-scraper
  32            """
  33
  34     def handle(self, *args, **kwargs):
  35         if len(args) != 1:
  36             raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
  37
  38         # Convert given path to an absolute path, not relative.
  39         path = os.path.abspath(args[0])
  40
  41         if not os.path.isdir(path):
  42             raise ArgumentError("First argument should be a directory to parse.")
  43
  44         # for now, assume the school is MIT and find by its US DepEd ID.
  45         # TODO for later, do something more clever
  46         dbschool = School.objects.filter(usde_id=121415)[0]
  47
  48         # for now, assume license is the default OCW license: CC-BY-NC 3
  49         # TODO for later, do something more clever.
  50         dblicense = License.objects.get_or_create(
  51           name='cc-by-nc-3.0',
  52           html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
  53         )[0]
  54
  55         # build Filepicker upload URL
  56         # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
  57         fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
  58
  59         # find all *.json files in the given directory
  60         def is_json_file(filename):
  61             return filename[-5:].lower() == '.json'
  62         json_files = filter(is_json_file, os.listdir(path))
  63         # prepend filenames with absolute paths
  64         def full_path_to_file(filename):
  65             return os.path.sep.join((path, filename))
  66         json_files = map(full_path_to_file, json_files)
  67
  68         # parse each json file and process it for courses and notes.
  69         for filename in json_files:
  70             with open(filename, 'r') as jsondata:
  71                 # parse JSON into python
  72                 parsed = json.load(jsondata)
  73
  74                 # find the department or create one.
  75                 dept_info = {
  76                     'name': parsed['subject'],
  77                     'school': dbschool,
  78                     'url': parsed['departmentLink'],
  79                 }
  80                 dbdept = Department.objects.get_or_create(**dept_info)[0]
  81
  82                 # process courses
  83                 for course in parsed['courses']:
  84                     # Assume first hit is always right. Solving the identity
  85                     # problem by name alone will always be a fool's errand.
  86                     dbprof = Professor.objects.get_or_create(name=course['professor'])[0]
  87
  88                     # Associate the professor with the department.
  89                     # (no need to track the result)
  90                     ProfessorAffiliation.objects.get_or_create(
  91                         professor=dbprof,
  92                         department=dbdept)
  93
  94                     # Extract the course info
  95                     course_info = {
  96                       'name': course['courseTitle'],
  97                       'department': dbdept,
  98                     }
  99                     # Create or Find the Course object.
 100                     dbcourse = Course.objects.get_or_create(**course_info)[0]
 101                     dbcourse.professor = dbprof
 102                     dbcourse.instructor_name = course['professor']
 103                     dbcourse.school = dbschool
 104                     dbcourse.save()
 105                     print "Course is in the database: {0}".format(dbcourse.name)
 106
 107                     ProfessorTaught.objects.get_or_create(
 108                         professor=dbprof,
 109                         course=dbcourse)
 110
 111                     if 'noteLinks' not in course or not course['noteLinks']:
 112                         print "No Notes in course."
 113                         continue
 114
 115                     # process notes for each course
 116                     for note in course['noteLinks']:
 117                         # Check to see if the Note is already uploaded.
 118                         url = note['link']
 119                         dbnote = Note.objects.filter(upstream_link=url)
 120                         if len(dbnote) > 2:
 121                             print "WARNING Skipping Note: Too many notes for {0}".format(url)
 122                             continue
 123                         if len(dbnote) == 1:
 124                             dbnote = dbnote[0]
 125                             if dbnote.text and len(dbnote.text) or \
 126                                dbnote.html and len(dbnote.html):
 127                                 print "Already there, moving on: {0}".format(url)
 128                                 continue
 129                             else:
 130                                 # Partially completed note. Remove it and try
 131                                 # again.
 132                                 dbnote.tags.set() # clear tags
 133                                 dbnote.delete() # delete note
 134                                 print "Found and removed incomplete note {0}.".format(url)
 135
 136                         # Upload URL of note to Filepicker if it is not already
 137                         # in RawDocument.
 138                         rd_test = RawDocument.objects.filter(upstream_link=url)
 139                         if not len(rd_test):
 140                             # https://developers.inkfilepicker.com/docs/web/#inkblob-store
 141                             print "Uploading link {0} to FP.".format(url)
 142                             ulresp = requests.post(fpurl, data={
 143                               'url': url,
 144                             })
 145                             try:
 146                                 ulresp.raise_for_status()
 147                             except Exception, e:
 148                                 print "Failed to upload note: " + str(e)
 149                                 print "Skipping."
 150                                 continue
 151                             # Filepicker returns JSON, so use that
 152                             uljson = ulresp.json()
 153
 154                             print "Saving raw document to database."
 155                             # Extract the note info
 156                             dbnote = RawDocument()
 157                             dbnote.course = dbcourse
 158                             dbnote.name = note['fileName']
 159                             dbnote.license = dblicense
 160                             dbnote.upstream_link = url
 161                             dbnote.fp_file = uljson['url']
 162                             dbnote.mimetype = uljson['type']
 163                             dbnote.is_processed = True # hack to bypass celery
 164                             # Create the RawDocument object.
 165                             dbnote.save()
 166                         else:
 167                             # Find the right RawDocument
 168                             print "Already uploaded link {0} to FP.".format(url)
 169                             dbnote = rd_test[0]
 170
 171                         # Do tags separately
 172                         dbnote.tags.add('mit-ocw','karma')
 173
 174                         print "Converting document and saving note to S3."
 175                         while True:
 176                             try:
 177                                 convert_raw_document(dbnote)
 178                             except ValueError, e:
 179                                 # only catch one specific error
 180                                 if not str(e).startswith('PDF file could not be'):
 181                                     raise e
 182                                 # write the link to file.
 183                                 with open('pdferrors.log', 'a') as pdferrs:
 184                                     pdferrs.write(url + '\n')
 185                                 # delete the partial Note created in convert_raw_doc
 186                                 dbnote = Note.objects.filter(upstream_link=url)[0]
 187                                 dbnote.tags.set()
 188                                 dbnote.delete()
 189                                 print "This note errored, so it is removed :("
 190                                 break
 191                             except Exception, e:
 192                                 if '403' in str(e):
 193                                     print "Failed: " + str(e)
 194                                     print "Trying again."
 195                                     continue
 196                                 else:
 197                                     print "Failed: " + str(e)
 198                                     print "Aborting."
 199                                     break
 200                             else:
 201                                 print "This note is done."
 202                                 break
 203
 204                     print "Notes for {0} are done.".format(dbcourse.name)