karmaworld/apps/notes/management/commands/import_ocw_json.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4
   5 import json
   6 import os.path
   7 import requests
   8
   9 from apps.notes.models import Note
  10 from apps.notes.gdrive import convert_raw_document
  11 from apps.courses.models import Course
  12 from apps.courses.models import School
  13 from apps.courses.models import Department
  14 from apps.licenses.models import License
  15 from apps.document_upload.models import RawDocument
  16 from django.core.management.base import BaseCommand
  17 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
  18
  19 class Command(BaseCommand):
  20     args = 'directory containing json files from mit-ocw-scraper'
  21     help = """
  22            This command will systematically parse all *.json files in the given
  23            directory and load them into the database as course notes, uploaded
  24            through Filepicker.
  25
  26            It is assumed the json files are generated by (or follow the same
  27            format as) mit-ocw-scraper:
  28            https://github.com/AndrewMagliozzi/mit-ocw-scraper
  29            """
  30
  31     def handle(self, *args, **kwargs):
  32         if len(args) != 1:
  33             raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
  34
  35         # Convert given path to an absolute path, not relative.
  36         path = os.path.abspath(args[0])
  37
  38         if not os.path.isdir(path):
  39             raise ArgumentError("First argument should be a directory to parse.")
  40
  41         # for now, assume the school is MIT and find by its US DepEd ID.
  42         # TODO for later, do something more clever
  43         dbschool = School.objects.filter(usde_id=121415)[0]
  44
  45         # for now, assume license is the default OCW license: CC-BY-NC 3
  46         # TODO for later, do something more clever.
  47         dblicense = License.objects.filter(name='cc-by-nc-3.0')[0]
  48
  49         # build Filepicker upload URL
  50         # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
  51         fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
  52
  53         # find all *.json files in the given directory
  54         def is_json_file(filename):
  55             return filename[-5:].lower() == '.json'
  56         json_files = filter(is_json_file, os.listdir(path))
  57         # prepend filenames with absolute paths
  58         def full_path_to_file(filename):
  59             return os.path.sep.join((path, filename))
  60         json_files = map(full_path_to_file, json_files)
  61
  62         # parse each json file and process it for courses and notes.
  63         for filename in json_files:
  64             with open(filename, 'r') as jsondata:
  65                 # parse JSON into python
  66                 parsed = json.load(jsondata)
  67
  68                 # find the department or create one.
  69                 dept_info = {
  70                     'name': parsed['subject'],
  71                     'school': dbschool,
  72                     'url': parsed['departmentLink'],
  73                 }
  74                 dbdept = Department.objects.get_or_create(**dept_info)[0]
  75
  76                 # process courses
  77                 for course in parsed['courses']:
  78                     # Extract the course info
  79                     course_info = {
  80                       'name': course['courseTitle'],
  81                       'instructor_name': course['professor'],
  82                       'school': dbschool,
  83                     }
  84                     # Create or Find the Course object.
  85                     dbcourse = Course.objects.get_or_create(**course_info)[0]
  86                     dbcourse.department = dbdept;
  87                     dbcourse.save()
  88                     print "Course is in the database: {0}".format(dbcourse.name)
  89
  90                     if 'noteLinks' not in course:
  91                         print "No Notes in course."
  92                         continue
  93
  94                     # process notes for each course
  95                     for note in course['noteLinks']:
  96                         # Check to see if the Note is already uploaded.
  97                         if len(Note.objects.filter(upstream_link=note['link'])):
  98                             print "Already there, moving on: {0}".format(note['link'])
  99                             continue
 100
 101                         # Upload URL of note to Filepicker if it is not already
 102                         # in RawDocument.
 103                         rd_test = RawDocument.objects.filter(upstream_link=note['link'])
 104                         if not len(rd_test):
 105                             # https://developers.inkfilepicker.com/docs/web/#inkblob-store
 106                             print "Uploading link {0} to FP.".format(note['link'])
 107                             ulresp = requests.post(fpurl, data={
 108                               'url': note['link'],
 109                             })
 110                             ulresp.raise_for_status()
 111                             # Filepicker returns JSON, so use that
 112                             uljson = ulresp.json()
 113
 114                             print "Saving raw document to database."
 115                             # Extract the note info
 116                             dbnote = RawDocument()
 117                             dbnote.course = dbcourse
 118                             dbnote.name = note['fileName']
 119                             dbnote.license = dblicense
 120                             dbnote.upstream_link = note['link']
 121                             dbnote.fp_file = uljson['url']
 122                             dbnote.mimetype = uljson['type']
 123                             dbnote.is_processed = True # hack to bypass celery
 124                             # Create the RawDocument object.
 125                             dbnote.save()
 126                         else:
 127                             # Find the right RawDocument
 128                             print "Already uploaded link {0} to FP.".format(note['link'])
 129                             dbnote = rd_test[0]
 130
 131                         # Do tags separately
 132                         dbnote.tags.add('mit-ocw','karma')
 133
 134                         print "Sending to GDrive and saving note to database."
 135                         convert_raw_document(dbnote)
 136                         print "This note is done."
 137
 138
 139                     print "Notes for {0} are done.".format(dbcourse.name)