build courses and notes out of OCW scraped JSON, except FP won't get the mimetype...
author Bryan <btbonval@gmail.com>
Mon, 6 Jan 2014 08:08:36 +0000 (03:08 -0500)
committer Bryan <btbonval@gmail.com>
Mon, 6 Jan 2014 08:08:36 +0000 (03:08 -0500)
karmaworld/apps/notes/management/commands/import_ocw_json.py [new file with mode: 0644]

diff --git a/karmaworld/apps/notes/management/commands/import_ocw_json.py b/karmaworld/apps/notes/management/commands/import_ocw_json.py
new file mode 100644 (file)
index 0000000..3cf2e46
--- /dev/null
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2012  FinalsClub Foundation
+
+import json
+import os.path
+
+import requests
+from django.core.management.base import BaseCommand
+from django.core.management.base import CommandError
+
+from apps.notes.models import Note
+from apps.notes.gdrive import convert_raw_document
+from apps.courses.models import Course
+from apps.courses.models import School
+from apps.licenses.models import License
+from apps.document_upload.models import RawDocument
+from karmaworld.secret.filepicker import FILEPICKER_API_KEY
+
+class Command(BaseCommand):
+    args = 'directory containing json files from mit-ocw-scraper'
+    help = """
+           This command will systematically parse all *.json files in the given
+           directory and load them into the database as course notes, uploaded
+           through Filepicker.
+
+           It is assumed the json files are generated by (or follow the same
+           format as) mit-ocw-scraper:
+           https://github.com/AndrewMagliozzi/mit-ocw-scraper
+           """
+
+    def handle(self, *args, **kwargs):
+        if len(args) != 1:
+            raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
+
+        # Convert given path to an absolute path, not relative.
+        path = os.path.abspath(args[0])
+
+        if not os.path.isdir(path):
+            raise ArgumentError("First argument should be a directory to parse.")
+
+        # for now, assume the school is MIT and find by its US DepEd ID.
+        # TODO for later, do something more clever
+        dbschool = School.objects.filter(usde_id=121415)[0]
+
+        # for now, assume license is the default OCW license: CC-BY-NC 3
+        # TODO for later, do something more clever.
+        dblicense = License.objects.filter(name='cc-by-nc-3.0')[0]
+
+        # build Filepicker upload URL
+        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
+        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
+
+        # find all *.json files in the given directory
+        def is_json_file(filename):
+            return filename[-5:].lower() == '.json'
+        json_files = filter(is_json_file, os.listdir(path))
+        # prepend filenames with absolute paths
+        def full_path_to_file(filename):
+            return os.path.sep.join((path, filename))
+        json_files = map(full_path_to_file, json_files)
+
+        # parse each json file and process it for courses and notes.
+        for filename in json_files:
+            with open(filename, 'r') as jsondata:
+                # parse JSON into python
+                parsed = json.load(jsondata)
+                # process courses
+                for course in parsed['courses']:
+
+                    # Extract the course info
+                    course_info = {
+                      'name': course['courseTitle'],
+                      'instructor_name': course['professor'],
+                      'school': dbschool,
+                      # courseLink is "course-number-name-semester-year"
+                      'academic_year': \
+                        int(course['courseLink'].split('-')[-1])
+                    }
+                    # Create or Find the Course object.
+                    dbcourse = Course.objects.get_or_create(**course_info)[0]
+                    print "Course is in the database: {0}".format(dbcourse.name)
+
+                    # process notes for each course
+                    for note in course['noteLinks']:
+                        # Check to see if the Note is already there.
+                        if len(RawDocument.objects.filter(upstream_link=note['link'])):
+                            print "Already there, moving on: {0}".format(note['link'])
+                            continue
+
+                        # Download the note into memory.
+                        print "Downloading {0}".format(note['link'])
+                        dlresp = requests.get(note['link'])
+                        # Check there weren't any problems
+                        dlresp.raise_for_status()
+
+                        # Upload raw contents of note to Filepicker
+                        # https://developers.inkfilepicker.com/docs/web/#inkblob-store
+                        print "Uploading to FP."
+                        ulresp = requests.post(fpurl, files={
+                          #'fileUpload': (note['fileName'], dlresp.raw)
+                          'fileUpload': dlresp.raw,
+                        })
+                        ulresp.raise_for_status()
+                        # Filepicker returns JSON, so use that
+                        uljson = ulresp.json()
+
+                        print "Saving raw document to database."
+                        # Extract the note info
+                        dbnote = RawDocument()
+                        dbnote.course = dbcourse
+                        dbnote.name = note['fileName']
+                        dbnote.license = dblicense
+                        dbnote.upstream_link = note['link']
+                        dbnote.fp_file = uljson['url']
+                        dbnote.mimetype = uljson['type']
+                        # Create the RawDocument object.
+                        dbnote.save()
+
+                        # Do tags separately
+                        dbnote.tags.add('mit-ocw','karma')
+
+                        print "Sending to GDrive and saving note to database."
+                        convert_raw_document(dbnote)
+                        print "This note is done."
+
+
+                    print "Notes for {0} are done.".format(dbcourse.name)