#!/usr/bin/env python
# -*- coding:utf8 -*-
# Copyright (C) 2012 FinalsClub Foundation
#
# Provenance (from the commit this file was extracted from):
#   From: Bryan
#   Date: Mon, 6 Jan 2014 08:08:36 +0000 (-0500)
#   Subject: build courses and notes out of OCW scraped JSON, except FP
#            won't get the mimetype right
#   X-Git-Tag: release-20150131~305
#   X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=3eb6d5eba963c7f30011ec330e9465f1670c5e95;p=oweals%2Fkarmaworld.git
#   File: karmaworld/apps/notes/management/commands/import_ocw_json.py
"""Management command that imports mit-ocw-scraper JSON as courses and notes."""

import json
import mimetypes
import os.path
import requests

from apps.notes.models import Note
from apps.notes.gdrive import convert_raw_document
from apps.courses.models import Course
from apps.courses.models import School
from apps.licenses.models import License
from apps.document_upload.models import RawDocument
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from karmaworld.secret.filepicker import FILEPICKER_API_KEY


class Command(BaseCommand):
    args = 'directory containing json files from mit-ocw-scraper'
    help = """
    This command will systematically parse all *.json files in the given
    directory and load them into the database as course notes, uploaded
    through Filepicker.

    It is assumed the json files are generated by (or follow the same
    format as) mit-ocw-scraper:
    https://github.com/AndrewMagliozzi/mit-ocw-scraper
    """

    # For now, assume the school is MIT and find it by its US DepEd ID.
    # TODO for later, do something more clever.
    SCHOOL_USDE_ID = 121415
    # For now, assume the license is the default OCW license: CC-BY-NC 3.
    # TODO for later, do something more clever.
    LICENSE_NAME = 'cc-by-nc-3.0'
    # Tags attached to every imported raw document.
    NOTE_TAGS = ('mit-ocw', 'karma')
    # Filepicker store endpoint, see
    # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
    FILEPICKER_STORE_URL = 'https://www.filepicker.io/api/store/S3?key={0}'

    def handle(self, *args, **kwargs):
        """Import every *.json file found in the directory given as args[0].

        Raises:
            CommandError: if the argument count is not exactly one, the
                argument is not a directory, or the expected School or
                License row is missing from the database.
        """
        if len(args) != 1:
            raise CommandError(
                "Expected one argument, got {0}: please specify a directory "
                "to parse.".format(len(args)))

        # Convert given path to an absolute path, not relative.
        path = os.path.abspath(args[0])
        if not os.path.isdir(path):
            raise CommandError(
                "First argument should be a directory to parse.")

        dbschool = self._first_or_error(
            School.objects.filter(usde_id=self.SCHOOL_USDE_ID),
            "No School with usde_id={0} in the database.".format(
                self.SCHOOL_USDE_ID))
        dblicense = self._first_or_error(
            License.objects.filter(name=self.LICENSE_NAME),
            "No License named '{0}' in the database.".format(
                self.LICENSE_NAME))

        fpurl = self.FILEPICKER_STORE_URL.format(FILEPICKER_API_KEY)

        # Parse each json file and process it for courses and notes.
        for filename in self._find_json_files(path):
            with open(filename, 'r') as jsondata:
                parsed = json.load(jsondata)
            for course in parsed['courses']:
                self._import_course(course, dbschool, dblicense, fpurl)

    @staticmethod
    def _first_or_error(queryset, message):
        """Return the first row of queryset, or raise CommandError(message)."""
        try:
            return queryset[0]
        except IndexError:
            raise CommandError(message)

    @staticmethod
    def _find_json_files(path):
        """Return the sorted full paths of all *.json files directly in path."""
        return [
            os.path.join(path, filename)
            for filename in sorted(os.listdir(path))
            if filename.lower().endswith('.json')
        ]

    def _import_course(self, course, dbschool, dblicense, fpurl):
        """Create or find the Course for one scraped entry and import its notes."""
        course_info = {
            'name': course['courseTitle'],
            'instructor_name': course['professor'],
            'school': dbschool,
            # courseLink is "course-number-name-semester-year"
            'academic_year': int(course['courseLink'].split('-')[-1]),
        }
        # get_or_create returns (object, created); only the object is needed.
        dbcourse = Course.objects.get_or_create(**course_info)[0]
        print("Course is in the database: {0}".format(dbcourse.name))

        for note in course['noteLinks']:
            self._import_note(note, dbcourse, dblicense, fpurl)

        print("Notes for {0} are done.".format(dbcourse.name))

    def _import_note(self, note, dbcourse, dblicense, fpurl):
        """Download one note, store it on Filepicker and save it as a RawDocument."""
        # Skip notes already imported; exists() avoids loading the rows.
        if RawDocument.objects.filter(upstream_link=note['link']).exists():
            print("Already there, moving on: {0}".format(note['link']))
            return

        # Download the note into memory.
        print("Downloading {0}".format(note['link']))
        dlresp = requests.get(note['link'])
        # Check there weren't any problems
        dlresp.raise_for_status()

        # Without stream=True the body has already been read into
        # dlresp.content and dlresp.raw is exhausted, so upload the bytes
        # together with an explicit filename and content type.
        content_type = dlresp.headers.get('content-type', '')
        content_type = content_type.split(';')[0].strip()
        if not content_type:
            content_type = (mimetypes.guess_type(note['fileName'])[0]
                            or 'application/octet-stream')

        # Upload raw contents of note to Filepicker
        # https://developers.inkfilepicker.com/docs/web/#inkblob-store
        print("Uploading to FP.")
        ulresp = requests.post(fpurl, files={
            'fileUpload': (note['fileName'], dlresp.content, content_type),
        })
        ulresp.raise_for_status()
        # Filepicker returns JSON, so use that
        uljson = ulresp.json()

        print("Saving raw document to database.")
        dbnote = RawDocument()
        dbnote.course = dbcourse
        dbnote.name = note['fileName']
        dbnote.license = dblicense
        dbnote.upstream_link = note['link']
        dbnote.fp_file = uljson['url']
        dbnote.mimetype = uljson['type']
        # Create the RawDocument object.
        dbnote.save()

        # Do tags separately: they are added after save() as in the
        # original flow.
        dbnote.tags.add(*self.NOTE_TAGS)

        print("Sending to GDrive and saving note to database.")
        convert_raw_document(dbnote)
        print("This note is done.")