closes #253 by removing field, adds backend for #236, and nearly completes #68
[oweals/karmaworld.git] / karmaworld / apps / notes / management / commands / import_ocw_json.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 import json
6 import os.path
7 import requests
8
9 from apps.notes.models import Note
10 from apps.notes.gdrive import convert_raw_document
11 from apps.courses.models import Course
12 from apps.courses.models import School
13 from apps.courses.models import Department
14 from apps.licenses.models import License
15 from apps.document_upload.models import RawDocument
16 from django.core.management.base import BaseCommand
17 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
18
class Command(BaseCommand):
    """Import course notes described by mit-ocw-scraper JSON files.

    For every *.json file in the given directory this command finds or
    creates the Department and Course records, uploads each note link to
    Filepicker (unless a RawDocument for that link already exists), and
    hands the resulting RawDocument to convert_raw_document().
    """
    args = 'directory containing json files from mit-ocw-scraper'
    help = """
           This command will systematically parse all *.json files in the given
           directory and load them into the database as course notes, uploaded
           through Filepicker.

           It is assumed the json files are generated by (or follow the same
           format as) mit-ocw-scraper:
           https://github.com/AndrewMagliozzi/mit-ocw-scraper
           """

    # for now, assume the school is MIT and find by its US DepEd ID.
    # TODO for later, do something more clever
    SCHOOL_USDE_ID = 121415

    # for now, assume license is the default OCW license: CC-BY-NC 3
    # TODO for later, do something more clever.
    LICENSE_NAME = 'cc-by-nc-3.0'

    # Tags attached to every imported RawDocument.
    NOTE_TAGS = ('mit-ocw', 'karma')

    # NOTE: print(...) is always called with a single argument below, so the
    # output is the same whether it is parsed as a Python 2 print statement
    # or as the Python 3 print function.

    def handle(self, *args, **kwargs):
        """Entry point: import every *.json file found in the directory args[0].

        Raises:
            CommandError: if the argument count is not exactly one, if the
                argument is not a directory, or if the expected School or
                License row is missing from the database.
        """
        # Imported here so this block does not rely on the module-level
        # import list being changed.
        from django.core.management.base import CommandError

        if len(args) != 1:
            raise CommandError(
                "Expected exactly one argument, got {0}: please specify a "
                "directory to parse.".format(len(args)))

        # Convert given path to an absolute path, not relative.
        path = os.path.abspath(args[0])

        if not os.path.isdir(path):
            raise CommandError("First argument should be a directory to parse.")

        try:
            dbschool = School.objects.filter(usde_id=self.SCHOOL_USDE_ID)[0]
        except IndexError:
            raise CommandError(
                "No School with usde_id={0} found in the database.".format(
                    self.SCHOOL_USDE_ID))

        try:
            dblicense = License.objects.filter(name=self.LICENSE_NAME)[0]
        except IndexError:
            raise CommandError(
                "No License named '{0}' found in the database.".format(
                    self.LICENSE_NAME))

        # build Filepicker upload URL
        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(
            FILEPICKER_API_KEY)

        # parse each json file and process it for courses and notes.
        for filename in self._find_json_files(path):
            self._import_file(filename, dbschool, dblicense, fpurl)

    def _find_json_files(self, path):
        """Return full paths of the entries in `path` whose name ends in .json.

        The extension match is case-insensitive.
        """
        return [
            os.path.join(path, filename)
            for filename in os.listdir(path)
            if filename.lower().endswith('.json')
        ]

    def _import_file(self, filename, dbschool, dblicense, fpurl):
        """Load one scraper JSON file and import its department and courses."""
        # Parse first and close the file right away; the database and network
        # work below does not need the handle.
        with open(filename, 'r') as jsondata:
            parsed = json.load(jsondata)

        # find the department or create one.
        dbdept = Department.objects.get_or_create(
            name=parsed['subject'],
            school=dbschool,
            url=parsed['departmentLink'],
        )[0]

        for course in parsed['courses']:
            self._import_course(course, dbschool, dbdept, dblicense, fpurl)

    def _import_course(self, course, dbschool, dbdept, dblicense, fpurl):
        """Find or create the Course for one JSON course entry, then import its notes."""
        dbcourse = Course.objects.get_or_create(
            name=course['courseTitle'],
            instructor_name=course['professor'],
            school=dbschool,
        )[0]
        # The department is assigned after the lookup (not used as a lookup
        # key), so an existing course is matched by name/instructor/school.
        dbcourse.department = dbdept
        dbcourse.save()
        print("Course is in the database: {0}".format(dbcourse.name))

        if 'noteLinks' not in course:
            print("No Notes in course.")
            return

        for note in course['noteLinks']:
            self._import_note(note, dbcourse, dblicense, fpurl)

        print("Notes for {0} are done.".format(dbcourse.name))

    def _import_note(self, note, dbcourse, dblicense, fpurl):
        """Upload one note link (if needed) and pass it to convert_raw_document()."""
        link = note['link']

        # Check to see if the Note is already uploaded.
        if Note.objects.filter(upstream_link=link).exists():
            print("Already there, moving on: {0}".format(link))
            return

        # Upload URL of note to Filepicker if it is not already in RawDocument.
        existing = list(RawDocument.objects.filter(upstream_link=link)[:1])
        if existing:
            print("Already uploaded link {0} to FP.".format(link))
            dbnote = existing[0]
        else:
            # https://developers.inkfilepicker.com/docs/web/#inkblob-store
            print("Uploading link {0} to FP.".format(link))
            ulresp = requests.post(fpurl, data={'url': link})
            ulresp.raise_for_status()
            # Filepicker returns JSON, so use that
            uljson = ulresp.json()

            print("Saving raw document to database.")
            dbnote = RawDocument()
            dbnote.course = dbcourse
            dbnote.name = note['fileName']
            dbnote.license = dblicense
            dbnote.upstream_link = link
            dbnote.fp_file = uljson['url']
            dbnote.mimetype = uljson['type']
            dbnote.is_processed = True  # hack to bypass celery
            dbnote.save()

        # Do tags separately
        dbnote.tags.add(*self.NOTE_TAGS)

        print("Sending to GDrive and saving note to database.")
        convert_raw_document(dbnote)
        print("This note is done.")