4e43e344f2218b6421037114cf6155091428659a
[oweals/karmaworld.git] / karmaworld / apps / notes / management / commands / import_ocw_json.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 import json
6 import os.path
7 import requests
8
9 from apps.notes.models import Note
10 from apps.notes.gdrive import convert_raw_document
11 from apps.courses.models import Course
12 from apps.courses.models import School
13 from apps.courses.models import Professor
14 from apps.courses.models import Department
15 from apps.courses.models import ProfessorTaught
16 from apps.courses.models import ProfessorAffiliation
17 from apps.licenses.models import License
18 from apps.document_upload.models import RawDocument
19 from django.core.management.base import BaseCommand
20 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
21
class Command(BaseCommand):
    """Load mit-ocw-scraper JSON dumps into the database as courses and notes.

    For every ``*.json`` file in the directory given on the command line,
    this command finds or creates the Department, Professor and Course rows
    the file describes, asks Filepicker to store each linked note, records
    the upload as a RawDocument and runs ``convert_raw_document`` on it.
    """
    args = 'directory containing json files from mit-ocw-scraper'
    help = """
           This command will systematically parse all *.json files in the given
           directory and load them into the database as course notes, uploaded
           through Filepicker.

           It is assumed the json files are generated by (or follow the same
           format as) mit-ocw-scraper:
           https://github.com/AndrewMagliozzi/mit-ocw-scraper
           """

    # US Department of Education ID used to look up the School that every
    # imported course is attached to (the import assumes MIT for now).
    MIT_USDE_ID = 121415

    # Upper bound on conversion attempts for one note when the failure
    # message keeps mentioning '403'. Without a bound a persistent 403 would
    # keep the command spinning forever.
    MAX_CONVERT_ATTEMPTS = 10

    def handle(self, *args, **kwargs):
        """Import every ``*.json`` file found in the directory ``args[0]``.

        Args:
            *args: exactly one positional argument, the directory to scan.
            **kwargs: command options; not used here.

        Raises:
            CommandError: if the argument count is not exactly one, the
                argument is not a directory, or no School row matches
                ``MIT_USDE_ID``.
        """
        # Function-scope import keeps this class independent of the module's
        # import list.
        from django.core.management.base import CommandError

        if len(args) != 1:
            raise CommandError(
                "Expected one argument, got {0}: please specify a directory "
                "to parse.".format(len(args)))

        # Convert given path to an absolute path, not relative.
        path = os.path.abspath(args[0])

        if not os.path.isdir(path):
            raise CommandError(
                "First argument should be a directory to parse.")

        # for now, assume the school is MIT and find by its US DepEd ID.
        # TODO for later, do something more clever
        try:
            dbschool = School.objects.filter(usde_id=self.MIT_USDE_ID)[0]
        except IndexError:
            raise CommandError(
                "No School with usde_id={0} found; cannot import "
                "courses.".format(self.MIT_USDE_ID))

        # for now, assume license is the default OCW license: CC-BY-NC 3
        # TODO for later, do something more clever.
        # NOTE(review): the name says 3.0 while the HTML links to by-nc/4.0.
        # Both values are the get_or_create lookup, so they are left
        # untouched here to keep matching any existing License row -- confirm
        # which version is intended before changing either.
        dblicense = License.objects.get_or_create(
          name='cc-by-nc-3.0',
          html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
        )[0]

        # build Filepicker upload URL
        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)

        # find all *.json files (case-insensitive extension) in the given
        # directory, as absolute paths
        json_files = [
            os.path.join(path, name)
            for name in os.listdir(path)
            if name.lower().endswith('.json')
        ]

        # parse each json file and process it for courses and notes.
        for filename in json_files:
            # Only hold the file open while parsing; the database and network
            # work below can take a long time.
            with open(filename, 'r') as jsondata:
                parsed = json.load(jsondata)

            # find the department or create one.
            dbdept = Department.objects.get_or_create(
                name=parsed['subject'],
                school=dbschool,
                url=parsed['departmentLink'],
            )[0]

            for course in parsed['courses']:
                self._import_course(course, dbdept, dbschool, dblicense, fpurl)

    def _import_course(self, course, dbdept, dbschool, dblicense, fpurl):
        """Find or create one course, its professor links, and its notes.

        ``course`` is one entry of the parsed file's ``courses`` list; the
        keys read are ``professor``, ``courseTitle`` and the optional
        ``noteLinks``.
        """
        # Assume first hit is always right. Solving the identity
        # problem by name alone will always be a fool's errand.
        dbprof = Professor.objects.get_or_create(name=course['professor'])[0]

        # Associate the professor with the department.
        # (no need to track the result)
        ProfessorAffiliation.objects.get_or_create(
            professor=dbprof,
            department=dbdept)

        # Create or Find the Course object, then refresh the fields that are
        # not part of its lookup.
        dbcourse = Course.objects.get_or_create(
            name=course['courseTitle'],
            department=dbdept)[0]
        dbcourse.professor = dbprof
        dbcourse.instructor_name = course['professor']
        dbcourse.school = dbschool
        dbcourse.save()
        print("Course is in the database: {0}".format(dbcourse.name))

        ProfessorTaught.objects.get_or_create(
            professor=dbprof,
            course=dbcourse)

        note_links = course.get('noteLinks')
        if not note_links:
            print("No Notes in course.")
            return

        # process notes for each course
        for note in note_links:
            self._import_note(note, dbcourse, dblicense, fpurl)

        print("Notes for {0} are done.".format(dbcourse.name))

    def _import_note(self, note, dbcourse, dblicense, fpurl):
        """Upload and convert one note unless a finished copy already exists.

        ``note`` is one entry of a course's ``noteLinks`` list; the keys read
        are ``link`` and ``fileName``.
        """
        # Check to see if the Note is already uploaded.
        url = note['link']
        existing = Note.objects.filter(upstream_link=url)
        existing_count = len(existing)
        if existing_count > 1:
            # More than one Note for the same link is ambiguous; importing
            # again would only add another duplicate.
            print("WARNING Skipping Note: Too many notes for {0}".format(url))
            return
        if existing_count == 1:
            dbnote = existing[0]
            if dbnote.text or dbnote.html:
                print("Already there, moving on: {0}".format(url))
                return
            # Partially completed note. Remove it and try again.
            self._delete_note(dbnote)
            print("Found and removed incomplete note {0}.".format(url))

        raw_doc = self._get_or_upload_raw_document(
            note, url, dbcourse, dblicense, fpurl)
        if raw_doc is None:
            # Upload failed and was already reported.
            return

        # Do tags separately
        raw_doc.tags.add('mit-ocw', 'karma')

        print("Converting document and saving note to S3.")
        self._convert_note(raw_doc, url)

    def _get_or_upload_raw_document(self, note, url, dbcourse, dblicense,
                                    fpurl):
        """Return the RawDocument for ``url``, uploading to Filepicker if new.

        Returns:
            The existing or newly saved RawDocument, or ``None`` when the
            Filepicker request failed (the failure is printed).
        """
        already_uploaded = RawDocument.objects.filter(upstream_link=url)
        if len(already_uploaded):
            # Find the right RawDocument
            print("Already uploaded link {0} to FP.".format(url))
            return already_uploaded[0]

        # https://developers.inkfilepicker.com/docs/web/#inkblob-store
        print("Uploading link {0} to FP.".format(url))
        try:
            # The POST itself is inside the try so a connection failure
            # skips this note the same way an HTTP error status does.
            ulresp = requests.post(fpurl, data={
              'url': url,
            })
            ulresp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Failed to upload note: " + str(e))
            print("Skipping.")
            return None
        # Filepicker returns JSON, so use that
        uljson = ulresp.json()

        print("Saving raw document to database.")
        raw_doc = RawDocument()
        raw_doc.course = dbcourse
        raw_doc.name = note['fileName']
        raw_doc.license = dblicense
        raw_doc.upstream_link = url
        raw_doc.fp_file = uljson['url']
        raw_doc.mimetype = uljson['type']
        raw_doc.is_processed = True # hack to bypass celery
        raw_doc.save()
        return raw_doc

    def _convert_note(self, raw_doc, url):
        """Run ``convert_raw_document``, retrying while the error mentions 403.

        A ValueError whose message starts with 'PDF file could not be' is
        logged to ``pdferrors.log`` and the Notes left behind for ``url`` are
        removed; any other ValueError propagates. Other exceptions are
        printed and end processing of this note, except that errors
        mentioning '403' are retried up to ``MAX_CONVERT_ATTEMPTS`` times.
        """
        for attempt in range(1, self.MAX_CONVERT_ATTEMPTS + 1):
            try:
                convert_raw_document(raw_doc)
            except ValueError as e:
                # only catch one specific error
                if not str(e).startswith('PDF file could not be'):
                    # bare raise keeps the original traceback
                    raise
                # write the link to file.
                with open('pdferrors.log', 'a') as pdferrs:
                    pdferrs.write(url + '\n')
                # The caller only converts when no Note exists for this
                # link, so anything found now is a partial left by the
                # failed conversion. There may be none at all.
                for partial in Note.objects.filter(upstream_link=url):
                    self._delete_note(partial)
                print("This note errored, so it is removed :(")
                return
            except Exception as e:
                print("Failed: " + str(e))
                if '403' not in str(e):
                    print("Aborting.")
                    return
                if attempt == self.MAX_CONVERT_ATTEMPTS:
                    print("Aborting after {0} attempts.".format(attempt))
                    return
                print("Trying again.")
            else:
                print("This note is done.")
                return

    @staticmethod
    def _delete_note(dbnote):
        """Clear the tags of ``dbnote`` and then delete it."""
        dbnote.tags.set() # clear tags
        dbnote.delete() # delete note