Migrate secrets to environment variables
[oweals/karmaworld.git] / karmaworld / apps / notes / management / commands / import_ocw_json.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 import json
6 import os.path
7 import requests
8
9 from karmaworld.apps.notes.models import Note
10 from karmaworld.apps.notes.gdrive import convert_raw_document
11 from karmaworld.apps.courses.models import Course
12 from karmaworld.apps.courses.models import School
13 from karmaworld.apps.courses.models import Professor
14 from karmaworld.apps.courses.models import Department
15 from karmaworld.apps.licenses.models import License
16 from karmaworld.apps.document_upload.models import RawDocument
17 from django.core.management.base import BaseCommand
18
# Names this command needs that the file's import block does not provide.
# NOTE(review): ProfessorAffiliation and ProfessorTaught are assumed to live
# in courses.models next to Professor -- confirm the module path.
from django.core.management.base import CommandError
from karmaworld.apps.courses.models import ProfessorAffiliation
from karmaworld.apps.courses.models import ProfessorTaught

# Filepicker API key, read from the environment when this module is imported;
# a missing variable raises KeyError at import time.
FILEPICKER_API_KEY = os.environ['FILEPICKER_API_KEY']


class Command(BaseCommand):
    """Import courses and notes described by mit-ocw-scraper JSON files.

    Every ``*.json`` file in the given directory is parsed; its department,
    professors and courses are created (or found) in the database, and each
    note link is uploaded through Filepicker, stored as a RawDocument and
    passed to ``convert_raw_document``.

    All ``print(...)`` calls take a single argument so that they behave the
    same under the Python 2 print statement and the Python 3 print function.
    """
    args = 'directory containing json files from mit-ocw-scraper'
    help = """
           This command will systematically parse all *.json files in the given
           directory and load them into the database as course notes, uploaded
           through Filepicker.

           It is assumed the json files are generated by (or follow the same
           format as) mit-ocw-scraper:
           https://github.com/AndrewMagliozzi/mit-ocw-scraper
           """

    # Upper bound on convert_raw_document() attempts for a single note, so
    # that a persistent "403" failure cannot retry forever.
    max_convert_attempts = 10

    def handle(self, *args, **kwargs):
        """Run the import.

        args must contain exactly one item: the directory holding the JSON
        files. Raises CommandError if the argument is missing or is not a
        directory, or if the MIT School row does not exist.
        """
        if len(args) != 1:
            raise CommandError(
                "Expected exactly one argument, got {0}: please specify a "
                "directory to parse.".format(len(args)))

        # Convert given path to an absolute path, not relative.
        path = os.path.abspath(args[0])

        if not os.path.isdir(path):
            raise CommandError("First argument should be a directory to parse.")

        # for now, assume the school is MIT and find by its US DepEd ID.
        # TODO for later, do something more clever
        try:
            dbschool = School.objects.filter(usde_id=121415)[0]
        except IndexError:
            raise CommandError(
                "No School with usde_id=121415 (MIT) found in the database.")

        # for now, assume license is the default OCW license: CC-BY-NC 3
        # TODO for later, do something more clever.
        # NOTE(review): the name says 3.0 but the HTML links to by-nc/4.0.
        # Left untouched because both values are get_or_create lookup keys.
        dblicense = License.objects.get_or_create(
            name='cc-by-nc-3.0',
            html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
        )[0]

        # build Filepicker upload URL
        # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
        fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)

        # find all *.json files in the given directory, as absolute paths
        json_files = [
            os.path.join(path, filename)
            for filename in os.listdir(path)
            if filename.lower().endswith('.json')
        ]

        # parse each json file and process it for courses and notes.
        for filename in json_files:
            # Only hold the file open while parsing it, not for the whole
            # upload/convert run that follows.
            with open(filename, 'r') as jsondata:
                parsed = json.load(jsondata)
            self._import_department(parsed, dbschool, dblicense, fpurl)

    def _import_department(self, parsed, dbschool, dblicense, fpurl):
        """Find or create the department of one parsed file, then its courses."""
        dbdept = Department.objects.get_or_create(
            name=parsed['subject'],
            school=dbschool,
            url=parsed['departmentLink'],
        )[0]

        for course in parsed['courses']:
            self._import_course(course, dbdept, dbschool, dblicense, fpurl)

    def _import_course(self, course, dbdept, dbschool, dblicense, fpurl):
        """Find or create one course with its professor, then import its notes."""
        # Assume first hit is always right. Solving the identity
        # problem by name alone will always be a fool's errand.
        dbprof = Professor.objects.get_or_create(name=course['professor'])[0]

        # Associate the professor with the department.
        # (no need to track the result)
        ProfessorAffiliation.objects.get_or_create(
            professor=dbprof,
            department=dbdept)

        # Create or Find the Course object.
        dbcourse = Course.objects.get_or_create(
            name=course['courseTitle'],
            department=dbdept,
        )[0]
        dbcourse.professor = dbprof
        dbcourse.instructor_name = course['professor']
        dbcourse.school = dbschool
        dbcourse.save()
        print("Course is in the database: {0}".format(dbcourse.name))

        ProfessorTaught.objects.get_or_create(
            professor=dbprof,
            course=dbcourse)

        if not course.get('noteLinks'):
            print("No Notes in course.")
            return

        # process notes for each course
        for note in course['noteLinks']:
            self._import_note(note, dbcourse, dblicense, fpurl)

        print("Notes for {0} are done.".format(dbcourse.name))

    def _import_note(self, note, dbcourse, dblicense, fpurl):
        """Upload and convert one note link unless a finished Note exists."""
        url = note['link']

        # Check to see if the Note is already uploaded.
        existing = Note.objects.filter(upstream_link=url)
        found = len(existing)
        if found > 1:
            # More than one Note for the same link is ambiguous; do not add
            # yet another one on top.
            print("WARNING Skipping Note: Too many notes for {0}".format(url))
            return
        if found == 1:
            dbnote = existing[0]
            if dbnote.text or dbnote.html:
                print("Already there, moving on: {0}".format(url))
                return
            # Partially completed note. Remove it and try again.
            self._delete_note(dbnote)
            print("Found and removed incomplete note {0}.".format(url))

        raw_doc = self._get_or_upload_raw_document(
            note, url, dbcourse, dblicense, fpurl)
        if raw_doc is None:
            # Upload failed; already reported.
            return

        # Do tags separately
        raw_doc.tags.add('mit-ocw', 'karma')

        print("Converting document and saving note to S3.")
        self._convert(raw_doc, url)

    def _get_or_upload_raw_document(self, note, url, dbcourse, dblicense, fpurl):
        """Return the RawDocument for url, uploading to Filepicker if needed.

        Returns None when the upload request fails.
        """
        uploaded = RawDocument.objects.filter(upstream_link=url)
        if len(uploaded):
            # Find the right RawDocument
            print("Already uploaded link {0} to FP.".format(url))
            return uploaded[0]

        # https://developers.inkfilepicker.com/docs/web/#inkblob-store
        print("Uploading link {0} to FP.".format(url))
        try:
            # The POST itself sits inside the try so that a connection error
            # skips this note just like an HTTP error status does.
            ulresp = requests.post(fpurl, data={
                'url': url,
            })
            ulresp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Failed to upload note: " + str(e))
            print("Skipping.")
            return None
        # Filepicker returns JSON, so use that
        uljson = ulresp.json()

        print("Saving raw document to database.")
        raw_doc = RawDocument()
        raw_doc.course = dbcourse
        raw_doc.name = note['fileName']
        raw_doc.license = dblicense
        raw_doc.upstream_link = url
        raw_doc.fp_file = uljson['url']
        raw_doc.mimetype = uljson['type']
        raw_doc.is_processed = True # hack to bypass celery
        raw_doc.save()
        return raw_doc

    def _convert(self, raw_doc, url):
        """Convert raw_doc into a Note, retrying failures that mention 403.

        A ValueError whose message starts with 'PDF file could not be' is
        logged to pdferrors.log and any Note left behind for url is removed;
        every other ValueError propagates. Other exceptions are reported and
        retried only while their message contains '403' and attempts remain.
        """
        attempts_left = self.max_convert_attempts
        while attempts_left > 0:
            attempts_left -= 1
            try:
                convert_raw_document(raw_doc)
            except ValueError as e:
                # only catch one specific error
                if not str(e).startswith('PDF file could not be'):
                    # bare raise keeps the original traceback
                    raise
                # write the link to file.
                with open('pdferrors.log', 'a') as pdferrs:
                    pdferrs.write(url + '\n')
                # delete the partial Note created in convert_raw_doc, if any;
                # iterating avoids an IndexError when none was created.
                for partial in Note.objects.filter(upstream_link=url):
                    self._delete_note(partial)
                print("This note errored, so it is removed :(")
                return
            except Exception as e:
                print("Failed: " + str(e))
                if '403' in str(e) and attempts_left > 0:
                    print("Trying again.")
                    continue
                print("Aborting.")
                return
            else:
                print("This note is done.")
                return

    def _delete_note(self, dbnote):
        """Clear a Note's tags and delete it."""
        dbnote.tags.set() # clear tags
        dbnote.delete() # delete note