3 # Copyright (C) 2012 FinalsClub Foundation
9 from apps.notes.models import Note
10 from apps.notes.gdrive import convert_raw_document
11 from apps.courses.models import Course
12 from apps.courses.models import School
13 from apps.courses.models import Professor
14 from apps.courses.models import Department
15 from apps.courses.models import ProfessorTaught
16 from apps.courses.models import ProfessorAffiliation
17 from apps.licenses.models import License
18 from apps.document_upload.models import RawDocument
19 from django.core.management.base import BaseCommand
20 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
# Django management command (subclass of BaseCommand) that loads the JSON
# output of mit-ocw-scraper into the database as courses and notes.
# NOTE(review): this extract has lost its indentation and carries a stray
# number at the start of every line. Gaps in those numbers suggest some
# lines (guards, try/except headers, closing brackets) are missing here;
# confirm against the full file before changing any code.
22 class Command(BaseCommand):
# Description of the single positional argument this command expects.
23 args = 'directory containing json files from mit-ocw-scraper'
25 This command will systematically parse all *.json files in the given
26 directory and load them into the database as course notes, uploaded
29 It is assumed the json files are generated by (or follow the same
30 format as) mit-ocw-scraper:
31 https://github.com/AndrewMagliozzi/mit-ocw-scraper
# Entry point of the command. args[0] is read as the directory to scan
# for *.json files; **kwargs is not used in the visible code.
# Raises ArgumentError with the messages below; the first raise is
# presumably guarded by an argument-count check that is not visible here.
# NOTE(review): ArgumentError, os, json and requests are used below but are
# not imported in the visible part of the file; confirm they are imported
# on lines not shown here.
# NOTE(review): the print statements and `except ValueError, e` used below
# are Python 2-only syntax.
34 def handle(self, *args, **kwargs):
36 raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
38 # Convert given path to an absolute path, not relative.
39 path = os.path.abspath(args[0])
41 if not os.path.isdir(path):
42 raise ArgumentError("First argument should be a directory to parse.")
44 # for now, assume the school is MIT and find by its US DepEd ID.
45 # TODO for later, do something more clever
# Indexing the filtered queryset with [0] raises IndexError when no School
# with this usde_id exists.
46 dbschool = School.objects.filter(usde_id=121415)[0]
48 # for now, assume license is the default OCW license: CC-BY-NC 3
49 # TODO for later, do something more clever.
# NOTE(review): the comment above says CC-BY-NC 3 but the html below links
# to by-nc/4.0; confirm which license version is intended.
50 dblicense = License.objects.get_or_create(
52 html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
55 # build Filepicker upload URL
56 # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
# The Filepicker API key is embedded in the query string of the store URL.
57 fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
59 # find all *.json files in the given directory
# Case-insensitive comparison of the last five characters of the name.
60 def is_json_file(filename):
61 return filename[-5:].lower() == '.json'
# os.listdir yields bare entry names, so the directory is prepended below.
62 json_files = filter(is_json_file, os.listdir(path))
63 # prepend filenames with absolute paths
64 def full_path_to_file(filename):
65 return os.path.sep.join((path, filename))
66 json_files = map(full_path_to_file, json_files)
68 # parse each json file and process it for courses and notes.
69 for filename in json_files:
70 with open(filename, 'r') as jsondata:
71 # parse JSON into python
72 parsed = json.load(jsondata)
# Top-level JSON keys read by this command: 'subject', 'departmentLink'
# and 'courses'.
74 # find the department or create one.
76 'name': parsed['subject'],
78 'url': parsed['departmentLink'],
# get_or_create returns an (object, created) pair; [0] keeps the object.
80 dbdept = Department.objects.get_or_create(**dept_info)[0]
# Per-course keys read below: 'professor', 'courseTitle' and 'noteLinks'.
83 for course in parsed['courses']:
84 # Assume first hit is always right. Solving the identity
85 # problem by name alone will always be a fool's errand.
86 dbprof = Professor.objects.get_or_create(name=course['professor'])[0]
88 # Associate the professor with the department.
89 # (no need to track the result)
90 ProfessorAffiliation.objects.get_or_create(
94 # Extract the course info
96 'name': course['courseTitle'],
99 # Create or Find the Course object.
100 dbcourse = Course.objects.get_or_create(**course_info)[0]
# NOTE(review): no dbcourse.save() is visible after these assignments in
# this extract (a line appears to be missing before the print); confirm
# the changes are persisted.
101 dbcourse.professor = dbprof
102 dbcourse.instructor_name = course['professor']
103 dbcourse.school = dbschool
105 print "Course is in the database: {0}".format(dbcourse.name)
107 ProfessorTaught.objects.get_or_create(
# A course whose 'noteLinks' entry is missing or empty is reported as
# having no notes.
111 if 'noteLinks' not in course or not course['noteLinks']:
112 print "No Notes in course."
115 # process notes for each course
116 for note in course['noteLinks']:
117 # Check to see if the Note is already uploaded.
# NOTE(review): `url` is assigned on a line not visible here; presumably it
# comes from the current note entry. Confirm.
119 dbnote = Note.objects.filter(upstream_link=url)
121 print "WARNING Skipping Note: Too many notes for {0}".format(url)
# A note counts as already imported when it has non-empty text or html.
125 if dbnote.text and len(dbnote.text) or \
126 dbnote.html and len(dbnote.html):
127 print "Already there, moving on: {0}".format(url)
130 # Partially completed note. Remove it and try
132 dbnote.tags.set() # clear tags
133 dbnote.delete() # delete note
134 print "Found and removed incomplete note {0}.".format(url)
136 # Upload URL of note to Filepicker if it is not already
# RawDocument rows are looked up by upstream_link, the same key used for
# the Note lookup above.
138 rd_test = RawDocument.objects.filter(upstream_link=url)
140 # https://developers.inkfilepicker.com/docs/web/#inkblob-store
141 print "Uploading link {0} to FP.".format(url)
# POST to the Filepicker store URL built earlier; the request payload is
# not fully visible in this extract.
142 ulresp = requests.post(fpurl, data={
# raise_for_status() raises for HTTP 4xx/5xx responses.
146 ulresp.raise_for_status()
148 print "Failed to upload note: " + str(e)
151 # Filepicker returns JSON, so use that
152 uljson = ulresp.json()
154 print "Saving raw document to database."
155 # Extract the note info
# Despite its name, dbnote is a RawDocument here, not a Note.
# Fields read from the Filepicker response: 'url' and 'type'.
156 dbnote = RawDocument()
157 dbnote.course = dbcourse
158 dbnote.name = note['fileName']
159 dbnote.license = dblicense
160 dbnote.upstream_link = url
161 dbnote.fp_file = uljson['url']
162 dbnote.mimetype = uljson['type']
163 dbnote.is_processed = True # hack to bypass celery
164 # Create the RawDocument object.
167 # Find the right RawDocument
168 print "Already uploaded link {0} to FP.".format(url)
# Every imported document is tagged 'mit-ocw' and 'karma'.
172 dbnote.tags.add('mit-ocw','karma')
174 print "Converting document and saving note to S3."
177 convert_raw_document(dbnote)
# Python 2 except syntax; equivalent to `except ValueError as e`.
178 except ValueError, e:
179 # only catch one specific error
# NOTE(review): the branch taken for other ValueError messages is not
# visible here; presumably the error is re-raised. Confirm.
180 if not str(e).startswith('PDF file could not be'):
182 # write the link to file.
# Appends to pdferrors.log, a path relative to the current working directory.
183 with open('pdferrors.log', 'a') as pdferrs:
184 pdferrs.write(url + '\n')
185 # delete the partial Note created in convert_raw_doc
# [0] raises IndexError if no Note with this upstream_link exists.
186 dbnote = Note.objects.filter(upstream_link=url)[0]
189 print "This note errored, so it is removed :("
# NOTE(review): the except clauses and retry logic surrounding the
# following messages are not visible in this extract.
193 print "Failed: " + str(e)
194 print "Trying again."
197 print "Failed: " + str(e)
201 print "This note is done."
204 print "Notes for {0} are done.".format(dbcourse.name)