3 # Copyright (C) 2012 FinalsClub Foundation
9 from karmaworld.apps.notes.models import Note
10 from karmaworld.apps.notes.gdrive import convert_raw_document
11 from karmaworld.apps.courses.models import Course
12 from karmaworld.apps.courses.models import School
13 from karmaworld.apps.courses.models import Professor
14 from karmaworld.apps.courses.models import Department
15 from karmaworld.apps.licenses.models import License
16 from karmaworld.apps.document_upload.models import RawDocument
17 from django.core.management.base import BaseCommand
19 FILEPICKER_API_KEY = os.environ['FILEPICKER_API_KEY']
21 class Command(BaseCommand):
22 args = 'directory containing json files from mit-ocw-scraper'
24 This command will systematically parse all *.json files in the given
25 directory and load them into the database as course notes, uploaded
28 It is assumed the json files are generated by (or follow the same
29 format as) mit-ocw-scraper:
30 https://github.com/AndrewMagliozzi/mit-ocw-scraper
33 def handle(self, *args, **kwargs):
35 raise ArgumentError("Expected one argument, got none: please specify a directory to parse.")
37 # Convert given path to an absolute path, not relative.
38 path = os.path.abspath(args[0])
40 if not os.path.isdir(path):
41 raise ArgumentError("First argument should be a directory to parse.")
43 # For now, assume the school is MIT and look it up by its US Department of Education (USDE) ID.
44 # TODO for later, do something more clever
45 dbschool = School.objects.filter(usde_id=121415)[0]
47 # for now, assume license is the default OCW license: CC-BY-NC 3
48 # TODO for later, do something more clever.
49 dblicense = License.objects.get_or_create(
51 html='<a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="http://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a>'
54 # build Filepicker upload URL
55 # http://stackoverflow.com/questions/14115280/store-files-to-filepicker-io-from-the-command-line
56 fpurl = 'https://www.filepicker.io/api/store/S3?key={0}'.format(FILEPICKER_API_KEY)
58 # find all *.json files in the given directory
def is_json_file(filename):
    """Return True when `filename` ends with a '.json' extension.

    The comparison is case-insensitive, so 'NOTES.JSON' matches too.
    Only the name is inspected; nothing is checked on disk.
    """
    # endswith() states the intent directly and avoids the magic slice
    # length that had to be kept in sync with the suffix.
    return filename.lower().endswith('.json')
61 json_files = filter(is_json_file, os.listdir(path))
62 # prepend filenames with absolute paths
def full_path_to_file(filename):
    """Return `filename` joined onto the directory being parsed.

    `path` comes from the enclosing scope, where it is set to the
    absolute form of the directory argument.
    """
    # os.path.join avoids a doubled separator when `path` already ends
    # with one (e.g. the filesystem root), which a plain sep.join would
    # produce.
    return os.path.join(path, filename)
65 json_files = map(full_path_to_file, json_files)
67 # parse each json file and process it for courses and notes.
68 for filename in json_files:
69 with open(filename, 'r') as jsondata:
70 # parse JSON into python
71 parsed = json.load(jsondata)
73 # find the department or create one.
75 'name': parsed['subject'],
77 'url': parsed['departmentLink'],
79 dbdept = Department.objects.get_or_create(**dept_info)[0]
82 for course in parsed['courses']:
83 # Assume first hit is always right. Solving the identity
84 # problem by name alone will always be a fool's errand.
85 dbprof = Professor.objects.get_or_create(name=course['professor'])[0]
87 # Associate the professor with the department.
88 # (no need to track the result)
89 ProfessorAffiliation.objects.get_or_create(
93 # Extract the course info
95 'name': course['courseTitle'],
98 # Create or Find the Course object.
99 dbcourse = Course.objects.get_or_create(**course_info)[0]
100 dbcourse.professor = dbprof
101 dbcourse.instructor_name = course['professor']
102 dbcourse.school = dbschool
104 print "Course is in the database: {0}".format(dbcourse.name)
106 ProfessorTaught.objects.get_or_create(
110 if 'noteLinks' not in course or not course['noteLinks']:
111 print "No Notes in course."
114 # process notes for each course
115 for note in course['noteLinks']:
116 # Check to see if the Note is already uploaded.
118 dbnote = Note.objects.filter(upstream_link=url)
120 print "WARNING Skipping Note: Too many notes for {0}".format(url)
124 if dbnote.text and len(dbnote.text) or \
125 dbnote.html and len(dbnote.html):
126 print "Already there, moving on: {0}".format(url)
129 # Partially completed note. Remove it and try
131 dbnote.tags.set() # clear tags
132 dbnote.delete() # delete note
133 print "Found and removed incomplete note {0}.".format(url)
135 # Upload URL of note to Filepicker if it is not already
137 rd_test = RawDocument.objects.filter(upstream_link=url)
139 # https://developers.inkfilepicker.com/docs/web/#inkblob-store
140 print "Uploading link {0} to FP.".format(url)
141 ulresp = requests.post(fpurl, data={
145 ulresp.raise_for_status()
147 print "Failed to upload note: " + str(e)
150 # Filepicker returns JSON, so use that
151 uljson = ulresp.json()
153 print "Saving raw document to database."
154 # Extract the note info
155 dbnote = RawDocument()
156 dbnote.course = dbcourse
157 dbnote.name = note['fileName']
158 dbnote.license = dblicense
159 dbnote.upstream_link = url
160 dbnote.fp_file = uljson['url']
161 dbnote.mimetype = uljson['type']
162 dbnote.is_processed = True # hack to bypass celery
163 # Create the RawDocument object.
166 # Find the right RawDocument
167 print "Already uploaded link {0} to FP.".format(url)
171 dbnote.tags.add('mit-ocw','karma')
173 print "Converting document and saving note to S3."
176 convert_raw_document(dbnote)
177 except ValueError, e:
178 # only catch one specific error
179 if not str(e).startswith('PDF file could not be'):
181 # write the link to file.
182 with open('pdferrors.log', 'a') as pdferrs:
183 pdferrs.write(url + '\n')
185 # delete the partial Note created in convert_raw_document
185 dbnote = Note.objects.filter(upstream_link=url)[0]
188 print "This note errored, so it is removed :("
192 print "Failed: " + str(e)
193 print "Trying again."
196 print "Failed: " + str(e)
200 print "This note is done."
203 print "Notes for {0} are done.".format(dbcourse.name)