3 # Copyright (C) 2012 FinalsClub Foundation
8 from django.contrib.auth.models import User
9 from django.conf import settings
10 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
11 from karmaworld.apps.notes.models import UserUploadMapping
12 from karmaworld.apps.notes.models import NoteMarkdown
13 from karmaworld.apps.notes import sanitizer
14 from karmaworld.apps.quizzes.models import Keyword
15 from karmaworld.apps.users.models import NoteKarmaEvent
27 from apiclient.discovery import build
28 from apiclient.http import MediaInMemoryUpload
29 from oauth2client.client import SignedJwtAssertionCredentials
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# MIME types that get special handling when converting uploads.
PDF_MIMETYPE = 'application/pdf'
PPT_MIMETYPES = [
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
]

# Google service-account settings, read from the environment at import time.
# Indexing os.environ raises KeyError if any of these variables is unset.
GOOGLE_SERVICE_EMAIL = os.environ['GOOGLE_SERVICE_EMAIL']
GOOGLE_SERVICE_KEY_BASE64 = os.environ['GOOGLE_SERVICE_KEY_BASE64']
GOOGLE_USER = os.environ['GOOGLE_USER']

def build_api_service():
    """
    Build and return a Drive service object authorized with the service
    account that acts on behalf of the given user.

    Will target the Google Drive of GOOGLE_USER email address.
    Returns a Google Drive (v2) service object.

    Code herein adapted from:
    https://developers.google.com/drive/delegation
    """
    # Pull in the service's p12 private key, which is stored base64-encoded.
    # b64decode replaces the deprecated base64.decodestring alias.
    p12 = base64.b64decode(GOOGLE_SERVICE_KEY_BASE64)

    # NOTE(review): the tail of this call was not visible when this block was
    # reviewed; `sub=GOOGLE_USER` is assumed from the docstring ("act on
    # behalf of the given user") and the delegation guide -- confirm.
    credentials = SignedJwtAssertionCredentials(
        GOOGLE_SERVICE_EMAIL,
        p12,
        scope='https://www.googleapis.com/auth/drive',
        sub=GOOGLE_USER)

    return build('drive', 'v2', http=credentials.authorize(httplib2.Http()))


def pdf2html(content):
    """ Convert the raw bytes of a PDF into HTML using the external
        `pdf2htmlEX` command.

        `content` is written to a temporary file, pdf2htmlEX is run with the
        system temp directory as its working directory, and the generated
        HTML file is read back and then deleted.

        Returns the generated HTML as a string.
        Raises ValueError if pdf2htmlEX exits non-zero, if its output file
        cannot be read, or if the output is empty.
    """
    tmp_dir = tempfile.gettempdir()
    html_file_name = uuid.uuid4().hex
    html_file_path = os.path.join(tmp_dir, html_file_name)

    # NOTE(review): two launch variants exist (silenced and not silenced) but
    # the condition choosing between them was not visible when this block was
    # reviewed; it is assumed to be a `TESTING` flag in settings -- confirm.
    silence_output = getattr(settings, 'TESTING', False)

    # The context manager guarantees the temp file is closed (and therefore
    # deleted) even when the conversion fails.
    with tempfile.NamedTemporaryFile() as pdf_file:
        pdf_file.write(content)
        # pdf2htmlEX opens the file by name, so buffered bytes must be
        # flushed to disk before it runs.
        pdf_file.flush()

        command = ['pdf2htmlEX', pdf_file.name, html_file_name]
        if silence_output:
            with open(os.devnull, 'w') as devnull:
                returncode = subprocess.call(command, shell=False, cwd=tmp_dir,
                                             stdout=devnull, stderr=devnull)
        else:
            returncode = subprocess.call(command, shell=False, cwd=tmp_dir)

    if returncode != 0:
        raise ValueError("PDF file could not be processed")

    try:
        with open(html_file_path, 'r') as html_file:
            html = html_file.read()
        os.remove(html_file_path)
    except IOError:
        raise ValueError("PDF file could not be processed")

    if not html:
        raise ValueError("PDF file results in empty HTML file")

    return html


def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
    """ Take in a gdrive service, file_dict from upload, and either an
        extension or mimetype.
        You must provide an `extension` or `mimetype`

        The plain-text export is always requested. PowerPoint files (matched
        by extension or mimetype) additionally get the PDF export, PDFs get
        nothing extra, and everything else gets the HTML export.

        `extension` is matched case-insensitively, with or without the
        leading dot ('.pptx' and 'pptx' are both accepted).

        Returns a dict mapping 'text' / 'pdf' / 'html' to the downloaded
        content. Downloads that do not answer with HTTP 200 are left out.
        Raises KeyError if `file_dict` lacks a needed export link.
    """
    export_links = file_dict[u'exportLinks']
    download_urls = {'text': export_links['text/plain']}

    # Normalise so that '.PPTX', '.pptx' and 'pptx' all compare equal; the
    # previous list ['.ppt', 'pptx'] never matched the dotted '.pptx' form.
    normalized_ext = extension.lower().lstrip('.') if extension else None

    # NOTE(review): the PDF branch body was not visible when this block was
    # reviewed; it is assumed to request no additional export -- confirm.
    if normalized_ext in ('ppt', 'pptx') or mimetype in PPT_MIMETYPES:
        download_urls['pdf'] = export_links['application/pdf']
    elif mimetype != PDF_MIMETYPE:
        download_urls['html'] = export_links['text/html']

    content_dict = {}
    for download_type, download_url in download_urls.items():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\n%s -- %s" % (download_type, download_url))
        resp, content = service._http.request(download_url)

        if resp.status == 200:
            print("\t downloaded!")
            # save to the File.property resulting field
            content_dict[download_type] = content
        else:
            print("\t Download failed: %s" % resp.status)

    return content_dict


def upload_to_gdrive(service, media, filename, extension=None, mimetype=None,
                     max_delay_exp=7):
    """ take a gdrive service object, and a media wrapper and upload to gdrive

        You must provide an `extension` or `mimetype`

        The file is inserted with conversion enabled; OCR is requested for
        PDFs and common image extensions. `extension` is matched
        case-insensitively, with or without the leading dot.

        After the insert, the file metadata is polled with exponentially
        growing sleeps (0.5s, 1s, 2s, ...) until it exposes a 'text/plain'
        entry in 'exportLinks'.

        `max_delay_exp` is the exponent of 2 at which polling gives up.
        NOTE(review): the original give-up threshold was not visible when
        this block was reviewed; the default of 7 is an assumption -- confirm.

        Returns the file_dict reported by Google Drive.
        Raises ValueError if the export links never appear.
    """
    _resource = {'title': filename}

    # clean up extensions for type checking
    normalized_ext = extension.lower().lstrip('.') if extension else None

    # perform OCR on files that are image intensive
    ocr = (normalized_ext in ('pdf', 'jpeg', 'jpg', 'png')
           or mimetype == 'application/pdf')

    file_dict = service.files().insert(body=_resource, media_body=media,
                                       convert=True, ocr=ocr).execute()

    # increase exponent of 2 for exponential growth.
    # 2 ** -1 = 0.5, 2 ** 0 = 1, 2 ** 1 = 2, 4, 8, 16, ...
    delay_exp = -1
    # exponentially wait for exportLinks to be returned if missing
    while (u'exportLinks' not in file_dict or
           u'text/plain' not in file_dict[u'exportLinks']):
        # if a bunch of seconds have passed, give up; >= (not ==) so the
        # loop still terminates if the threshold is below the start value
        if delay_exp >= max_delay_exp:
            raise ValueError('Google Drive failed to read the document.')

        # wait some seconds
        delay = 2. ** delay_exp
        print("upload_check_sleep({0})".format(delay))
        time.sleep(delay)
        delay_exp += 1

        # try to get the doc from gdrive
        file_dict = service.files().get(fileId=file_dict[u'id']).execute()

    return file_dict


170 def convert_raw_document(raw_document, user=None):
171 """ Upload a raw document to google drive and get a Note back"""
172 fp_file = raw_document.get_file()
174 # extract some properties from the document metadata
175 filename = raw_document.name
176 print "this is the mimetype of the document to check:"
177 mimetype = raw_document.mimetype
181 # A special case for Evernotes
182 if raw_document.mimetype == 'text/enml':
183 raw_document.mimetype = 'text/html'
185 original_content = fp_file.read()
187 # Include mimetype parameter if there is one to include
188 extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
190 media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
191 resumable=True, **extra_flags)
194 service = build_api_service()
196 # upload to google drive
197 file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
199 # download from google drive
200 content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
202 # this should have already happened, lets see why it hasn't
203 raw_document.is_processed = True
206 note = raw_document.convert_to_note()
208 # Cache the uploaded file's URL
209 note.gdrive_url = file_dict['alternateLink']
210 note.text = content_dict['text']
212 # Extract HTML from the appropriate place
214 if raw_document.mimetype == PDF_MIMETYPE:
215 html = pdf2html(original_content)
216 elif raw_document.mimetype in PPT_MIMETYPES:
217 html = pdf2html(content_dict['pdf'])
218 elif 'html' in content_dict and content_dict['html']:
219 html = content_dict['html']
222 html = sanitizer.data_uris_to_s3(html)
223 NoteMarkdown.objects.create(note=note, html=html)
225 # If we know the user who uploaded this,
226 # associate them with the note
227 if user and not user.is_anonymous():
229 NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
232 mapping = UserUploadMapping.objects.get(fp_file=raw_document.fp_file)
233 note.user = mapping.user
235 NoteKarmaEvent.create_event(mapping.user, note, NoteKarmaEvent.UPLOAD)
236 except (ObjectDoesNotExist, MultipleObjectsReturned):
237 logger.info("Zero or multiple mappings found with fp_file " + raw_document.fp_file.name)
239 # Finally, save whatever data we got back from google