3 # Copyright (C) 2012 FinalsClub Foundation
8 from django.contrib.auth.models import User
9 from django.conf import settings
10 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
11 from karmaworld.apps.notes.models import UserUploadMapping
12 from karmaworld.apps.notes.models import NoteMarkdown
13 from karmaworld.apps.notes import sanitizer
14 from karmaworld.apps.quizzes.models import Keyword
15 from karmaworld.apps.users.models import NoteKarmaEvent
27 from apiclient.discovery import build
28 from apiclient.http import MediaInMemoryUpload
29 from oauth2client.client import SignedJwtAssertionCredentials
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# MIME types that the Drive helpers in this module branch on.
PDF_MIMETYPE = 'application/pdf'
PPT_MIMETYPES = [
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
]

# Google credentials are read from the environment at import time; a missing
# variable raises KeyError while this module is being imported.
#   GOOGLE_CLIENT_SECRETS     -- JSON text (parsed by build_api_service)
#   GOOGLE_SERVICE_KEY_BASE64 -- base64 text of the service's p12 private key
#   GOOGLE_USER               -- email address whose Google Drive is targeted
GOOGLE_CLIENT_SECRETS = os.environ['GOOGLE_CLIENT_SECRETS']
GOOGLE_SERVICE_KEY_BASE64 = os.environ['GOOGLE_SERVICE_KEY_BASE64']
GOOGLE_USER = os.environ['GOOGLE_USER']

def build_api_service():
    """
    Build and return a Drive service object authorized with the service
    account that acts on behalf of the given user.

    Will target the Google Drive of GOOGLE_USER email address.
    The service account address is read from the 'web' / 'client_email'
    entry of the GOOGLE_CLIENT_SECRETS JSON, and its p12 private key is
    decoded from GOOGLE_SERVICE_KEY_BASE64.

    Returns a Google Drive (API v2) service object.

    Code herein adapted from:
    https://developers.google.com/drive/delegation
    """

    # Extract the service address from the client secret
    service_user = json.loads(GOOGLE_CLIENT_SECRETS)['web']['client_email']

    # Pull in the service's p12 private key.
    # b64decode is used instead of base64.decodestring, which is a deprecated
    # alias that newer Python versions no longer provide.
    p12 = base64.b64decode(GOOGLE_SERVICE_KEY_BASE64)

    # NOTE(review): the delegated user is passed as `sub`, following the
    # delegation guide referenced above -- confirm the keyword against the
    # installed oauth2client version.
    credentials = SignedJwtAssertionCredentials(
        service_user, p12,
        scope='https://www.googleapis.com/auth/drive',
        sub=GOOGLE_USER)

    return build('drive', 'v2', http=credentials.authorize(httplib2.Http()))


def pdf2html(content):
    """
    Convert the raw bytes of a PDF into HTML by running `pdf2htmlEX`.

    `content` is written to a temporary file, `pdf2htmlEX` is run from the
    system temp directory with a random (uuid4 hex) output file name, and the
    generated file is read back and then deleted.

    Returns the generated HTML as a string.

    Raises ValueError if pdf2htmlEX exits with a non-zero status, if its
    output file cannot be read, or if the output is empty.
    """
    tmp_dir = tempfile.gettempdir()
    html_file_name = uuid.uuid4().hex
    html_file_path = os.path.join(tmp_dir, html_file_name)

    # NOTE(review): assumes the quiet mode is selected by a `TESTING` Django
    # setting -- confirm the setting name. When it is absent, converter output
    # is left attached to this process's stdout/stderr.
    quiet = getattr(settings, 'TESTING', False)

    try:
        # The context managers guarantee that the temporary PDF and the
        # null-device handle are released even if launching the tool fails.
        with tempfile.NamedTemporaryFile() as pdf_file:
            pdf_file.write(content)
            # Make sure every byte is on disk before the external tool reads it
            pdf_file.flush()

            # The output name is relative: pdf2htmlEX runs with cwd=tmp_dir
            command = ['pdf2htmlEX', pdf_file.name, html_file_name]
            with open(os.devnull, 'w') as devnull:
                if quiet:
                    call = subprocess.Popen(command, shell=False, cwd=tmp_dir,
                                            stdout=devnull, stderr=devnull)
                else:
                    call = subprocess.Popen(command, shell=False, cwd=tmp_dir)
                call.wait()

        if call.returncode != 0:
            raise ValueError("PDF file could not be processed")

        try:
            with open(html_file_path, 'r') as html_file:
                html = html_file.read()
        except IOError:
            raise ValueError("PDF file could not be processed")
    finally:
        # Best-effort removal of the generated file on success and failure
        # alike; it may not exist if the conversion never produced output.
        try:
            os.remove(html_file_path)
        except OSError:
            pass

    if not html:
        raise ValueError("PDF file results in empty HTML file")

    return html


def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
    """ Take in a gdrive service, file_dict from upload, and either an
        extension or mimetype.
        You must provide an `extension` or `mimetype`
        Returns contextual files from google

        The plain-text export is always requested. PowerPoint uploads
        (by extension or mimetype) additionally request the PDF export;
        anything that is neither PowerPoint nor PDF requests the HTML export.

        Returns a dict mapping 'text' / 'pdf' / 'html' to the downloaded
        content. Exports whose download does not answer with status 200 are
        left out of the dict.
        Raises KeyError if `file_dict` lacks a required export link.
    """
    export_links = file_dict[u'exportLinks']
    download_urls = {'text': export_links['text/plain']}

    # `extension` is optional (callers may pass only a mimetype), so it is
    # normalized only when one was actually given.
    if extension:
        extension = extension.lower()

    # Both extensions carry their leading dot so that '.pptx' matches too.
    if extension in ('.ppt', '.pptx') or mimetype in PPT_MIMETYPES:
        download_urls['pdf'] = export_links['application/pdf']
    elif mimetype != PDF_MIMETYPE:
        # NOTE(review): PDF uploads are presumed to need no extra export here
        # -- confirm against the caller.
        download_urls['html'] = export_links['text/html']

    content_dict = {}
    for download_type, download_url in download_urls.items():
        print("\n%s -- %s" % (download_type, download_url))
        resp, content = service._http.request(download_url)

        if resp.status == 200:
            print("\t downloaded!")
            # save to the File.property resulting field
            content_dict[download_type] = content
        else:
            print("\t Download failed: %s" % resp.status)

    return content_dict


def upload_to_gdrive(service, media, filename, extension=None, mimetype=None,
                     max_delay_exp=7):
    """ take a gdrive service object, and a media wrapper and upload to gdrive

        You must provide an `extension` or `mimetype`

        The file is inserted with conversion enabled; OCR is requested for
        PDFs and common image extensions. If the returned metadata does not
        yet contain a 'text/plain' export link, the file is re-fetched with
        exponentially growing pauses (0.5s, 1s, 2s, ...).

        `max_delay_exp` is the exponent of 2 at which polling gives up.
        NOTE(review): the default of 7 (roughly two minutes of total waiting)
        is an assumption -- confirm against the intended limit.

        Returns the file metadata dict, including `exportLinks`.
        Raises ValueError if the export links never appear.
    """
    _resource = {'title': filename}

    # clean up extensions for type checking
    # (`extension` is optional, so only normalize it when given)
    if extension:
        extension = extension.lower()

    # perform OCR on files that are image intensive
    ocr = extension in ['.pdf', '.jpeg', '.jpg', '.png'] or \
        mimetype in ['application/pdf']

    file_dict = service.files().insert(body=_resource, media_body=media,
                                       convert=True, ocr=ocr).execute()

    # increase exponent of 2 for exponential growth.
    # 2 ** -1 = 0.5, 2 ** 0 = 1, 2 ** 1 = 2, 4, 8, 16, ...
    delay_exp = -1
    # exponentially wait for exportLinks to be returned if missing
    while u'exportLinks' not in file_dict or \
            u'text/plain' not in file_dict[u'exportLinks']:
        # if a bunch seconds have passed, give up
        if delay_exp >= max_delay_exp:
            raise ValueError('Google Drive failed to read the document.')

        # wait before asking again
        print("upload_check_sleep({0})".format(2. ** delay_exp))
        time.sleep(2. ** delay_exp)
        delay_exp = delay_exp + 1

        # try to get the doc from gdrive
        file_dict = service.files().get(fileId=file_dict[u'id']).execute()

    return file_dict


173 def convert_raw_document(raw_document, user=None):
174 """ Upload a raw document to google drive and get a Note back"""
175 fp_file = raw_document.get_file()
177 # extract some properties from the document metadata
178 filename = raw_document.name
179 print "this is the mimetype of the document to check:"
180 mimetype = raw_document.mimetype
184 # A special case for Evernotes
185 if raw_document.mimetype == 'text/enml':
186 raw_document.mimetype = 'text/html'
188 original_content = fp_file.read()
190 # Include mimetype parameter if there is one to include
191 extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
193 media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
194 resumable=True, **extra_flags)
197 service = build_api_service()
199 # upload to google drive
200 file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
202 # download from google drive
203 content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
205 # this should have already happened, lets see why it hasn't
206 raw_document.is_processed = True
209 note = raw_document.convert_to_note()
211 # Cache the uploaded file's URL
212 note.gdrive_url = file_dict['alternateLink']
213 note.text = content_dict['text']
215 # Extract HTML from the appropriate place
217 convert_to_markdown = False
218 if raw_document.mimetype == PDF_MIMETYPE:
219 html = pdf2html(original_content)
220 elif raw_document.mimetype in PPT_MIMETYPES:
221 html = pdf2html(content_dict['pdf'])
222 elif 'html' in content_dict and content_dict['html']:
223 html = content_dict['html']
226 html = sanitizer.data_uris_to_s3(html)
227 NoteMarkdown.objects.create(note=note, html=html)
229 # If we know the user who uploaded this,
230 # associate them with the note
231 if user and not user.is_anonymous():
233 NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
236 mapping = UserUploadMapping.objects.get(fp_file=raw_document.fp_file)
237 note.user = mapping.user
239 NoteKarmaEvent.create_event(mapping.user, note, NoteKarmaEvent.UPLOAD)
240 except (ObjectDoesNotExist, MultipleObjectsReturned):
241 logger.info("Zero or multiple mappings found with fp_file " + raw_document.fp_file.name)
243 # Finally, save whatever data we got back from google