3 # Copyright (C) 2012 FinalsClub Foundation
7 from django.contrib.auth.models import User
8 from django.conf import settings
9 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
10 from karmaworld.apps.notes.models import UserUploadMapping
11 from karmaworld.apps.notes.models import NoteMarkdown
12 from karmaworld.apps.users.models import NoteKarmaEvent
24 from apiclient.discovery import build
25 from apiclient.http import MediaInMemoryUpload
26 from oauth2client.client import SignedJwtAssertionCredentials
28 import karmaworld.secret.drive as drive
30 logger = logging.getLogger(__name__)
32 PDF_MIMETYPE = 'application/pdf'
33 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
def build_api_service():
    """
    Create a Drive service object authorized as the service account,
    acting on behalf of a user.

    Will target the Google Drive of GOOGLE_USER email address.
    Returns a Google Drive service object.

    Code herein adapted from:
    https://developers.google.com/drive/delegation
    """
    # The service account's address is stored inside the client secret.
    with open(drive.CLIENT_SECRET, 'r') as secret_file:
        client_secret = json.load(secret_file)
    service_account = client_secret['web']['client_email']

    # Load the service's p12 private key.
    with open(drive.SERVICE_KEY, 'rb') as key_file:
        private_key = key_file.read()

    # Authenticate as the service account, delegated so that requests are
    # made against the Google Drive of the GOOGLE_USER.
    delegated_credentials = SignedJwtAssertionCredentials(
        service_account,
        private_key,
        scope='https://www.googleapis.com/auth/drive',
        sub=drive.GOOGLE_USER)

    authorized_http = delegated_credentials.authorize(httplib2.Http())
    return build('drive', 'v2', http=authorized_http)
63 def pdf2html(content):
64 pdf_file = tempfile.NamedTemporaryFile()
65 pdf_file.write(content)
67 tmp_dir = tempfile.gettempdir()
68 html_file_name = uuid.uuid4().hex
69 html_file_path = os.path.join(tmp_dir, html_file_name)
71 command = ['pdf2htmlEX', pdf_file.name, html_file_name]
72 devnull = open('/dev/null', 'w')
74 call = subprocess.Popen(command, shell=False, cwd=tmp_dir, stdout=devnull, stderr=devnull)
76 call = subprocess.Popen(command, shell=False, cwd=tmp_dir)
79 if call.returncode != 0:
80 raise ValueError("PDF file could not be processed")
85 html_file = open(html_file_path, 'r')
86 html = html_file.read()
88 os.remove(html_file_path)
90 raise ValueError("PDF file could not be processed")
93 raise ValueError("PDF file results in empty HTML file")
def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
    """ Download the converted renditions of an uploaded file from gdrive.

        Take in a gdrive service, file_dict from upload, and either an
        extension or mimetype.
        You must provide an `extension` or `mimetype`

        service:   gdrive service object; its HTTP client is used to fetch
                   each export link.
        file_dict: file metadata returned by the upload; must contain
                   u'exportLinks' with at least a 'text/plain' entry.
        extension: optional file extension, matched case-insensitively and
                   with or without the leading dot.
        mimetype:  optional mimetype of the original document.

        Returns a dict keyed by rendition name: always 'text' when its
        download succeeds, plus 'pdf' for PowerPoint files or 'html' for
        anything that is neither PowerPoint nor PDF. A rendition whose
        request does not answer 200 is left out of the dict.
    """
    export_links = file_dict[u'exportLinks']
    download_urls = {'text': export_links['text/plain']}

    if extension:
        extension = extension.lower()
        # Accept the extension with or without its leading dot. The check
        # used to list 'pptx' without the dot, so '.pptx' never matched.
        if not extension.startswith('.'):
            extension = '.' + extension

    if extension in ('.ppt', '.pptx') or mimetype in PPT_MIMETYPES:
        download_urls['pdf'] = export_links['application/pdf']
    elif mimetype == PDF_MIMETYPE:
        # PDFs need no extra rendition; only the plain text is fetched.
        pass
    else:
        download_urls['html'] = export_links['text/html']

    content_dict = {}
    for download_type, download_url in download_urls.items():
        # Single-argument parenthesised print: same output on Python 2.
        print("\n%s -- %s" % (download_type, download_url))
        resp, content = service._http.request(download_url)

        if resp.status == 200:
            print("\t downloaded!")
            # save to the File.property resulting field
            content_dict[download_type] = content
        else:
            print("\t Download failed: %s" % resp.status)

    return content_dict
133 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
134 """ take a gdrive service object, and a media wrapper and upload to gdrive
136 You must provide an `extension` or `mimetype`
138 _resource = {'title': filename}
140 # clean up extensions for type checking
142 extension = extension.lower()
144 # perform OCR on files that are image intensive
145 ocr = extension in ['.pdf', '.jpeg', '.jpg', '.png'] or \
146 mimetype in ['application/pdf']
148 file_dict = service.files().insert(body=_resource, media_body=media,\
149 convert=True, ocr=ocr).execute()
151 # increase exponent of 2 for exponential growth.
152 # 2 ** -1 = 0.5, 2 ** 0 = 1, 2 ** 1 = 2, 4, 8, 16, ...
154 # exponentially wait for exportLinks to be returned if missing
155 while u'exportLinks' not in file_dict or \
156 u'text/plain' not in file_dict[u'exportLinks']:
157 # if a bunch of seconds have passed, give up
159 raise ValueError('Google Drive failed to read the document.')
162 print "upload_check_sleep({0})".format(2. ** delay_exp)
163 time.sleep(2. ** delay_exp)
164 delay_exp = delay_exp + 1
166 # try to get the doc from gdrive
167 file_dict = service.files().get(fileId=file_dict[u'id']).execute()
172 def convert_raw_document(raw_document, user=None):
173 """ Upload a raw document to google drive and get a Note back"""
174 fp_file = raw_document.get_file()
176 # extract some properties from the document metadata
177 filename = raw_document.name
178 print "this is the mimetype of the document to check:"
179 mimetype = raw_document.mimetype
183 # A special case for Evernotes
184 if raw_document.mimetype == 'text/enml':
185 raw_document.mimetype = 'text/html'
187 original_content = fp_file.read()
189 # Include mimetype parameter if there is one to include
190 extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
192 media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
193 resumable=True, **extra_flags)
196 service = build_api_service()
198 # upload to google drive
199 file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
201 # download from google drive
202 content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
204 # this should have already happened, let's see why it hasn't
205 raw_document.is_processed = True
208 note = raw_document.convert_to_note()
210 # Cache the uploaded file's URL
211 note.gdrive_url = file_dict['alternateLink']
213 # Extract HTML from the appropriate place
215 convert_to_markdown = False
216 if raw_document.mimetype == PDF_MIMETYPE:
217 html = pdf2html(original_content)
218 elif raw_document.mimetype in PPT_MIMETYPES:
219 html = pdf2html(content_dict['pdf'])
220 elif 'html' in content_dict and content_dict['html']:
221 html = content_dict['html']
222 convert_to_markdown = True
224 html = note.filter_html(html)
226 # upload the HTML file to static host if it is not already there
227 note.send_to_s3(html, do_save=False)
229 note.text = content_dict['text']
231 if convert_to_markdown:
232 h = html2text.HTML2Text()
235 h.unicode_snob = True
236 markdown = h.handle(html.decode('utf8', 'ignore'))
238 note_markdown = NoteMarkdown(note=note, markdown=markdown)
242 # If we know the user who uploaded this,
243 # associate them with the note
246 NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
249 mapping = UserUploadMapping.objects.get(fp_file=raw_document.fp_file)
250 note.user = mapping.user
252 NoteKarmaEvent.create_event(mapping.user, note, NoteKarmaEvent.UPLOAD)
253 except (ObjectDoesNotExist, MultipleObjectsReturned):
254 logger.info("Zero or multiple mappings found with fp_file " + raw_document.fp_file.name)
256 # Finally, save whatever data we got back from google