3 # Copyright (C) 2012 FinalsClub Foundation
13 from apiclient.discovery import build
14 from apiclient.http import MediaFileUpload
15 from apiclient.http import MediaInMemoryUpload
16 from django.conf import settings
17 from django.core.files.base import ContentFile
18 from oauth2client.client import flow_from_clientsecrets
20 from karmaworld.apps.notes.models import DriveAuth, Note
# Path of the OAuth2 client secrets file, resolved under settings.DJANGO_ROOT;
# it is handed to flow_from_clientsecrets() when the auth flow is built.
22 CLIENT_SECRET = os.path.join(settings.DJANGO_ROOT, \
23 'secret/client_secrets.json')
24 #from credentials import GOOGLE_USER # FIXME
# NOTE(review): GOOGLE_USER is imported from secret.drive and then assigned a
# literal. The lines between the two statements are not visible in the
# reviewed excerpt, so confirm which value wins (presumably the literal is
# only a fallback for a failed import).
26 from secret.drive import GOOGLE_USER
28 GOOGLE_USER = 'admin@karmanotes.org' # FIXME
# Extension -> mimetype overrides; not referenced by any line visible in the
# reviewed excerpt.
# NOTE(review): 'application/msword' is the type registered for legacy .doc;
# .docx is normally
# 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'.
30 EXT_TO_MIME = {'.docx': 'application/msword'}
# Mimetypes treated as powerpoint uploads: for these the PDF export is
# downloaded and the resulting note gets file_type 'ppt'.
32 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
# Identify the contents of `fileobj` with the `magic` module and look for a
# document creation year in the resulting description.
# :fileobj: object whose read() returns the file contents
# NOTE(review): several lines of this function, including its return, are not
# visible in the reviewed excerpt; convert_raw_document() expects the result
# to be a mapping with a 'year' key.
# NOTE(review): convert_raw_document() calls fp_file.read() before passing the
# same object here; unless the file is reopened or rewound on a line not
# shown, the read() below would return nothing -- confirm.
34 def extract_file_details(fileobj):
39 filebuf = fileobj.read()
40 with magic.Magic() as m:
41 details = m.id_buffer(filebuf)
# capture four digits from the comma-free text after 'Create Time/Date:'
44 result = re.search(r'Create Time/Date:[^,]+(?P<year>\d{4})', details)
# NOTE(review): re.search() yields None when nothing matches -- confirm a
# guard exists on the lines not shown. The membership test below always
# holds for a match, because 'year' is a named group of the pattern.
46 if 'year' in result.groupdict():
47 year = result.groupdict()['year']
52 """ Create an oauth2 authentication object with our preferred details """
54 'https://www.googleapis.com/auth/drive',
55 'https://www.googleapis.com/auth/drive.file',
56 'https://www.googleapis.com/auth/userinfo.email',
57 'https://www.googleapis.com/auth/userinfo.profile',
60 flow = flow_from_clientsecrets(CLIENT_SECRET, ' '.join(scopes), \
61 redirect_uri='http://localhost:8000/oauth2callback')
62 flow.params['access_type'] = 'offline'
63 flow.params['approval_prompt'] = 'force'
64 flow.params['user_id'] = GOOGLE_USER
69 """ Use an oauth2client flow object to generate the web url to create a new
70 auth that can be then stored """
72 print flow.step1_get_authorize_url()
75 def accept_auth(code):
76 """ Callback endpoint for accepting the post `authorize()` google drive
77 response, and generate a credentials object
78 :code: An authentication token from a WEB oauth dialog
79 returns a oauth2client credentials object """
# NOTE(review): `flow` is not bound on any line visible in the reviewed
# excerpt -- presumably it is built on the omitted line just above; the
# return promised by the docstring is likewise not shown.
# step2_exchange() trades the authorization code for a credentials object.
81 creds = flow.step2_exchange(code)
def build_api_service(creds):
    """ Build a google drive v2 api client authorized with `creds`

    :creds: a credentials object exposing `authorize(http)`
    :returns: a `(service, http)` tuple; `http` is whatever
        `creds.authorize()` handed back for a fresh `httplib2.Http`
    """
    authorized_http = creds.authorize(httplib2.Http())
    drive_service = build('drive', 'v2', http=authorized_http)
    return drive_service, authorized_http
# Expiry check: creds.token_expiry is compared with datetime.datetime.utcnow().
# When the expiry lies in the past, a fresh httplib2.Http is authorized with
# the creds and creds.to_json() is stored on auth.credentials.
# NOTE(review): the refresh call itself, any save of `auth` and the return
# statement are not visible in the reviewed excerpt; both callers unpack the
# result as `creds, auth`.
# NOTE(review): utcnow() is naive, so token_expiry is presumably a naive UTC
# datetime as well -- confirm.
91 def check_and_refresh(creds, auth):
92 """ Check a Credentials object's expiration token
93 if it is out of date, refresh the token and save
94 :creds: a Credentials object
95 :auth: a DriveAuth that backs the cred object
96 :returns: updated creds and auth objects
98 if creds.token_expiry < datetime.datetime.utcnow():
99 # if we are passed the token expiry,
100 # refresh the creds and store them
101 http = httplib2.Http()
102 http = creds.authorize(http)
104 auth.credentials = creds.to_json()
def download_from_gdrive(file_dict, http, extension=None, mimetype=None):
    """ Download the exported renditions of an uploaded file from google drive

    :file_dict: drive file resource; must carry an `exportLinks` mapping
    :http: object whose `request(url, "GET")` returns a `(response, content)`
        pair, the response exposing a `status` attribute
    :extension: optional file extension, leading dot included (any case)
    :mimetype: optional mimetype of the uploaded file
    :returns: dict holding 'text' plus either 'pdf' (powerpoint uploads) or
        'html' (everything else), each mapped to the downloaded content;
        a rendition whose request did not answer 200 is left out
    :raises KeyError: when a needed export link is missing from file_dict
    """
    export_links = file_dict[u'exportLinks']
    download_urls = {'text': export_links['text/plain']}

    # callers may identify the file by mimetype only, leaving extension None
    if extension:
        extension = extension.lower()

    # extensions carry their leading dot (os.path.splitext style), so the
    # powerpoint check has to spell '.pptx' with one as well
    if extension in ['.ppt', '.pptx'] \
            or mimetype in PPT_MIMETYPES:
        download_urls['pdf'] = export_links['application/pdf']
    else:
        download_urls['html'] = export_links['text/html']

    content_dict = {}
    for download_type, download_url in download_urls.items():
        # single-argument print(...) emits the same text as the print
        # statement on python 2; report the url being fetched, not the dict
        print("\n%s -- %s" % (download_type, download_url))
        resp, content = http.request(download_url, "GET")

        if resp.status == 200:
            print("\t downloaded!")
            # keep the payload under the rendition it belongs to
            content_dict[download_type] = content
        else:
            print("\t Download failed: %s" % resp.status)

    return content_dict
# Inserts `media` into drive under the title `filename` with convert=True.
# OCR (ocr=True) is additionally requested when the lowercased extension is
# .pdf/.jpeg/.jpg/.png or the mimetype is application/pdf. When the insert
# response carries no `exportLinks`, the file resource is fetched again by id.
# NOTE(review): `extension` defaults to None yet .lower() is called on it; the
# guard, the branch keyword for the non-OCR insert, anything that happens
# before the re-fetch and the return are on lines not visible in the reviewed
# excerpt -- confirm. Callers use the result as the drive file dict.
137 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
138 """ take a gdrive service object, and a media wrapper and upload to gdrive
140 You must provide an `extension` or `mimetype`
142 _resource = {'title': filename}
144 # clean up extensions for type checking
146 extension = extension.lower()
148 if extension in ['.pdf', '.jpeg', '.jpg', '.png'] \
149 or mimetype in ['application/pdf']:
150 # include OCR on ocr-able files
151 file_dict = service.files().insert(body=_resource, media_body=media, convert=True, ocr=True).execute()
154 file_dict = service.files().insert(body=_resource, media_body=media, convert=True).execute()
156 if u'exportLinks' not in file_dict:
158 # get the doc from gdrive
160 file_dict = service.files().get(fileId=file_dict[u'id']).execute()
# Flow visible here: guess the mimetype from note.note_file.path, wrap the
# file in a resumable MediaFileUpload (1 MiB chunks), load the DriveAuth row
# for GOOGLE_USER, refresh its credentials if needed, upload, download the
# exports, then copy the results onto a fresh Note fetched by id.
# NOTE(review): `filename` is the full path minus its extension
# (os.path.splitext on note_file.path) and is used both as the drive title
# and as the base name handed to pdf_file.save() -- confirm a path is wanted.
# NOTE(review): .all()[0] raises IndexError when no DriveAuth row matches
# GOOGLE_USER.
# NOTE(review): branch keywords (else) and the final save are on lines not
# visible in the reviewed excerpt.
164 def convert_with_google_drive(note):
165 """ Upload a local note and download HTML
167 :note: a File model instance # FIXME
169 # TODO: set the permission of the file to permissive so we can use the
170 # gdrive_url to serve files directly to users
172 # Get file_type and encoding of uploaded file
173 # i.e: file_type = 'text/plain', encoding = None
174 (file_type, encoding) = mimetypes.guess_type(note.note_file.path)
178 if file_type != None:
179 media = MediaFileUpload(note.note_file.path, mimetype=file_type,
180 chunksize=1024*1024, resumable=True)
183 media = MediaFileUpload(note.note_file.path,
184 chunksize=1024*1024, resumable=True)
186 auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
187 creds = auth.transform_to_cred()
190 creds, auth = check_and_refresh(creds, auth)
192 service, http = build_api_service(creds)
194 # get the file extension
195 filename, extension = os.path.splitext(note.note_file.path)
197 file_dict = upload_to_gdrive(service, media, filename, extension)
199 content_dict = download_from_gdrive(file_dict, http, extension)
201 # Get a new copy of the file from the database with the new metadata from filemeta
202 new_note = Note.objects.get(id=note.id)
204 if extension.lower() == '.pdf':
205 new_note.file_type = 'pdf'
207 elif extension.lower() in ['.ppt', '.pptx']:
208 new_note.file_type = 'ppt'
209 new_note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
212 # PPT files do not have this export ability
213 new_note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
214 new_note.html = content_dict['html']
216 new_note.text = content_dict['text']
218 # before we save new html, sanitize a tags in note.html
219 #new_note.sanitize_html(save=False)
220 #FIXME: ^^^ disabled until we can get html out of an Etree html element
222 # Finally, save whatever data we got back from google
225 def convert_raw_document(raw_document):
226 """ Upload a raw document to google drive and get a Note back """
# NOTE(review): branch keywords (else), the final save and any return are on
# lines not visible in the reviewed excerpt; the docstring says a Note comes
# back.
227 fp_file = raw_document.get_file()
229 # download the file to memory
230 # get the file's mimetype
231 #file_type, _ = mimetypes.guess_type(raw_document.fp_file.path)
232 # get the file extension
233 #filename, extension = os.path.splitext(raw_document.fp_file.path)
234 filename = raw_document.name
235 print "this is the mimetype of the document to check:"
236 print raw_document.mimetype
# the whole file is read into memory for the upload; the mimetype argument
# is left out when the document has none recorded
239 if raw_document.mimetype == None:
240 media = MediaInMemoryUpload(fp_file.read(),
241 chunksize=1024*1024, resumable=True)
243 media = MediaInMemoryUpload(fp_file.read(), mimetype=raw_document.mimetype,
244 chunksize=1024*1024, resumable=True)
# NOTE(review): .all()[0] raises IndexError when no DriveAuth row matches
# GOOGLE_USER.
246 auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
247 creds = auth.transform_to_cred()
249 creds, auth = check_and_refresh(creds, auth)
250 service, http = build_api_service(creds)
# no extension is known here, so both helpers decide by mimetype alone
253 file_dict = upload_to_gdrive(service, media, filename, mimetype=raw_document.mimetype)
254 content_dict = download_from_gdrive(file_dict, http, mimetype=raw_document.mimetype)
256 # this should have already happened, lets see why it hasn't
257 raw_document.is_processed = True
260 note = raw_document.convert_to_note()
262 if raw_document.mimetype == 'application/pdf':
263 note.file_type = 'pdf'
265 elif raw_document.mimetype in PPT_MIMETYPES:
266 note.file_type = 'ppt'
267 note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
270 # PPT files do not have this export ability
271 note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
272 note.html = content_dict['html']
274 note.text = content_dict['text']
# NOTE(review): fp_file was already consumed by read() above; unless
# extract_file_details() reopens or rewinds it, nothing is left to inspect
# -- confirm.
276 note_details = extract_file_details(fp_file)
277 if 'year' in note_details and note_details['year']:
278 note.year = note_details['year']
280 # before we save new html, sanitize a tags in note.html
281 #note.sanitize_html(save=False)
282 #FIXME: ^^^ disabled until we can get html out of an Etree html element
284 # Finally, save whatever data we got back from google