3 # Copyright (C) 2012 FinalsClub Foundation
6 from io import FileIO, BufferedWriter
12 from apiclient.discovery import build
13 from apiclient.http import MediaFileUpload
14 from apiclient.http import MediaInMemoryUpload
15 from django.conf import settings
16 from django.core.files import File
17 from oauth2client.client import flow_from_clientsecrets
19 from karmaworld.apps.notes.models import DriveAuth, Note
# Location of the OAuth2 client-secrets JSON, resolved under the Django root.
CLIENT_SECRET = os.path.join(settings.DJANGO_ROOT,
                             'secret/client_secrets.json')

#from credentials import GOOGLE_USER # FIXME
# Use the account configured in `secret.drive` when that module is importable
# and fall back to the default address only when it is not, so the fallback
# can never overwrite a configured value.
try:
    from secret.drive import GOOGLE_USER
except ImportError:
    GOOGLE_USER = 'admin@karmanotes.org' # FIXME

# Extension -> mimetype overrides (not referenced in the code visible here).
EXT_TO_MIME = {'.docx': 'application/msword'}

# Mimetypes that are handled as PowerPoint presentations.
PPT_MIMETYPES = [
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
]
def build_flow():
    """ Create an oauth2 authentication object with our preferred details

        NOTE(review): the `def` line of this factory is missing from the text
        under review; the name `build_flow` is reconstructed -- confirm it
        against any callers outside this section.

        :returns: flow object built from CLIENT_SECRET for GOOGLE_USER
    """
    scopes = [
        'https://www.googleapis.com/auth/drive',
        'https://www.googleapis.com/auth/drive.file',
        'https://www.googleapis.com/auth/userinfo.email',
        'https://www.googleapis.com/auth/userinfo.profile',
    ]

    flow = flow_from_clientsecrets(
        CLIENT_SECRET, ' '.join(scopes),
        redirect_uri='http://localhost:8000/oauth2callback')
    # Ask for offline access and always show the approval prompt; per Google's
    # OAuth 2.0 documentation this is what yields a refresh token.
    flow.params['access_type'] = 'offline'
    flow.params['approval_prompt'] = 'force'
    flow.params['user_id'] = GOOGLE_USER
    return flow


def authorize():
    """ Use an oauth2client flow object to generate the web url to create a new
        auth that can be then stored

        The url is printed, not returned.
    """
    flow = build_flow()
    print(flow.step1_get_authorize_url())


def accept_auth(code):
    """ Callback endpoint for accepting the post `authorize()` google drive
        response, and generate a credentials object

        :code: An authentication token from a WEB oauth dialog
        :returns: an oauth2client credentials object
    """
    # `flow` was never bound in the visible body; build it the same way
    # `authorize()` does so both steps share identical flow settings.
    flow = build_flow()
    creds = flow.step2_exchange(code)
    return creds
def build_api_service(creds):
    """ Build a Drive v2 API client whose requests are authorized by `creds`

        :creds: credentials object exposing `authorize(http)`
        :returns: tuple of (drive service object, authorized http object)
    """
    authorized_http = creds.authorize(httplib2.Http())
    drive_service = build('drive', 'v2', http=authorized_http)
    return drive_service, authorized_http
def check_and_refresh(creds, auth):
    """ Check a Credentials object's expiration token
        if it is out of date, refresh the token and save

        :creds: a Credentials object
        :auth: a DriveAuth that backs the cred object
        :returns: updated creds and auth objects, as a `(creds, auth)` tuple
    """
    expiry = creds.token_expiry
    # A missing expiry is treated as "expired": that is how Python 2 ordered
    # `None < datetime`, and spelling it out avoids a TypeError elsewhere.
    if expiry is None or expiry < datetime.datetime.utcnow():
        # if we are past the token expiry,
        # refresh the creds and store them
        http = creds.authorize(httplib2.Http())
        creds.refresh(http)
        auth.credentials = creds.to_json()
        # presumably a Django model save, persisting the refreshed token
        auth.save()
    # Callers unpack two values, so always hand both objects back.
    return creds, auth
def download_from_gdrive(file_dict, http, extension=None, mimetype=None):
    """ get urls from file_dict and download contextual files from google

        :file_dict: drive file resource holding an `exportLinks` mapping
        :http: authorized http object; `request(url, "GET")` must return a
               `(response, content)` pair where response has a `status`
        :extension: optional file extension, with or without the leading dot
        :mimetype: optional mimetype of the uploaded document
        :returns: dict with the 'text' export plus 'pdf' for presentations or
                  'html' for everything else; failed downloads are left out
    """
    export_links = file_dict[u'exportLinks']
    download_urls = {'text': export_links['text/plain']}

    if extension:
        extension = extension.lower()
        # Accept 'pptx' as well as '.pptx': the old check listed 'pptx'
        # without the dot, so real '.pptx' extensions never matched.
        if not extension.startswith('.'):
            extension = '.' + extension

    if extension in ('.ppt', '.pptx') or mimetype in PPT_MIMETYPES:
        # presentations are fetched as pdf instead of html
        download_urls['pdf'] = export_links['application/pdf']
    else:
        download_urls['html'] = export_links['text/html']

    content_dict = {}
    for download_type, download_url in download_urls.items():
        print("\n%s -- %s" % (download_type, download_url))
        resp, content = http.request(download_url, "GET")

        if resp.status == 200:
            print("\t downloaded!")
            # keep the payload under the same key as its download type
            content_dict[download_type] = content
        else:
            print("\t Download failed: %s" % resp.status)

    return content_dict
def upload_to_gdrive(service, media, filename, extension=None, mimetype=None,
                     poll_attempts=6, poll_delay=5):
    """ take a gdrive service object, and a media wrapper and upload to gdrive

        You must provide an `extension` or `mimetype`

        :service: drive service object exposing `files().insert/get`
        :media: media upload wrapper passed through as `media_body`
        :filename: title given to the uploaded file
        :extension: optional file extension such as '.pdf'
        :mimetype: optional mimetype of the upload
        :poll_attempts: how many times to re-fetch the file while it has no
                        `exportLinks`
        :poll_delay: seconds to wait between re-fetches (none before the first)
        :returns: the drive file resource dict
    """
    import time  # local import: only needed while waiting for exportLinks

    _resource = {'title': filename}

    # clean up extensions for type checking
    if extension:
        extension = extension.lower()

    insert_kwargs = {'body': _resource, 'media_body': media, 'convert': True}
    if extension in ('.pdf', '.jpeg', '.jpg', '.png') \
            or mimetype in ('application/pdf',):
        # include OCR on ocr-able files
        insert_kwargs['ocr'] = True

    file_dict = service.files().insert(**insert_kwargs).execute()

    # The insert response may come back without exportLinks (presumably the
    # conversion has not finished yet), so get the doc from gdrive again.
    for attempt in range(poll_attempts):
        if u'exportLinks' in file_dict:
            break
        if attempt:
            time.sleep(poll_delay)
        file_dict = service.files().get(fileId=file_dict[u'id']).execute()

    return file_dict
def convert_with_google_drive(note):
    """ Upload a local note and download HTML

        The note's file is uploaded to Google Drive, the converted text plus
        html (or pdf, for presentations) is downloaded, and the result is
        stored on a fresh copy of the Note.

        :note: model instance with a `note_file` on local disk and an `id`
               that resolves to a Note  # FIXME: originally described as a
               File model instance
        :returns: None; the refreshed Note is saved as a side effect

        NOTE(review): the text under review had lost its indentation and some
        structural lines (`else:`, `try:`/`except`, the final save). They are
        restored here from the surrounding comments -- in particular the
        `else:` that keeps html/gdrive_url away from pdf and ppt notes should
        be confirmed.
    """
    # TODO: set the permission of the file to permissive so we can use the
    #       gdrive_url to serve files directly to users
    note_path = note.note_file.path

    # Get file_type and encoding of uploaded file
    # i.e: file_type = 'text/plain', encoding = None
    file_type, _encoding = mimetypes.guess_type(note_path)

    # Only pass a mimetype when one could be guessed.
    upload_kwargs = {'chunksize': 1024 * 1024, 'resumable': True}
    if file_type is not None:
        upload_kwargs['mimetype'] = file_type
    media = MediaFileUpload(note_path, **upload_kwargs)

    auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
    creds = auth.transform_to_cred()
    creds, auth = check_and_refresh(creds, auth)
    service, http = build_api_service(creds)

    # get the file extension
    filename, extension = os.path.splitext(note_path)

    file_dict = upload_to_gdrive(service, media, filename, extension)
    content_dict = download_from_gdrive(file_dict, http, extension)

    # Get a new copy of the file from the database with the new metadata from filemeta
    new_note = Note.objects.get(id=note.id)

    extension = extension.lower()
    if extension == '.pdf':
        new_note.file_type = 'pdf'

    elif extension in ('.ppt', '.pptx'):
        new_note.file_type = 'ppt'
        # The pdf rendering is written next to the uploaded file. (The unused
        # '/ppt_pdf/<year>/<month>/' path that used to be computed here never
        # took part in the result and was dropped.)
        _path = filename + '.pdf'
        target_dir = os.path.realpath(os.path.dirname(_path))
        # Only create the folder when it is missing: makedirs raises on an
        # existing directory, which made the failure message print every time.
        if not os.path.isdir(target_dir):
            try:
                os.makedirs(target_dir)
            except OSError:
                print("we failed to create those directories")

        # `with` guarantees the buffered data is flushed and the file closed.
        with BufferedWriter(FileIO(_path, "w")) as _writer:
            _writer.write(content_dict['pdf'])

        new_note.pdf_file = _path

    else:
        # PPT files do not have this export ability
        new_note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
        new_note.html = content_dict['html']

    new_note.text = content_dict['text']

    # before we save new html, sanitize a tags in note.html
    #new_note.sanitize_html(save=False)
    #FIXME: ^^^ disabled until we can get html out of an Etree html element

    # Finally, save whatever data we got back from google
    new_note.save()
222 def convert_raw_document(raw_document):
223 """ Upload a raw document to google drive and get a Note back """
224 fp_file = raw_document.get_file()
226 # download the file to memory
227 # get the file's mimetype
228 #file_type, _ = mimetypes.guess_type(raw_document.fp_file.path)
229 # get the file extension
230 #filename, extension = os.path.splitext(raw_document.fp_file.path)
231 filename = raw_document.name
232 print "this is the mimetype of the document to check:"
233 print raw_document.mimetype
236 if raw_document.mimetype == None:
237 media = MediaInMemoryUpload(fp_file.read(),
238 chunksize=1024*1024, resumable=True)
240 media = MediaInMemoryUpload(fp_file.read(), mimetype=raw_document.mimetype,
241 chunksize=1024*1024, resumable=True)
243 auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
244 creds = auth.transform_to_cred()
246 creds, auth = check_and_refresh(creds, auth)
247 service, http = build_api_service(creds)
250 file_dict = upload_to_gdrive(service, media, filename, mimetype=raw_document.mimetype)
251 content_dict = download_from_gdrive(file_dict, http, mimetype=raw_document.mimetype)
253 # this should have already happened, lets see why it hasn't
254 raw_document.is_processed = True
257 note = raw_document.convert_to_note()
259 if raw_document.mimetype == 'application/pdf':
260 note.file_type = 'pdf'
262 elif raw_document.mimetype in PPT_MIMETYPES:
263 note.file_type = 'ppt'
264 now = datetime.datetime.utcnow()
265 # create a folder path to store the ppt > pdf file with year and month folders
266 nonce_path = '/ppt_pdf/%s/%s/' % (now.year, now.month)
268 _path = filename + '.pdf'
270 # If those folders don't exist, create them
271 os.makedirs(os.path.realpath(os.path.dirname(_path)))
273 print "we failed to create those directories"
275 _writer = BufferedWriter(FileIO(_path, "w"))
276 _writer.write(content_dict['pdf'])
279 note.pdf_file = _path
282 # PPT files do not have this export ability
283 note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
284 note.html = content_dict['html']
286 note.text = content_dict['text']
288 # before we save new html, sanitize a tags in note.html
289 #note.sanitize_html(save=False)
290 #FIXME: ^^^ disabled until we can get html out of an Etree html element
292 # Finally, save whatever data we got back from google