karmaworld/apps/notes/gdrive.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4
   5 import datetime
   6 from io import FileIO, BufferedWriter
   7 import mimetypes
   8 import os
   9 import time
  10
  11 import httplib2
  12 from apiclient.discovery import build
  13 from apiclient.http import MediaFileUpload
  14 from apiclient.http import MediaInMemoryUpload
  15 from django.conf import settings
  16 from django.core.files import File
  17 from oauth2client.client import flow_from_clientsecrets
  18
  19 from karmaworld.apps.notes.models import DriveAuth, Note
  20
  21 CLIENT_SECRET = os.path.join(settings.DJANGO_ROOT, \
  22                     'secret/client_secrets.json')
  23 #from credentials import GOOGLE_USER # FIXME
  24 try:
  25     from secret.drive import GOOGLE_USER
  26 except:
  27     GOOGLE_USER = 'admin@karmanotes.org' # FIXME
  28
  29 EXT_TO_MIME = {'.docx': 'application/msword'}
  30
  31 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  32
  33 def build_flow():
  34     """ Create an oauth2 autentication object with our preferred details """
  35     scopes = [
  36         'https://www.googleapis.com/auth/drive',
  37         'https://www.googleapis.com/auth/drive.file',
  38         'https://www.googleapis.com/auth/userinfo.email',
  39         'https://www.googleapis.com/auth/userinfo.profile',
  40     ]
  41
  42     flow = flow_from_clientsecrets(CLIENT_SECRET, ' '.join(scopes), \
  43             redirect_uri='http://localhost:8000/oauth2callback')
  44     flow.params['access_type'] = 'offline'
  45     flow.params['approval_prompt'] = 'force'
  46     flow.params['user_id'] = GOOGLE_USER
  47     return flow
  48
  49
  50 def authorize():
  51     """ Use an oauth2client flow object to generate the web url to create a new
  52         auth that can be then stored """
  53     flow = build_flow()
  54     print flow.step1_get_authorize_url()
  55
  56
  57 def accept_auth(code):
  58     """ Callback endpoint for accepting the post `authorize()` google drive
  59         response, and generate a credentials object
  60         :code:  An authentication token from a WEB oauth dialog
  61         returns a oauth2client credentials object """
  62     flow = build_flow()
  63     creds = flow.step2_exchange(code)
  64     return creds
  65
  66
  67 def build_api_service(creds):
  68     http = httplib2.Http()
  69     http = creds.authorize(http)
  70     return build('drive', 'v2', http=http), http
  71
  72
  73 def check_and_refresh(creds, auth):
  74     """ Check a Credentials object's expiration token
  75         if it is out of date, refresh the token and save
  76         :creds: a Credentials object
  77         :auth:  a DriveAuth that backs the cred object
  78         :returns: updated creds and auth objects
  79     """
  80     if creds.token_expiry < datetime.datetime.utcnow():
  81         # if we are passed the token expiry,
  82         # refresh the creds and store them
  83         http = httplib2.Http()
  84         http = creds.authorize(http)
  85         creds.refresh(http)
  86         auth.credentials = creds.to_json()
  87         auth.save()
  88     return creds, auth
  89
  90 def download_from_gdrive(file_dict, http, extension=None, mimetype=None):
  91     """ get urls from file_dict and download contextual files from google """
  92     download_urls = {}
  93     download_urls['text'] = file_dict[u'exportLinks']['text/plain']
  94
  95     if extension:
  96         extension = extension.lower()
  97
  98     if extension in ['.ppt', 'pptx'] \
  99         or mimetype in PPT_MIMETYPES:
 100         download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
 101     else:
 102         download_urls['html'] = file_dict[u'exportLinks']['text/html']
 103
 104
 105     content_dict = {}
 106     for download_type, download_url in download_urls.items():
 107         print "\n%s -- %s" % (download_type, download_urls)
 108         resp, content = http.request(download_url, "GET")
 109
 110         if resp.status in [200]:
 111             print "\t downloaded!"
 112             # save to the File.property resulting field
 113             content_dict[download_type] = content
 114         else:
 115             print "\t Download failed: %s" % resp.status
 116
 117     return content_dict
 118
 119 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
 120     """ take a gdrive service object, and a media wrapper and upload to gdrive
 121         returns a file_dict
 122         You must provide an `extension` or `mimetype`
 123     """
 124     _resource = {'title': filename}
 125
 126     # clean up extensions for type checking
 127     if extension:
 128         extension = extension.lower()
 129
 130     if extension in ['.pdf', '.jpeg', '.jpg', '.png'] \
 131         or mimetype in ['application/pdf']:
 132         # include OCR on ocr-able files
 133         file_dict = service.files().insert(body=_resource, media_body=media, convert=True, ocr=True).execute()
 134
 135     else:
 136         file_dict = service.files().insert(body=_resource, media_body=media, convert=True).execute()
 137
 138     if u'exportLinks' not in file_dict:
 139         # wait some seconds
 140         # get the doc from gdrive
 141         time.sleep(30)
 142         file_dict = service.files().get(fileId=file_dict[u'id']).execute()
 143
 144     return file_dict
 145
 146 def convert_with_google_drive(note):
 147     """ Upload a local note and download HTML
 148         using Google Drive
 149         :note: a File model instance # FIXME
 150     """
 151     # TODO: set the permission of the file to permissive so we can use the
 152     #       gdrive_url to serve files directly to users
 153
 154     # Get file_type and encoding of uploaded file
 155     # i.e: file_type = 'text/plain', encoding = None
 156     (file_type, encoding) = mimetypes.guess_type(note.note_file.path)
 157
 158
 159
 160     if file_type != None:
 161         media = MediaFileUpload(note.note_file.path, mimetype=file_type,
 162                     chunksize=1024*1024, resumable=True)
 163
 164     else:
 165         media = MediaFileUpload(note.note_file.path,
 166                     chunksize=1024*1024, resumable=True)
 167
 168     auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
 169     creds = auth.transform_to_cred()
 170
 171
 172     creds, auth = check_and_refresh(creds, auth)
 173
 174     service, http = build_api_service(creds)
 175
 176     # get the file extension
 177     filename, extension = os.path.splitext(note.note_file.path)
 178
 179     file_dict = upload_to_gdrive(service, media, filename, extension)
 180
 181     content_dict = download_from_gdrive(file_dict, http, extension)
 182
 183     # Get a new copy of the file from the database with the new metadata from filemeta
 184     new_note = Note.objects.get(id=note.id)
 185
 186     if extension.lower() == '.pdf':
 187         new_note.file_type = 'pdf'
 188
 189     elif extension.lower() in ['.ppt', '.pptx']:
 190         new_note.file_type = 'ppt'
 191         now = datetime.datetime.utcnow()
 192         # create a folder path to store the ppt > pdf file with year and month folders
 193         nonce_path = '/ppt_pdf/%s/%s/' % (now.year, now.month)
 194
 195         _path = filename + '.pdf'
 196         try:
 197             # If those folders don't exist, create them
 198             os.makedirs(os.path.realpath(os.path.dirname(_path)))
 199         except:
 200             print "we failed to create those directories"
 201
 202         _writer = BufferedWriter(FileIO(_path, "w"))
 203         _writer.write(content_dict['pdf'])
 204         _writer.close()
 205
 206         new_note.pdf_file = _path
 207
 208     else:
 209         # PPT files do not have this export ability
 210         new_note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
 211         new_note.html = content_dict['html']
 212
 213     new_note.text = content_dict['text']
 214
 215     # before we save new html, sanitize a tags in note.html
 216     #new_note.sanitize_html(save=False)
 217     #FIXME: ^^^ disabled until we can get html out of an Etree html element
 218
 219     # Finally, save whatever data we got back from google
 220     new_note.save()
 221
 222 def convert_raw_document(raw_document):
 223     """ Upload a raw document to google drive and get a Note back """
 224     fp_file = raw_document.get_file()
 225
 226     # download the file to memory
 227     # get the file's mimetype
 228     #file_type, _ = mimetypes.guess_type(raw_document.fp_file.path)
 229     # get the file extension
 230     #filename, extension = os.path.splitext(raw_document.fp_file.path)
 231     filename = raw_document.name
 232     print "this is the mimetype of the document to check:"
 233     print raw_document.mimetype
 234     print ""
 235
 236     if raw_document.mimetype == None:
 237         media = MediaInMemoryUpload(fp_file.read(),
 238                     chunksize=1024*1024, resumable=True)
 239     else:
 240         media = MediaInMemoryUpload(fp_file.read(), mimetype=raw_document.mimetype,
 241                     chunksize=1024*1024, resumable=True)
 242
 243     auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
 244     creds = auth.transform_to_cred()
 245
 246     creds, auth = check_and_refresh(creds, auth)
 247     service, http = build_api_service(creds)
 248
 249     # prepare the upload
 250     file_dict = upload_to_gdrive(service, media, filename, mimetype=raw_document.mimetype)
 251     content_dict = download_from_gdrive(file_dict, http, mimetype=raw_document.mimetype)
 252
 253     # this should have already happened, lets see why it hasn't
 254     raw_document.is_processed = True
 255     raw_document.save()
 256
 257     note = raw_document.convert_to_note()
 258
 259     if raw_document.mimetype == 'application/pdf':
 260         note.file_type = 'pdf'
 261
 262     elif raw_document.mimetype in PPT_MIMETYPES:
 263         note.file_type = 'ppt'
 264         now = datetime.datetime.utcnow()
 265         # create a folder path to store the ppt > pdf file with year and month folders
 266         nonce_path = '/ppt_pdf/%s/%s/' % (now.year, now.month)
 267
 268         _path = filename + '.pdf'
 269         try:
 270             # If those folders don't exist, create them
 271             os.makedirs(os.path.realpath(os.path.dirname(_path)))
 272         except:
 273             print "we failed to create those directories"
 274
 275         _writer = BufferedWriter(FileIO(_path, "w"))
 276         _writer.write(content_dict['pdf'])
 277         _writer.close()
 278
 279         note.pdf_file = _path
 280
 281     else:
 282         # PPT files do not have this export ability
 283         note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
 284         note.html = content_dict['html']
 285
 286     note.text = content_dict['text']
 287
 288     # before we save new html, sanitize a tags in note.html
 289     #note.sanitize_html(save=False)
 290     #FIXME: ^^^ disabled until we can get html out of an Etree html element
 291
 292     # Finally, save whatever data we got back from google
 293     note.save()
 294