karmaworld/apps/notes/gdrive.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4
   5 import datetime
   6 import mimetypes
   7 import os
   8 import time
   9
  10 import httplib2
  11 from apiclient.discovery import build
  12 from apiclient.http import MediaFileUpload
  13 from apiclient.http import MediaInMemoryUpload
  14 from django.conf import settings
  15 from django.core.files.base import ContentFile
  16 from oauth2client.client import flow_from_clientsecrets
  17
  18 from karmaworld.apps.notes.models import DriveAuth, Note
  19
  20 CLIENT_SECRET = os.path.join(settings.DJANGO_ROOT, \
  21                     'secret/client_secrets.json')
  22 #from credentials import GOOGLE_USER # FIXME
  23 try:
  24     from secret.drive import GOOGLE_USER
  25 except:
  26     GOOGLE_USER = 'admin@karmanotes.org' # FIXME
  27
  28 EXT_TO_MIME = {'.docx': 'application/msword'}
  29
  30 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  31
  32 def build_flow():
  33     """ Create an oauth2 autentication object with our preferred details """
  34     scopes = [
  35         'https://www.googleapis.com/auth/drive',
  36         'https://www.googleapis.com/auth/drive.file',
  37         'https://www.googleapis.com/auth/userinfo.email',
  38         'https://www.googleapis.com/auth/userinfo.profile',
  39     ]
  40
  41     flow = flow_from_clientsecrets(CLIENT_SECRET, ' '.join(scopes), \
  42             redirect_uri='http://localhost:8000/oauth2callback')
  43     flow.params['access_type'] = 'offline'
  44     flow.params['approval_prompt'] = 'force'
  45     flow.params['user_id'] = GOOGLE_USER
  46     return flow
  47
  48
  49 def authorize():
  50     """ Use an oauth2client flow object to generate the web url to create a new
  51         auth that can be then stored """
  52     flow = build_flow()
  53     print flow.step1_get_authorize_url()
  54
  55
  56 def accept_auth(code):
  57     """ Callback endpoint for accepting the post `authorize()` google drive
  58         response, and generate a credentials object
  59         :code:  An authentication token from a WEB oauth dialog
  60         returns a oauth2client credentials object """
  61     flow = build_flow()
  62     creds = flow.step2_exchange(code)
  63     return creds
  64
  65
  66 def build_api_service(creds):
  67     http = httplib2.Http()
  68     http = creds.authorize(http)
  69     return build('drive', 'v2', http=http), http
  70
  71
  72 def check_and_refresh(creds, auth):
  73     """ Check a Credentials object's expiration token
  74         if it is out of date, refresh the token and save
  75         :creds: a Credentials object
  76         :auth:  a DriveAuth that backs the cred object
  77         :returns: updated creds and auth objects
  78     """
  79     if creds.token_expiry < datetime.datetime.utcnow():
  80         # if we are passed the token expiry,
  81         # refresh the creds and store them
  82         http = httplib2.Http()
  83         http = creds.authorize(http)
  84         creds.refresh(http)
  85         auth.credentials = creds.to_json()
  86         auth.save()
  87     return creds, auth
  88
  89 def download_from_gdrive(file_dict, http, extension=None, mimetype=None):
  90     """ get urls from file_dict and download contextual files from google """
  91     download_urls = {}
  92     download_urls['text'] = file_dict[u'exportLinks']['text/plain']
  93
  94     if extension:
  95         extension = extension.lower()
  96
  97     if extension in ['.ppt', 'pptx'] \
  98         or mimetype in PPT_MIMETYPES:
  99         download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
 100     else:
 101         download_urls['html'] = file_dict[u'exportLinks']['text/html']
 102
 103
 104     content_dict = {}
 105     for download_type, download_url in download_urls.items():
 106         print "\n%s -- %s" % (download_type, download_urls)
 107         resp, content = http.request(download_url, "GET")
 108
 109         if resp.status in [200]:
 110             print "\t downloaded!"
 111             # save to the File.property resulting field
 112             content_dict[download_type] = content
 113         else:
 114             print "\t Download failed: %s" % resp.status
 115
 116     return content_dict
 117
 118 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
 119     """ take a gdrive service object, and a media wrapper and upload to gdrive
 120         returns a file_dict
 121         You must provide an `extension` or `mimetype`
 122     """
 123     _resource = {'title': filename}
 124
 125     # clean up extensions for type checking
 126     if extension:
 127         extension = extension.lower()
 128
 129     if extension in ['.pdf', '.jpeg', '.jpg', '.png'] \
 130         or mimetype in ['application/pdf']:
 131         # include OCR on ocr-able files
 132         file_dict = service.files().insert(body=_resource, media_body=media, convert=True, ocr=True).execute()
 133
 134     else:
 135         file_dict = service.files().insert(body=_resource, media_body=media, convert=True).execute()
 136
 137     if u'exportLinks' not in file_dict:
 138         # wait some seconds
 139         # get the doc from gdrive
 140         time.sleep(30)
 141         file_dict = service.files().get(fileId=file_dict[u'id']).execute()
 142
 143     return file_dict
 144
 145 def convert_with_google_drive(note):
 146     """ Upload a local note and download HTML
 147         using Google Drive
 148         :note: a File model instance # FIXME
 149     """
 150     # TODO: set the permission of the file to permissive so we can use the
 151     #       gdrive_url to serve files directly to users
 152
 153     # Get file_type and encoding of uploaded file
 154     # i.e: file_type = 'text/plain', encoding = None
 155     (file_type, encoding) = mimetypes.guess_type(note.note_file.path)
 156
 157
 158
 159     if file_type != None:
 160         media = MediaFileUpload(note.note_file.path, mimetype=file_type,
 161                     chunksize=1024*1024, resumable=True)
 162
 163     else:
 164         media = MediaFileUpload(note.note_file.path,
 165                     chunksize=1024*1024, resumable=True)
 166
 167     auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
 168     creds = auth.transform_to_cred()
 169
 170
 171     creds, auth = check_and_refresh(creds, auth)
 172
 173     service, http = build_api_service(creds)
 174
 175     # get the file extension
 176     filename, extension = os.path.splitext(note.note_file.path)
 177
 178     file_dict = upload_to_gdrive(service, media, filename, extension)
 179
 180     content_dict = download_from_gdrive(file_dict, http, extension)
 181
 182     # Get a new copy of the file from the database with the new metadata from filemeta
 183     new_note = Note.objects.get(id=note.id)
 184
 185     if extension.lower() == '.pdf':
 186         new_note.file_type = 'pdf'
 187
 188     elif extension.lower() in ['.ppt', '.pptx']:
 189         new_note.file_type = 'ppt'
 190         new_note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
 191
 192     else:
 193         # PPT files do not have this export ability
 194         new_note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
 195         new_note.html = content_dict['html']
 196
 197     new_note.text = content_dict['text']
 198
 199     # before we save new html, sanitize a tags in note.html
 200     #new_note.sanitize_html(save=False)
 201     #FIXME: ^^^ disabled until we can get html out of an Etree html element
 202
 203     # Finally, save whatever data we got back from google
 204     new_note.save()
 205
 206 def convert_raw_document(raw_document):
 207     """ Upload a raw document to google drive and get a Note back """
 208     fp_file = raw_document.get_file()
 209
 210     # download the file to memory
 211     # get the file's mimetype
 212     #file_type, _ = mimetypes.guess_type(raw_document.fp_file.path)
 213     # get the file extension
 214     #filename, extension = os.path.splitext(raw_document.fp_file.path)
 215     filename = raw_document.name
 216     print "this is the mimetype of the document to check:"
 217     print raw_document.mimetype
 218     print ""
 219
 220     if raw_document.mimetype == None:
 221         media = MediaInMemoryUpload(fp_file.read(),
 222                     chunksize=1024*1024, resumable=True)
 223     else:
 224         media = MediaInMemoryUpload(fp_file.read(), mimetype=raw_document.mimetype,
 225                     chunksize=1024*1024, resumable=True)
 226
 227     auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
 228     creds = auth.transform_to_cred()
 229
 230     creds, auth = check_and_refresh(creds, auth)
 231     service, http = build_api_service(creds)
 232
 233     # prepare the upload
 234     file_dict = upload_to_gdrive(service, media, filename, mimetype=raw_document.mimetype)
 235     content_dict = download_from_gdrive(file_dict, http, mimetype=raw_document.mimetype)
 236
 237     # this should have already happened, lets see why it hasn't
 238     raw_document.is_processed = True
 239     raw_document.save()
 240
 241     note = raw_document.convert_to_note()
 242
 243     if raw_document.mimetype == 'application/pdf':
 244         note.file_type = 'pdf'
 245
 246     elif raw_document.mimetype in PPT_MIMETYPES:
 247         note.file_type = 'ppt'
 248         note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
 249
 250     else:
 251         # PPT files do not have this export ability
 252         note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
 253         note.html = content_dict['html']
 254
 255     note.text = content_dict['text']
 256
 257     # before we save new html, sanitize a tags in note.html
 258     #note.sanitize_html(save=False)
 259     #FIXME: ^^^ disabled until we can get html out of an Etree html element
 260
 261     # Finally, save whatever data we got back from google
 262     note.save()
 263