karmaworld/apps/notes/gdrive.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4
   5 import datetime
   6 import magic
   7 import mimetypes
   8 import os
   9 import re
  10 import time
  11
  12 import httplib2
  13 from apiclient.discovery import build
  14 from apiclient.http import MediaFileUpload
  15 from apiclient.http import MediaInMemoryUpload
  16 from django.conf import settings
  17 from django.core.files.base import ContentFile
  18 from oauth2client.client import flow_from_clientsecrets
  19
  20 from karmaworld.apps.notes.models import DriveAuth, Note
  21
  22 CLIENT_SECRET = os.path.join(settings.DJANGO_ROOT, \
  23                     'secret/client_secrets.json')
  24 #from credentials import GOOGLE_USER # FIXME
  25 try:
  26     from secrets.drive import GOOGLE_USER
  27 except:
  28     GOOGLE_USER = 'admin@karmanotes.org' # FIXME
  29
  30 EXT_TO_MIME = {'.docx': 'application/msword'}
  31
  32 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  33
  34 def extract_file_details(fileobj):
  35     details = None
  36     year = None
  37
  38     fileobj.open()
  39     filebuf = fileobj.read()
  40     with magic.Magic() as m:
  41         details = m.id_buffer(filebuf)
  42     fileobj.close()
  43
  44     result = re.search(r'Create Time/Date:[^,]+(?P<year>\d{4})', details)
  45     if result:
  46         if 'year' in result.groupdict():
  47             year = result.groupdict()['year']
  48
  49     return {'year': year}
  50
  51 def build_flow():
  52     """ Create an oauth2 autentication object with our preferred details """
  53     scopes = [
  54         'https://www.googleapis.com/auth/drive',
  55         'https://www.googleapis.com/auth/drive.file',
  56         'https://www.googleapis.com/auth/userinfo.email',
  57         'https://www.googleapis.com/auth/userinfo.profile',
  58     ]
  59
  60     flow = flow_from_clientsecrets(CLIENT_SECRET, ' '.join(scopes), \
  61             redirect_uri='http://localhost:8000/oauth2callback')
  62     flow.params['access_type'] = 'offline'
  63     flow.params['approval_prompt'] = 'force'
  64     flow.params['user_id'] = GOOGLE_USER
  65     return flow
  66
  67
  68 def authorize():
  69     """ Use an oauth2client flow object to generate the web url to create a new
  70         auth that can be then stored """
  71     flow = build_flow()
  72     print flow.step1_get_authorize_url()
  73
  74
  75 def accept_auth(code):
  76     """ Callback endpoint for accepting the post `authorize()` google drive
  77         response, and generate a credentials object
  78         :code:  An authentication token from a WEB oauth dialog
  79         returns a oauth2client credentials object """
  80     flow = build_flow()
  81     creds = flow.step2_exchange(code)
  82     return creds
  83
  84
  85 def build_api_service(creds):
  86     http = httplib2.Http()
  87     http = creds.authorize(http)
  88     return build('drive', 'v2', http=http), http
  89
  90
  91 def check_and_refresh(creds, auth):
  92     """ Check a Credentials object's expiration token
  93         if it is out of date, refresh the token and save
  94         :creds: a Credentials object
  95         :auth:  a DriveAuth that backs the cred object
  96         :returns: updated creds and auth objects
  97     """
  98     if creds.token_expiry < datetime.datetime.utcnow():
  99         # if we are passed the token expiry,
 100         # refresh the creds and store them
 101         http = httplib2.Http()
 102         http = creds.authorize(http)
 103         creds.refresh(http)
 104         auth.credentials = creds.to_json()
 105         auth.save()
 106     return creds, auth
 107
 108 def download_from_gdrive(file_dict, http, extension=None, mimetype=None):
 109     """ get urls from file_dict and download contextual files from google """
 110     download_urls = {}
 111     download_urls['text'] = file_dict[u'exportLinks']['text/plain']
 112
 113     if extension:
 114         extension = extension.lower()
 115
 116     if extension in ['.ppt', 'pptx'] \
 117         or mimetype in PPT_MIMETYPES:
 118         download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
 119     else:
 120         download_urls['html'] = file_dict[u'exportLinks']['text/html']
 121
 122
 123     content_dict = {}
 124     for download_type, download_url in download_urls.items():
 125         print "\n%s -- %s" % (download_type, download_urls)
 126         resp, content = http.request(download_url, "GET")
 127
 128         if resp.status in [200]:
 129             print "\t downloaded!"
 130             # save to the File.property resulting field
 131             content_dict[download_type] = content
 132         else:
 133             print "\t Download failed: %s" % resp.status
 134
 135     return content_dict
 136
 137 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
 138     """ take a gdrive service object, and a media wrapper and upload to gdrive
 139         returns a file_dict
 140         You must provide an `extension` or `mimetype`
 141     """
 142     _resource = {'title': filename}
 143
 144     # clean up extensions for type checking
 145     if extension:
 146         extension = extension.lower()
 147
 148     if extension in ['.pdf', '.jpeg', '.jpg', '.png'] \
 149         or mimetype in ['application/pdf']:
 150         # include OCR on ocr-able files
 151         file_dict = service.files().insert(body=_resource, media_body=media, convert=True, ocr=True).execute()
 152
 153     else:
 154         file_dict = service.files().insert(body=_resource, media_body=media, convert=True).execute()
 155
 156     if u'exportLinks' not in file_dict:
 157         # wait some seconds
 158         # get the doc from gdrive
 159         time.sleep(30)
 160         file_dict = service.files().get(fileId=file_dict[u'id']).execute()
 161
 162     return file_dict
 163
 164 def convert_with_google_drive(note):
 165     """ Upload a local note and download HTML
 166         using Google Drive
 167         :note: a File model instance # FIXME
 168     """
 169     # TODO: set the permission of the file to permissive so we can use the
 170     #       gdrive_url to serve files directly to users
 171
 172     # Get file_type and encoding of uploaded file
 173     # i.e: file_type = 'text/plain', encoding = None
 174     (file_type, encoding) = mimetypes.guess_type(note.note_file.path)
 175
 176
 177     if file_type == 'text/enml': file_type = 'text/xml'
 178
 179     if file_type != None:
 180         media = MediaFileUpload(note.note_file.path, mimetype=file_type,
 181                     chunksize=1024*1024, resumable=True)
 182
 183     else:
 184         media = MediaFileUpload(note.note_file.path,
 185                     chunksize=1024*1024, resumable=True)
 186
 187     auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
 188     creds = auth.transform_to_cred()
 189
 190
 191     creds, auth = check_and_refresh(creds, auth)
 192
 193     service, http = build_api_service(creds)
 194
 195     # get the file extension
 196     filename, extension = os.path.splitext(note.note_file.path)
 197
 198     file_dict = upload_to_gdrive(service, media, filename, extension)
 199
 200     content_dict = download_from_gdrive(file_dict, http, extension)
 201
 202     # Get a new copy of the file from the database with the new metadata from filemeta
 203     new_note = Note.objects.get(id=note.id)
 204
 205     if extension.lower() == '.pdf':
 206         new_note.file_type = 'pdf'
 207
 208     elif extension.lower() in ['.ppt', '.pptx']:
 209         new_note.file_type = 'ppt'
 210         new_note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
 211
 212     else:
 213         # PPT files do not have this export ability
 214         new_note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
 215         new_note.html = content_dict['html']
 216
 217     new_note.text = content_dict['text']
 218
 219     # before we save new html, sanitize a tags in note.html
 220     #new_note.sanitize_html(save=False)
 221     #FIXME: ^^^ disabled until we can get html out of an Etree html element
 222
 223     # Finally, save whatever data we got back from google
 224     new_note.save()
 225
 226 def convert_raw_document(raw_document):
 227     """ Upload a raw document to google drive and get a Note back """
 228     fp_file = raw_document.get_file()
 229
 230     # download the file to memory
 231     # get the file's mimetype
 232     #file_type, _ = mimetypes.guess_type(raw_document.fp_file.path)
 233     # get the file extension
 234     #filename, extension = os.path.splitext(raw_document.fp_file.path)
 235     filename = raw_document.name
 236     print "this is the mimetype of the document to check:"
 237     mimetype = raw_document.mimetype
 238     print mimetype
 239     print ""
 240
 241     if mimetype == 'text/enml': mimetype = 'text/xml'
 242
 243     if mimetype == None:
 244         media = MediaInMemoryUpload(fp_file.read(),
 245                     chunksize=1024*1024, resumable=True)
 246     else:
 247         media = MediaInMemoryUpload(fp_file.read(), mimetype=mimetype,
 248                     chunksize=1024*1024, resumable=True)
 249
 250     auth = DriveAuth.objects.filter(email=GOOGLE_USER).all()[0]
 251     creds = auth.transform_to_cred()
 252
 253     creds, auth = check_and_refresh(creds, auth)
 254     service, http = build_api_service(creds)
 255
 256     # prepare the upload
 257     file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
 258     content_dict = download_from_gdrive(file_dict, http, mimetype=mimetype)
 259
 260     # this should have already happened, lets see why it hasn't
 261     raw_document.is_processed = True
 262     raw_document.save()
 263
 264     note = raw_document.convert_to_note()
 265
 266     if raw_document.mimetype == 'application/pdf':
 267         note.file_type = 'pdf'
 268
 269     elif raw_document.mimetype in PPT_MIMETYPES:
 270         note.file_type = 'ppt'
 271         note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
 272
 273     elif 'html' in content_dict and content_dict['html']:
 274         note.html = content_dict['html']
 275         # before we save new html, sanitize a tags in note.html
 276         #note.sanitize_html(save=False)
 277         #FIXME: ^^^ disabled
 278
 279     note.text = content_dict['text']
 280
 281     note_details = extract_file_details(fp_file)
 282     if 'year' in note_details and note_details['year']:
 283         note.year = note_details['year']
 284
 285
 286     # Finally, save whatever data we got back from google
 287     note.save()
 288