karmaworld/apps/notes/gdrive.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4 import base64
   5
   6 import datetime
   7 import logging
   8 from django.contrib.auth.models import User
   9 from django.conf import settings
  10 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
  11 from karmaworld.apps.notes.models import UserUploadMapping
  12 from karmaworld.apps.notes.models import NoteMarkdown
  13 from karmaworld.apps.notes import sanitizer
  14 from karmaworld.apps.quizzes.models import Keyword
  15 from karmaworld.apps.users.models import NoteKarmaEvent
  16 import os
  17 import subprocess
  18 import tempfile
  19 import uuid
  20 import magic
  21 import re
  22 import json
  23 import time
  24
  25 import httplib2
  26 import html2text
  27 from apiclient.discovery import build
  28 from apiclient.http import MediaInMemoryUpload
  29 from oauth2client.client import SignedJwtAssertionCredentials
  30
  31 logger = logging.getLogger(__name__)
  32
  33 PDF_MIMETYPE = 'application/pdf'
  34 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  35
  36 GOOGLE_CLIENT_SECRETS = os.environ['GOOGLE_CLIENT_SECRETS']
  37 GOOGLE_SERVICE_KEY_BASE64 = os.environ['GOOGLE_SERVICE_KEY_BASE64']
  38 GOOGLE_USER = os.environ['GOOGLE_USER']
  39
  40 def build_api_service():
  41     """
  42     Build and returns a Drive service object authorized with the service
  43     accounts that act on behalf of the given user.
  44
  45     Will target the Google Drive of GOOGLE_USER email address.
  46     Returns a Google Drive service object.
  47
  48     Code herein adapted from:
  49     https://developers.google.com/drive/delegation
  50     """
  51
  52     # Extract the service address from the client secret
  53     service_user = json.loads(GOOGLE_CLIENT_SECRETS)['web']['client_email']
  54
  55     # Pull in the service's p12 private key.
  56     p12 = base64.decodestring(GOOGLE_SERVICE_KEY_BASE64)
  57     credentials = SignedJwtAssertionCredentials(service_user, p12,
  58                                scope='https://www.googleapis.com/auth/drive',
  59                                sub=GOOGLE_USER)
  60
  61     return build('drive', 'v2', http=credentials.authorize(httplib2.Http()))
  62
  63
  64 def pdf2html(content):
  65     pdf_file = tempfile.NamedTemporaryFile()
  66     pdf_file.write(content)
  67     pdf_file.flush()
  68     tmp_dir = tempfile.gettempdir()
  69     html_file_name = uuid.uuid4().hex
  70     html_file_path = os.path.join(tmp_dir, html_file_name)
  71
  72     command = ['pdf2htmlEX', pdf_file.name, html_file_name]
  73     devnull = open('/dev/null', 'w')
  74     if settings.TESTING:
  75         call = subprocess.Popen(command, shell=False, cwd=tmp_dir, stdout=devnull, stderr=devnull)
  76     else:
  77         call = subprocess.Popen(command, shell=False, cwd=tmp_dir)
  78     call.wait()
  79     devnull.close()
  80     if call.returncode != 0:
  81         raise ValueError("PDF file could not be processed")
  82
  83     pdf_file.close()
  84
  85     try:
  86         html_file = open(html_file_path, 'r')
  87         html = html_file.read()
  88         html_file.close()
  89         os.remove(html_file_path)
  90     except IOError, e:
  91         raise ValueError("PDF file could not be processed")
  92
  93     if len(html) == 0:
  94         raise ValueError("PDF file results in empty HTML file")
  95
  96     return html
  97
  98
  99 def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
 100     """ Take in a gdrive service, file_dict from upload, and either an
 101         extension or mimetype.
 102         You must provide an `extension` or `mimetype`
 103         Returns contextual files from google
 104     """
 105     download_urls = {}
 106     download_urls['text'] = file_dict[u'exportLinks']['text/plain']
 107
 108     if extension:
 109         extension = extension.lower()
 110
 111     if extension in ['.ppt', 'pptx'] \
 112         or mimetype in PPT_MIMETYPES:
 113         download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
 114     elif mimetype == PDF_MIMETYPE:
 115         pass
 116     else:
 117         download_urls['html'] = file_dict[u'exportLinks']['text/html']
 118
 119     content_dict = {}
 120     for download_type, download_url in download_urls.items():
 121         print "\n%s -- %s" % (download_type, download_url)
 122         resp, content = service._http.request(download_url)
 123
 124         if resp.status in [200]:
 125             print "\t downloaded!"
 126             # save to the File.property resulting field
 127             content_dict[download_type] = content
 128         else:
 129             print "\t Download failed: %s" % resp.status
 130
 131     return content_dict
 132
 133
 134 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
 135     """ take a gdrive service object, and a media wrapper and upload to gdrive
 136         returns a file_dict
 137         You must provide an `extension` or `mimetype`
 138     """
 139     _resource = {'title': filename}
 140
 141     # clean up extensions for type checking
 142     if extension:
 143         extension = extension.lower()
 144
 145     # perform OCR on files that are image intensive
 146     ocr = extension in ['.pdf', '.jpeg', '.jpg', '.png'] or \
 147           mimetype in ['application/pdf']
 148
 149     file_dict = service.files().insert(body=_resource, media_body=media,\
 150                                        convert=True, ocr=ocr).execute()
 151
 152     # increase exponent of 2 for exponential growth.
 153     # 2 ** -1 = 0.5, 2 ** 0 = 1, 2 ** 1 = 2, 4, 8, 16, ...
 154     delay_exp = -1
 155     # exponentially wait for exportLinks to be returned if missing
 156     while u'exportLinks' not in file_dict or \
 157           u'text/plain' not in file_dict[u'exportLinks']:
 158         # if a bunch  seconds have passed, give up
 159         if delay_exp == 7:
 160             raise ValueError('Google Drive failed to read the document.')
 161
 162         # wait some seconds
 163         print "upload_check_sleep({0})".format(2. ** delay_exp)
 164         time.sleep(2. ** delay_exp)
 165         delay_exp = delay_exp + 1
 166
 167         # try to get the doc from gdrive
 168         file_dict = service.files().get(fileId=file_dict[u'id']).execute()
 169
 170     return file_dict
 171
 172
 173 def convert_raw_document(raw_document, user=None):
 174     """ Upload a raw document to google drive and get a Note back"""
 175     fp_file = raw_document.get_file()
 176
 177     # extract some properties from the document metadata
 178     filename = raw_document.name
 179     print "this is the mimetype of the document to check:"
 180     mimetype = raw_document.mimetype
 181     print mimetype
 182     print ""
 183
 184     # A special case for Evernotes
 185     if raw_document.mimetype == 'text/enml':
 186         raw_document.mimetype = 'text/html'
 187
 188     original_content = fp_file.read()
 189
 190     # Include mimetype parameter if there is one to include
 191     extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
 192                   else {}
 193     media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
 194                                 resumable=True, **extra_flags)
 195
 196
 197     service = build_api_service()
 198
 199     # upload to google drive
 200     file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
 201
 202     # download from google drive
 203     content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
 204
 205     # this should have already happened, lets see why it hasn't
 206     raw_document.is_processed = True
 207     raw_document.save()
 208
 209     note = raw_document.convert_to_note()
 210
 211     # Cache the uploaded file's URL
 212     note.gdrive_url = file_dict['alternateLink']
 213     note.text = content_dict['text']
 214
 215     # Extract HTML from the appropriate place
 216     html = ''
 217     convert_to_markdown = False
 218     if raw_document.mimetype == PDF_MIMETYPE:
 219         html = pdf2html(original_content)
 220     elif raw_document.mimetype in PPT_MIMETYPES:
 221         html = pdf2html(content_dict['pdf'])
 222     elif 'html' in content_dict and content_dict['html']:
 223         html = content_dict['html']
 224
 225     if html:
 226         html = sanitizer.data_uris_to_s3(html)
 227         NoteMarkdown.objects.create(note=note, html=html)
 228
 229     # If we know the user who uploaded this,
 230     # associate them with the note
 231     if user and not user.is_anonymous():
 232         note.user = user
 233         NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
 234     else:
 235         try:
 236             mapping = UserUploadMapping.objects.get(fp_file=raw_document.fp_file)
 237             note.user = mapping.user
 238             note.save()
 239             NoteKarmaEvent.create_event(mapping.user, note, NoteKarmaEvent.UPLOAD)
 240         except (ObjectDoesNotExist, MultipleObjectsReturned):
 241             logger.info("Zero or multiple mappings found with fp_file " + raw_document.fp_file.name)
 242
 243     # Finally, save whatever data we got back from google
 244     note.save()
 245
 246
 247
 248
 249