karmaworld/apps/notes/gdrive.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4 import base64
   5
   6 import datetime
   7 import logging
   8 from django.contrib.auth.models import User
   9 from django.conf import settings
  10 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
  11 from karmaworld.apps.notes.models import UserUploadMapping
  12 from karmaworld.apps.notes.models import NoteMarkdown
  13 from karmaworld.apps.notes import sanitizer
  14 from karmaworld.apps.quizzes.models import Keyword
  15 from karmaworld.apps.users.models import NoteKarmaEvent
  16 import os
  17 import subprocess
  18 import tempfile
  19 import uuid
  20 import magic
  21 import re
  22 import json
  23 import time
  24
  25 import httplib2
  26 import html2text
  27 from apiclient.discovery import build
  28 from apiclient.http import MediaInMemoryUpload
  29 from oauth2client.client import SignedJwtAssertionCredentials
  30
  31 logger = logging.getLogger(__name__)
  32
  33 PDF_MIMETYPE = 'application/pdf'
  34 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  35
  36 GOOGLE_SERVICE_EMAIL = os.environ['GOOGLE_SERVICE_EMAIL']
  37 GOOGLE_SERVICE_KEY_BASE64 = os.environ['GOOGLE_SERVICE_KEY_BASE64']
  38 GOOGLE_USER = os.environ['GOOGLE_USER']
  39
  40 def build_api_service():
  41     """
  42     Build and returns a Drive service object authorized with the service
  43     accounts that act on behalf of the given user.
  44
  45     Will target the Google Drive of GOOGLE_USER email address.
  46     Returns a Google Drive service object.
  47
  48     Code herein adapted from:
  49     https://developers.google.com/drive/delegation
  50     """
  51
  52     # Pull in the service's p12 private key.
  53     p12 = base64.decodestring(GOOGLE_SERVICE_KEY_BASE64)
  54     credentials = SignedJwtAssertionCredentials(GOOGLE_SERVICE_EMAIL, p12,
  55                                scope='https://www.googleapis.com/auth/drive',
  56                                sub=GOOGLE_USER)
  57
  58     return build('drive', 'v2', http=credentials.authorize(httplib2.Http()))
  59
  60
  61 def pdf2html(content):
  62     pdf_file = tempfile.NamedTemporaryFile()
  63     pdf_file.write(content)
  64     pdf_file.flush()
  65     tmp_dir = tempfile.gettempdir()
  66     html_file_name = uuid.uuid4().hex
  67     html_file_path = os.path.join(tmp_dir, html_file_name)
  68
  69     command = ['pdf2htmlEX', pdf_file.name, html_file_name]
  70     devnull = open('/dev/null', 'w')
  71     if settings.TESTING:
  72         call = subprocess.Popen(command, shell=False, cwd=tmp_dir, stdout=devnull, stderr=devnull)
  73     else:
  74         call = subprocess.Popen(command, shell=False, cwd=tmp_dir)
  75     call.wait()
  76     devnull.close()
  77     if call.returncode != 0:
  78         raise ValueError("PDF file could not be processed")
  79
  80     pdf_file.close()
  81
  82     try:
  83         html_file = open(html_file_path, 'r')
  84         html = html_file.read()
  85         html_file.close()
  86         os.remove(html_file_path)
  87     except IOError, e:
  88         raise ValueError("PDF file could not be processed")
  89
  90     if len(html) == 0:
  91         raise ValueError("PDF file results in empty HTML file")
  92
  93     return html
  94
  95
  96 def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
  97     """ Take in a gdrive service, file_dict from upload, and either an
  98         extension or mimetype.
  99         You must provide an `extension` or `mimetype`
 100         Returns contextual files from google
 101     """
 102     download_urls = {}
 103     download_urls['text'] = file_dict[u'exportLinks']['text/plain']
 104
 105     if extension:
 106         extension = extension.lower()
 107
 108     if extension in ['.ppt', 'pptx'] \
 109         or mimetype in PPT_MIMETYPES:
 110         download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
 111     elif mimetype == PDF_MIMETYPE:
 112         pass
 113     else:
 114         download_urls['html'] = file_dict[u'exportLinks']['text/html']
 115
 116     content_dict = {}
 117     for download_type, download_url in download_urls.items():
 118         print "\n%s -- %s" % (download_type, download_url)
 119         resp, content = service._http.request(download_url)
 120
 121         if resp.status in [200]:
 122             print "\t downloaded!"
 123             # save to the File.property resulting field
 124             content_dict[download_type] = content
 125         else:
 126             print "\t Download failed: %s" % resp.status
 127
 128     return content_dict
 129
 130
 131 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
 132     """ take a gdrive service object, and a media wrapper and upload to gdrive
 133         returns a file_dict
 134         You must provide an `extension` or `mimetype`
 135     """
 136     _resource = {'title': filename}
 137
 138     # clean up extensions for type checking
 139     if extension:
 140         extension = extension.lower()
 141
 142     # perform OCR on files that are image intensive
 143     ocr = extension in ['.pdf', '.jpeg', '.jpg', '.png'] or \
 144           mimetype in ['application/pdf']
 145
 146     file_dict = service.files().insert(body=_resource, media_body=media,\
 147                                        convert=True, ocr=ocr).execute()
 148
 149     # increase exponent of 2 for exponential growth.
 150     # 2 ** -1 = 0.5, 2 ** 0 = 1, 2 ** 1 = 2, 4, 8, 16, ...
 151     delay_exp = -1
 152     # exponentially wait for exportLinks to be returned if missing
 153     while u'exportLinks' not in file_dict or \
 154           u'text/plain' not in file_dict[u'exportLinks']:
 155         # if a bunch  seconds have passed, give up
 156         if delay_exp == 7:
 157             raise ValueError('Google Drive failed to read the document.')
 158
 159         # wait some seconds
 160         print "upload_check_sleep({0})".format(2. ** delay_exp)
 161         time.sleep(2. ** delay_exp)
 162         delay_exp = delay_exp + 1
 163
 164         # try to get the doc from gdrive
 165         file_dict = service.files().get(fileId=file_dict[u'id']).execute()
 166
 167     return file_dict
 168
 169
 170 def convert_raw_document(raw_document, user=None):
 171     """ Upload a raw document to google drive and get a Note back"""
 172     fp_file = raw_document.get_file()
 173
 174     # extract some properties from the document metadata
 175     filename = raw_document.name
 176     print "this is the mimetype of the document to check:"
 177     mimetype = raw_document.mimetype
 178     print mimetype
 179     print ""
 180
 181     # A special case for Evernotes
 182     if raw_document.mimetype == 'text/enml':
 183         raw_document.mimetype = 'text/html'
 184
 185     original_content = fp_file.read()
 186
 187     # Include mimetype parameter if there is one to include
 188     extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
 189                   else {}
 190     media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
 191                                 resumable=True, **extra_flags)
 192
 193
 194     service = build_api_service()
 195
 196     # upload to google drive
 197     file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
 198
 199     # download from google drive
 200     content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
 201
 202     # this should have already happened, lets see why it hasn't
 203     raw_document.is_processed = True
 204     raw_document.save()
 205
 206     note = raw_document.convert_to_note()
 207
 208     # Cache the uploaded file's URL
 209     note.gdrive_url = file_dict['alternateLink']
 210     note.text = content_dict['text']
 211
 212     # Extract HTML from the appropriate place
 213     html = ''
 214     if raw_document.mimetype == PDF_MIMETYPE:
 215         html = pdf2html(original_content)
 216     elif raw_document.mimetype in PPT_MIMETYPES:
 217         html = pdf2html(content_dict['pdf'])
 218     elif 'html' in content_dict and content_dict['html']:
 219         html = content_dict['html']
 220
 221     if html:
 222         html = sanitizer.data_uris_to_s3(html)
 223         NoteMarkdown.objects.create(note=note, html=html)
 224
 225     # If we know the user who uploaded this,
 226     # associate them with the note
 227     if user and not user.is_anonymous():
 228         note.user = user
 229         NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
 230     else:
 231         try:
 232             mapping = UserUploadMapping.objects.get(fp_file=raw_document.fp_file)
 233             note.user = mapping.user
 234             note.save()
 235             NoteKarmaEvent.create_event(mapping.user, note, NoteKarmaEvent.UPLOAD)
 236         except (ObjectDoesNotExist, MultipleObjectsReturned):
 237             logger.info("Zero or multiple mappings found with fp_file " + raw_document.fp_file.name)
 238
 239     # Finally, save whatever data we got back from google
 240     note.save()
 241
 242
 243
 244
 245