karmaworld/apps/notes/gdrive.py

   1 #!/usr/bin/env python
   2 # -*- coding:utf8 -*-
   3 # Copyright (C) 2012  FinalsClub Foundation
   4 import base64
   5
   6 import datetime
   7 import logging
   8 from django.contrib.auth.models import User
   9 from django.conf import settings
  10 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
  11 from karmaworld.apps.notes.models import UserUploadMapping
  12 from karmaworld.apps.notes.models import NoteMarkdown
  13 from karmaworld.apps.quizzes.models import Keyword
  14 from karmaworld.apps.users.models import NoteKarmaEvent
  15 import os
  16 import subprocess
  17 import tempfile
  18 import uuid
  19 import magic
  20 import re
  21 import json
  22 import time
  23
  24 import httplib2
  25 import html2text
  26 from apiclient.discovery import build
  27 from apiclient.http import MediaInMemoryUpload
  28 from oauth2client.client import SignedJwtAssertionCredentials
  29
  30 logger = logging.getLogger(__name__)
  31
# Mimetype of PDF uploads; compared against document mimetypes below.
PDF_MIMETYPE = 'application/pdf'
# Mimetypes of PowerPoint uploads (binary .ppt and OOXML .pptx).
PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']

# Google credentials are read from the environment at import time, so a
# missing variable raises KeyError as soon as this module is imported.
# GOOGLE_CLIENT_SECRETS: JSON text; build_api_service() reads
#   ['web']['client_email'] from it.
# GOOGLE_SERVICE_KEY_BASE64: base64-encoded p12 private key of the service.
# GOOGLE_USER: email address whose Google Drive is targeted.
GOOGLE_CLIENT_SECRETS = os.environ['GOOGLE_CLIENT_SECRETS']
GOOGLE_SERVICE_KEY_BASE64 = os.environ['GOOGLE_SERVICE_KEY_BASE64']
GOOGLE_USER = os.environ['GOOGLE_USER']
  38
  39 def build_api_service():
  40     """
  41     Build and returns a Drive service object authorized with the service
  42     accounts that act on behalf of the given user.
  43
  44     Will target the Google Drive of GOOGLE_USER email address.
  45     Returns a Google Drive service object.
  46
  47     Code herein adapted from:
  48     https://developers.google.com/drive/delegation
  49     """
  50
  51     # Extract the service address from the client secret
  52     service_user = json.loads(GOOGLE_CLIENT_SECRETS)['web']['client_email']
  53
  54     # Pull in the service's p12 private key.
  55     p12 = base64.decodestring(GOOGLE_SERVICE_KEY_BASE64)
  56     credentials = SignedJwtAssertionCredentials(service_user, p12,
  57                                scope='https://www.googleapis.com/auth/drive',
  58                                sub=GOOGLE_USER)
  59
  60     return build('drive', 'v2', http=credentials.authorize(httplib2.Http()))
  61
  62
  63 def pdf2html(content):
  64     pdf_file = tempfile.NamedTemporaryFile()
  65     pdf_file.write(content)
  66     pdf_file.flush()
  67     tmp_dir = tempfile.gettempdir()
  68     html_file_name = uuid.uuid4().hex
  69     html_file_path = os.path.join(tmp_dir, html_file_name)
  70
  71     command = ['pdf2htmlEX', pdf_file.name, html_file_name]
  72     devnull = open('/dev/null', 'w')
  73     if settings.TESTING:
  74         call = subprocess.Popen(command, shell=False, cwd=tmp_dir, stdout=devnull, stderr=devnull)
  75     else:
  76         call = subprocess.Popen(command, shell=False, cwd=tmp_dir)
  77     call.wait()
  78     devnull.close()
  79     if call.returncode != 0:
  80         raise ValueError("PDF file could not be processed")
  81
  82     pdf_file.close()
  83
  84     try:
  85         html_file = open(html_file_path, 'r')
  86         html = html_file.read()
  87         html_file.close()
  88         os.remove(html_file_path)
  89     except IOError, e:
  90         raise ValueError("PDF file could not be processed")
  91
  92     if len(html) == 0:
  93         raise ValueError("PDF file results in empty HTML file")
  94
  95     return html
  96
  97
  98 def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
  99     """ Take in a gdrive service, file_dict from upload, and either an
 100         extension or mimetype.
 101         You must provide an `extension` or `mimetype`
 102         Returns contextual files from google
 103     """
 104     download_urls = {}
 105     download_urls['text'] = file_dict[u'exportLinks']['text/plain']
 106
 107     if extension:
 108         extension = extension.lower()
 109
 110     if extension in ['.ppt', 'pptx'] \
 111         or mimetype in PPT_MIMETYPES:
 112         download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
 113     elif mimetype == PDF_MIMETYPE:
 114         pass
 115     else:
 116         download_urls['html'] = file_dict[u'exportLinks']['text/html']
 117
 118     content_dict = {}
 119     for download_type, download_url in download_urls.items():
 120         print "\n%s -- %s" % (download_type, download_url)
 121         resp, content = service._http.request(download_url)
 122
 123         if resp.status in [200]:
 124             print "\t downloaded!"
 125             # save to the File.property resulting field
 126             content_dict[download_type] = content
 127         else:
 128             print "\t Download failed: %s" % resp.status
 129
 130     return content_dict
 131
 132
 133 def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
 134     """ take a gdrive service object, and a media wrapper and upload to gdrive
 135         returns a file_dict
 136         You must provide an `extension` or `mimetype`
 137     """
 138     _resource = {'title': filename}
 139
 140     # clean up extensions for type checking
 141     if extension:
 142         extension = extension.lower()
 143
 144     # perform OCR on files that are image intensive
 145     ocr = extension in ['.pdf', '.jpeg', '.jpg', '.png'] or \
 146           mimetype in ['application/pdf']
 147
 148     file_dict = service.files().insert(body=_resource, media_body=media,\
 149                                        convert=True, ocr=ocr).execute()
 150
 151     # increase exponent of 2 for exponential growth.
 152     # 2 ** -1 = 0.5, 2 ** 0 = 1, 2 ** 1 = 2, 4, 8, 16, ...
 153     delay_exp = -1
 154     # exponentially wait for exportLinks to be returned if missing
 155     while u'exportLinks' not in file_dict or \
 156           u'text/plain' not in file_dict[u'exportLinks']:
 157         # if a bunch  seconds have passed, give up
 158         if delay_exp == 7:
 159             raise ValueError('Google Drive failed to read the document.')
 160
 161         # wait some seconds
 162         print "upload_check_sleep({0})".format(2. ** delay_exp)
 163         time.sleep(2. ** delay_exp)
 164         delay_exp = delay_exp + 1
 165
 166         # try to get the doc from gdrive
 167         file_dict = service.files().get(fileId=file_dict[u'id']).execute()
 168
 169     return file_dict
 170
 171
 172 def convert_raw_document(raw_document, user=None):
 173     """ Upload a raw document to google drive and get a Note back"""
 174     fp_file = raw_document.get_file()
 175
 176     # extract some properties from the document metadata
 177     filename = raw_document.name
 178     print "this is the mimetype of the document to check:"
 179     mimetype = raw_document.mimetype
 180     print mimetype
 181     print ""
 182
 183     # A special case for Evernotes
 184     if raw_document.mimetype == 'text/enml':
 185         raw_document.mimetype = 'text/html'
 186
 187     original_content = fp_file.read()
 188
 189     # Include mimetype parameter if there is one to include
 190     extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
 191                   else {}
 192     media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
 193                                 resumable=True, **extra_flags)
 194
 195
 196     service = build_api_service()
 197
 198     # upload to google drive
 199     file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
 200
 201     # download from google drive
 202     content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
 203
 204     # this should have already happened, lets see why it hasn't
 205     raw_document.is_processed = True
 206     raw_document.save()
 207
 208     note = raw_document.convert_to_note()
 209
 210     # Cache the uploaded file's URL
 211     note.gdrive_url = file_dict['alternateLink']
 212
 213     # Extract HTML from the appropriate place
 214     html = ''
 215     convert_to_markdown = False
 216     if raw_document.mimetype == PDF_MIMETYPE:
 217         html = pdf2html(original_content)
 218     elif raw_document.mimetype in PPT_MIMETYPES:
 219         html = pdf2html(content_dict['pdf'])
 220     elif 'html' in content_dict and content_dict['html']:
 221         html = content_dict['html']
 222         convert_to_markdown = True
 223     # cleanup the HTML
 224     html = note.filter_html(html)
 225
 226     # upload the HTML file to static host if it is not already there
 227     note.send_to_s3(html, do_save=False)
 228
 229     note.text = content_dict['text']
 230
 231     if convert_to_markdown:
 232         h = html2text.HTML2Text()
 233         h.google_doc = True
 234         h.escape_snob = True
 235         h.unicode_snob = True
 236         markdown = h.handle(html.decode('utf8', 'ignore'))
 237
 238         note_markdown = NoteMarkdown(note=note, markdown=markdown)
 239         note_markdown.save()
 240
 241     # If we know the user who uploaded this,
 242     # associate them with the note
 243     if user:
 244         note.user = user
 245         NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
 246     else:
 247         try:
 248             mapping = UserUploadMapping.objects.get(fp_file=raw_document.fp_file)
 249             note.user = mapping.user
 250             note.save()
 251             NoteKarmaEvent.create_event(mapping.user, note, NoteKarmaEvent.UPLOAD)
 252         except (ObjectDoesNotExist, MultipleObjectsReturned):
 253             logger.info("Zero or multiple mappings found with fp_file " + raw_document.fp_file.name)
 254
 255     # Finally, save whatever data we got back from google
 256     note.save()
 257
 258
 259
 260
 261