Convert PDF notes to HTML with pdf2htmlEX

author Charles Connell <charles@connells.org>

Fri, 20 Dec 2013 23:58:00 +0000 (18:58 -0500)

committer Charles Connell <charles@connells.org>

Sat, 21 Dec 2013 01:47:28 +0000 (20:47 -0500)
author Charles Connell <charles@connells.org>
Fri, 20 Dec 2013 23:58:00 +0000 (18:58 -0500)
committer Charles Connell <charles@connells.org>
Sat, 21 Dec 2013 01:47:28 +0000 (20:47 -0500)
diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py

index a813ed41039e5806cbe7228e4bbcf3e86e339c02..ec450094808f89e4c3e23096956e13d8ff8b8910 100644 (file)
--- a/karmaworld/apps/notes/gdrive.py
+++ b/karmaworld/apps/notes/gdrive.py
@@ -2,6 +2,11 @@
  # -*- coding:utf8 -*-
  # Copyright (C) 2012  FinalsClub Foundation
  
+import datetime
+import os
+import subprocess
+import tempfile
+import uuid
  import magic
  import re
  import json
@@ -10,12 +15,12 @@ import time
  import httplib2
  from apiclient.discovery import build
  from apiclient.http import MediaInMemoryUpload
-from django.core.files.base import ContentFile
  from oauth2client.client import SignedJwtAssertionCredentials
  
  import karmaworld.secret.drive as drive
  
  
+PDF_MIMETYPE = 'application/pdf'
  PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  
  
@@ -63,6 +68,33 @@ def build_api_service():
  
      return build('drive', 'v2', http=credentials.authorize(httplib2.Http()))
  
+def pdf2html(content):
+    """ Run pdf2htmlEX on raw PDF content and return the resulting HTML.
+    Raises ValueError if conversion fails or the HTML comes out empty. """
+    tmp_dir = tempfile.gettempdir()
+    html_file_name = uuid.uuid4().hex
+    html_file_path = os.path.join(tmp_dir, html_file_name)
+    try:
+        with tempfile.NamedTemporaryFile() as pdf_file:
+            pdf_file.write(content)
+            pdf_file.flush()
+            # run inside tmp_dir so the relative output name lands there
+            command = ['pdf2htmlEX', pdf_file.name, html_file_name]
+            if subprocess.call(command, shell=False, cwd=tmp_dir) != 0:
+                raise ValueError("PDF file could not be processed")
+        try:
+            with open(html_file_path, 'r') as html_file:
+                html = html_file.read()
+        except IOError:
+            raise ValueError("PDF file could not be processed")
+    finally:
+        # never leave the generated file behind, even on failure
+        if os.path.exists(html_file_path):
+            os.remove(html_file_path)
+    if not html:
+        raise ValueError("PDF file results in empty HTML file")
+    return html
+
  
  def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
      """ Take in a gdrive service, file_dict from upload, and either an
@@ -79,12 +111,14 @@ def download_from_gdrive(service, file_dict, extension=None, mimetype=None):
      if extension in ['.ppt', 'pptx'] \
          or mimetype in PPT_MIMETYPES:
          download_urls['pdf'] = file_dict[u'exportLinks']['application/pdf']
+    elif mimetype == PDF_MIMETYPE:
+        pass
      else:
          download_urls['html'] = file_dict[u'exportLinks']['text/html']
  
      content_dict = {}
      for download_type, download_url in download_urls.items():
-        print "\n%s -- %s" % (download_type, download_urls)
+        print "\n%s -- %s" % (download_type, download_url)
          resp, content = service._http.request(download_url)
  
          if resp.status in [200]:
@@ -137,7 +171,7 @@ def upload_to_gdrive(service, media, filename, extension=None, mimetype=None):
  
  
  def convert_raw_document(raw_document):
-    """ Upload a raw document to google drive and get a Note back """
+    """ Upload a raw document to google drive and get a Note back"""
      fp_file = raw_document.get_file()
  
      # extract some properties from the document metadata
@@ -151,12 +185,15 @@ def convert_raw_document(raw_document):
      if raw_document.mimetype == 'text/enml':
          raw_document.mimetype = 'text/html'
  
+    original_content = fp_file.read()
+
      # Include mimetype parameter if there is one to include
      extra_flags = {'mimetype': raw_document.mimetype} if raw_document.mimetype \
                    else {}
-    media = MediaInMemoryUpload(fp_file.read(), chunksize=1024*1024, \
+    media = MediaInMemoryUpload(original_content, chunksize=1024*1024, \
                                  resumable=True, **extra_flags)
  
+
      service = build_api_service()
  
      # upload to google drive
@@ -165,18 +202,19 @@ def convert_raw_document(raw_document):
      # download from google drive
      content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)
  
+
+
      # this should have already happened, lets see why it hasn't
      raw_document.is_processed = True
      raw_document.save()
  
      note = raw_document.convert_to_note()
  
-    if mimetype == 'application/pdf':
-        note.file_type = 'pdf'
+    if raw_document.mimetype == PDF_MIMETYPE:
+        note.html = pdf2html(original_content)
  
-    elif mimetype in PPT_MIMETYPES:
-        note.file_type = 'ppt'
-        note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
+    elif raw_document.mimetype in PPT_MIMETYPES:
+        note.html = pdf2html(content_dict['pdf'])
  
      elif 'html' in content_dict and content_dict['html']:
          note.html = content_dict['html']
diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py

index b56e7191b03f77bfe052f42780751d2fa7853763..c889fea1e1f4b3e4c0799ded1a995e3b3d3098d0 100644 (file)
--- a/karmaworld/apps/notes/models.py
+++ b/karmaworld/apps/notes/models.py
@@ -136,20 +136,20 @@ class Note(Document):
          (UNKNOWN_FILE, 'Unknown file'),
      )
  
-    file_type       = models.CharField(max_length=15,  \
-                            choices=FILE_TYPE_CHOICES, \
-                            default=UNKNOWN_FILE,      \
+    file_type       = models.CharField(max_length=15,
+                            choices=FILE_TYPE_CHOICES,
+                            default=UNKNOWN_FILE,
                              blank=True, null=True)
  
      # Upload files to MEDIA_ROOT/notes/YEAR/MONTH/DAY, 2012/10/30/filename
-    pdf_file       = models.FileField(                  \
-                            storage=fs,                 \
-                            upload_to="notes/%Y/%m/%d/",\
+    pdf_file       = models.FileField(
+                            storage=fs,
+                            upload_to="notes/%Y/%m/%d/",
                              blank=True, null=True)
      # No longer keeping a local copy backed by django
-    note_file       = models.FileField(                 \
-                            storage=fs,                 \
-                            upload_to="notes/%Y/%m/%d/",\
+    note_file       = models.FileField(
+                            storage=fs,
+                            upload_to="notes/%Y/%m/%d/",
                              blank=True, null=True)
  
      # Google Drive URLs
diff --git a/karmaworld/templates/notes/note_detail.html b/karmaworld/templates/notes/note_detail.html

index 351bdf41544afb51bf8c05328e06ce768c167e72..4c4c2949495cec9bd7d49a1534640dfe4d80c2cb 100644 (file)
--- a/karmaworld/templates/notes/note_detail.html
+++ b/karmaworld/templates/notes/note_detail.html
@@ -44,7 +44,7 @@
          <div class="row">
            <div class="small-12 columns">
              <iframe
-                style="border:none; width:100%; min-height: 700px; margins:auto" 
+                style="border:none; width:100%; min-height: 700px; margins: auto;"
                  id="noteframe"
                  src="/pdfview/{{ note.id }}"
                  onload="autoResize('noteframe')"> </iframe>
@@ -59,10 +59,10 @@
        {% else %} {# if pdf #}
  
        <div class="row">
-        <div class="small-12 small-centered columns medium-12 large-10 body_copy">
+        <div class="small-12 small-centered columns medium-12 large-12 body_copy">
            {% if note.html %}
              <div class="note-text">
-              <iframe style="border:none; width:100%;" id="noteframe" src="/raw/{{ note.id }}" onload="autoResize('noteframe')"> </iframe>
+              <iframe style="border:none; width:100%; min-height: 700px;" id="noteframe" src="/raw/{{ note.id }}" onload="autoResize('noteframe')"> </iframe>
              </div> <!-- .note-text -->
  
            {% else %} {# note.html #}
author	Charles Connell <charles@connells.org>
	Fri, 20 Dec 2013 23:58:00 +0000 (18:58 -0500)
committer	Charles Connell <charles@connells.org>
	Sat, 21 Dec 2013 01:47:28 +0000 (20:47 -0500)
karmaworld/apps/notes/gdrive.py		patch \| blob \| history
karmaworld/apps/notes/models.py		patch \| blob \| history
karmaworld/templates/notes/note_detail.html		patch \| blob \| history