Re-disable the HTML sanitize for now
[oweals/karmaworld.git] / karmaworld / apps / notes / gdrive.py
index 605ea016d5d93ca471825e021167139c3cfc450a..79600547c79fbbb4a4c4558cb70508628d504de5 100644 (file)
@@ -3,9 +3,10 @@
 # Copyright (C) 2012  FinalsClub Foundation
 
 import datetime
-from io import FileIO, BufferedWriter
+import magic
 import mimetypes
 import os
+import re
 import time
 
 import httplib2
@@ -13,7 +14,7 @@ from apiclient.discovery import build
 from apiclient.http import MediaFileUpload
 from apiclient.http import MediaInMemoryUpload
 from django.conf import settings
-from django.core.files import File
+from django.core.files.base import ContentFile
 from oauth2client.client import flow_from_clientsecrets
 
 from karmaworld.apps.notes.models import DriveAuth, Note
@@ -22,7 +23,7 @@ CLIENT_SECRET = os.path.join(settings.DJANGO_ROOT, \
                     'secret/client_secrets.json')
 #from credentials import GOOGLE_USER # FIXME
 try:
-    from secret.drive import GOOGLE_USER
+    from secrets.drive import GOOGLE_USER
 except:
     GOOGLE_USER = 'admin@karmanotes.org' # FIXME
 
@@ -30,6 +31,23 @@ EXT_TO_MIME = {'.docx': 'application/msword'}
 
 PPT_MIMETYPES = ['application/vnd.ms-powerpoint', 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
 
+def extract_file_details(fileobj):
+    """Return {'year': ...} parsed from the libmagic description of fileobj.
+
+    'year' is the 4-digit string matched after 'Create Time/Date:', else None.
+    fileobj must provide open()/read()/close(); it is closed even on failure.
+    """
+    fileobj.open()
+    try:
+        filebuf = fileobj.read()
+    finally:
+        fileobj.close()
+    with magic.Magic() as m:
+        details = m.id_buffer(filebuf)
+    # 'year' is a named group of the pattern, so a match always carries it.
+    result = re.search(r'Create Time/Date:[^,]+(?P<year>\d{4})', details)
+    return {'year': result.group('year') if result else None}
+
 def build_flow():
     """ Create an oauth2 autentication object with our preferred details """
     scopes = [
@@ -188,22 +206,7 @@ def convert_with_google_drive(note):
 
     elif extension.lower() in ['.ppt', '.pptx']:
         new_note.file_type = 'ppt'
-        now = datetime.datetime.utcnow()
-        # create a folder path to store the ppt > pdf file with year and month folders
-        nonce_path = '/ppt_pdf/%s/%s/' % (now.year, now.month)
-
-        _path = filename + '.pdf'
-        try:
-            # If those folders don't exist, create them
-            os.makedirs(os.path.realpath(os.path.dirname(_path)))
-        except:
-            print "we failed to create those directories"
-
-        _writer = BufferedWriter(FileIO(_path, "w"))
-        _writer.write(content_dict['pdf'])
-        _writer.close()
-
-        new_note.pdf_file = _path
+        new_note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
 
     else:
         # PPT files do not have this export ability
@@ -251,6 +254,7 @@ def convert_raw_document(raw_document):
     content_dict = download_from_gdrive(file_dict, http, mimetype=raw_document.mimetype)
 
     # this should have already happened, lets see why it hasn't
+    raw_document.is_processed = True
     raw_document.save()
 
     note = raw_document.convert_to_note()
@@ -260,33 +264,20 @@ def convert_raw_document(raw_document):
 
     elif raw_document.mimetype in PPT_MIMETYPES:
         note.file_type = 'ppt'
-        now = datetime.datetime.utcnow()
-        # create a folder path to store the ppt > pdf file with year and month folders
-        nonce_path = '/ppt_pdf/%s/%s/' % (now.year, now.month)
-
-        _path = filename + '.pdf'
-        try:
-            # If those folders don't exist, create them
-            os.makedirs(os.path.realpath(os.path.dirname(_path)))
-        except:
-            print "we failed to create those directories"
+        note.pdf_file.save(filename + '.pdf', ContentFile(content_dict['pdf']))
 
-        _writer = BufferedWriter(FileIO(_path, "w"))
-        _writer.write(content_dict['pdf'])
-        _writer.close()
-
-        note.pdf_file = _path
-
-    else:
-        # PPT files do not have this export ability
-        note.gdrive_url = file_dict[u'exportLinks']['application/vnd.oasis.opendocument.text']
+    elif 'html' in content_dict and content_dict['html']:
         note.html = content_dict['html']
+        # before we save new html, sanitize a tags in note.html
+        #note.sanitize_html(save=False)
+        #FIXME: ^^^ disabled
 
     note.text = content_dict['text']
 
-    # before we save new html, sanitize a tags in note.html
-    #note.sanitize_html(save=False)
-    #FIXME: ^^^ disabled until we can get html out of an Etree html element
+    note_details = extract_file_details(fp_file)
+    if 'year' in note_details and note_details['year']:
+        note.year = note_details['year']
+
 
     # Finally, save whatever data we got back from google
     note.save()