only convert non pdfs, add command to convert all non pdf files with static html
author Jacob Hilker <hilker.j@gmail.com>
Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
committer Jacob Hilker <hilker.j@gmail.com>
Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
karmaworld/apps/notes/gdrive.py
karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py [new file with mode: 0644]
karmaworld/apps/notes/models.py
karmaworld/apps/notes/views.py

index 2c2254310c39ad2bced4b1f5e8013e1e291bbc01..d9ba0e8428b426aab727eb92223cc504fac287c7 100644 (file)
@@ -230,12 +230,14 @@ def convert_raw_document(raw_document, user=None):
 
     # Extract HTML from the appropriate place
     html = ''
+    convert_to_markdown = False
     if raw_document.mimetype == PDF_MIMETYPE:
         html = pdf2html(original_content)
     elif raw_document.mimetype in PPT_MIMETYPES:
         html = pdf2html(content_dict['pdf'])
     elif 'html' in content_dict and content_dict['html']:
         html = content_dict['html']
+        convert_to_markdown = True
     # cleanup the HTML
     html = note.filter_html(html)
 
@@ -244,15 +246,15 @@ def convert_raw_document(raw_document, user=None):
 
     note.text = content_dict['text']
 
+    if convert_to_markdown:
+        h = html2text.HTML2Text()
+        h.google_doc = True
+        h.escape_snob = True
+        h.unicode_snob = True
+        markdown = h.handle(html.decode('utf8', 'ignore'))
 
-    h = html2text.HTML2Text()
-    h.google_doc = True
-    h.escape_snob = True
-    h.unicode_snob = True
-    markdown = h.handle(html.decode('utf8', 'ignore'))
-
-    note_markdown = NoteMarkdown(note=note, markdown=markdown)
-    note_markdown.save()
+        note_markdown = NoteMarkdown(note=note, markdown=markdown)
+        note_markdown.save()
 
     note_details = extract_file_details(fp_file)
     if 'year' in note_details and note_details['year']:
diff --git a/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py
new file mode 100644 (file)
index 0000000..6d98008
--- /dev/null
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2012  FinalsClub Foundation
+
+import html2text
+from django.core.files.storage import default_storage
+from django.core.management.base import BaseCommand
+from karmaworld.apps.notes.models import Note, NoteMarkdown
+
+class Command(BaseCommand):
+    """ Command to process all notes and add a markdown version of the file
+        into the database """
+    args = 'none'
+    help = "Take all notes and use their html to create a markdown version of the document"
+
+    def handle(self, *args, **kwargs):
+        """ On all calls, convert every non-PDF note that has static html into markdown using html2text """
+        notes = Note.objects.all()
+
+        converted_notes = 0
+        for note in notes:
+            if note.static_html and not note.is_pdf():
+                h = html2text.HTML2Text()
+                h.google_doc = True
+                h.escape_snob = True
+                h.unicode_snob = True
+
+                with default_storage.open(note.get_relative_s3_path(),'r') as html:
+                    markdown = h.handle(html.read().decode('utf8', 'ignore'))
+                    if note.has_markdown():
+                        note_markdown = note.notemarkdown
+                        note_markdown.markdown = markdown
+                    else:
+                        note_markdown = NoteMarkdown(note=note, markdown=markdown)
+                    note_markdown.save()
+                converted_notes += 1
+                print 'Processed {n}'.format(n=note)
+
+            """if not note.static_html:
+                # no HTML to fetch
+                continue
+            try:
+                h = html2text.HTML2Text()
+                h.escape_snob = True
+                h.unicode_snob = True
+                h.ignore_links = True
+                h.ignore_images = True
+                h.ignore_emphasis = True
+                # fetch data
+                with default_storage.open(note.get_relative_s3_path(),'r') as \
+                  html:
+                    note.text = h.handle(html.read())
+                note.save()
+                cleaned_notes += 1
+                print 'Processed {n}'.format(n=note)
+            except Exception, e:
+                print note
+                print e
+                continue"""
+        print 'Processed %s notes' % converted_notes
+
index 4a03516eb400b31e907c07a3fc8a9863637b5aac..3852bdd6c38f08e29c7e52fa5df69390d3382b13 100644 (file)
@@ -204,6 +204,12 @@ class Note(Document):
         (UNKNOWN_FILE, 'Unknown file'),
     )
 
+    PDF_MIMETYPES = (
+      'application/pdf',
+      'application/vnd.ms-powerpoint',
+      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+    )
+
     file_type       = models.CharField(max_length=15,
                             choices=FILE_TYPE_CHOICES,
                             default=UNKNOWN_FILE,
@@ -408,6 +414,9 @@ class Note(Document):
     def has_markdown(self):
         return hasattr(self, "notemarkdown")
 
+    def is_pdf(self):
+        return self.mimetype in Note.PDF_MIMETYPES
+
 
 class NoteMarkdown(models.Model):
     note     = models.OneToOneField(Note, primary_key=True)
index f8ab6704b7f62a7a0c6bbde467ed1e88432c3540..db274a4d75d74cd549d0bbdc172ff7e8b453e3e8 100644 (file)
@@ -28,12 +28,6 @@ from karmaworld.apps.notes.forms import NoteForm
 
 logger = logging.getLogger(__name__)
 
-PDF_MIMETYPES = (
-    'application/pdf',
-    'application/vnd.ms-powerpoint',
-    'application/vnd.openxmlformats-officedocument.presentationml.presentation'
-)
-
 THANKS_FIELD = 'thanks'
 USER_PROFILE_THANKS_FIELD = 'thanked_notes'
 FLAG_FIELD = 'flags'
@@ -49,7 +43,7 @@ class NoteDetailView(DetailView):
         """ Generate custom context for the page rendering a Note
             + if pdf, set the `pdf` flag
         """
-        if self.object.mimetype in PDF_MIMETYPES:
+        if self.object.is_pdf():
             kwargs['pdf_controls'] = True
 
         if self.request.user.is_authenticated():