From: Jacob Hilker <hilker.j@gmail.com>
Date: Sun, 9 Mar 2014 23:03:57 +0000 (-0400)
Subject: only convert non pdfs, add command to convert all non pdf files with static html
X-Git-Tag: release-20150131~148^2~1^2~6
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=0ba50ff8ba26847cac33a6bdd811a90de8d5082c;p=oweals%2Fkarmaworld.git

only convert non pdfs, add command to convert all non pdf files with static html
---

diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py
index 2c22543..d9ba0e8 100644
--- a/karmaworld/apps/notes/gdrive.py
+++ b/karmaworld/apps/notes/gdrive.py
@@ -230,12 +230,14 @@ def convert_raw_document(raw_document, user=None):
 
     # Extract HTML from the appropriate place
     html = ''
+    convert_to_markdown = False
     if raw_document.mimetype == PDF_MIMETYPE:
         html = pdf2html(original_content)
     elif raw_document.mimetype in PPT_MIMETYPES:
         html = pdf2html(content_dict['pdf'])
     elif 'html' in content_dict and content_dict['html']:
         html = content_dict['html']
+        convert_to_markdown = True
     # cleanup the HTML
     html = note.filter_html(html)
 
@@ -244,15 +246,15 @@ def convert_raw_document(raw_document, user=None):
 
     note.text = content_dict['text']
 
+    if convert_to_markdown:
+        h = html2text.HTML2Text()
+        h.google_doc = True
+        h.escape_snob = True
+        h.unicode_snob = True
+        markdown = h.handle(html.decode('utf8', 'ignore'))
 
-    h = html2text.HTML2Text()
-    h.google_doc = True
-    h.escape_snob = True
-    h.unicode_snob = True
-    markdown = h.handle(html.decode('utf8', 'ignore'))
-
-    note_markdown = NoteMarkdown(note=note, markdown=markdown)
-    note_markdown.save()
+        note_markdown = NoteMarkdown(note=note, markdown=markdown)
+        note_markdown.save()
 
     note_details = extract_file_details(fp_file)
     if 'year' in note_details and note_details['year']:
diff --git a/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py
new file mode 100644
index 0000000..6d98008
--- /dev/null
+++ b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2012  FinalsClub Foundation
+
+import html2text
+from django.core.files.storage import default_storage
+from django.core.management.base import BaseCommand
+from karmaworld.apps.notes.models import Note, NoteMarkdown
+
+class Command(BaseCommand):
+    """ Command to process all notes and add a markdown version of the file
+        into the database """
+    args = 'none'
+    help = "Take all notes and use their html to create a markdown version of the document"
+
+    def handle(self, *args, **kwargs):
+        """ On all calls, convert all non-pdf notes with static html to markdown using html2text """
+        notes = Note.objects.all()
+
+        converted_notes = 0
+        for note in notes:
+            if note.static_html and not note.is_pdf():
+                h = html2text.HTML2Text()
+                h.google_doc = True
+                h.escape_snob = True
+                h.unicode_snob = True
+
+                with default_storage.open(note.get_relative_s3_path(),'r') as html:
+                    markdown = h.handle(html.read().decode('utf8', 'ignore'))
+                    if note.has_markdown():
+                        note_markdown = note.notemarkdown
+                        note_markdown.markdown = markdown
+                    else:
+                        note_markdown = NoteMarkdown(note=note, markdown=markdown)
+                    note_markdown.save()
+                converted_notes += 1
+                print 'Processed {n}'.format(n=note)
+
+            """if not note.static_html:
+                # no HTML to fetch
+                continue
+            try:
+                h = html2text.HTML2Text()
+                h.escape_snob = True
+                h.unicode_snob = True
+                h.ignore_links = True
+                h.ignore_images = True
+                h.ignore_emphasis = True
+                # fetch data
+                with default_storage.open(note.get_relative_s3_path(),'r') as \
+                  html:
+                    note.text = h.handle(html.read())
+                note.save()
+                cleaned_notes += 1
+                print 'Processed {n}'.format(n=note)
+            except Exception, e:
+                print note
+                print e
+                continue"""
+        print 'Processed %s notes' % converted_notes
+
diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py
index 4a03516..3852bdd 100644
--- a/karmaworld/apps/notes/models.py
+++ b/karmaworld/apps/notes/models.py
@@ -204,6 +204,12 @@ class Note(Document):
         (UNKNOWN_FILE, 'Unknown file'),
     )
 
+    PDF_MIMETYPES = (
+      'application/pdf',
+      'application/vnd.ms-powerpoint',
+      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+    )
+
     file_type       = models.CharField(max_length=15,
                             choices=FILE_TYPE_CHOICES,
                             default=UNKNOWN_FILE,
@@ -408,6 +414,9 @@ class Note(Document):
     def has_markdown(self):
         return hasattr(self, "notemarkdown")
 
+    def is_pdf(self):
+        return self.mimetype in Note.PDF_MIMETYPES
+
 
 class NoteMarkdown(models.Model):
     note     = models.OneToOneField(Note, primary_key=True)
diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py
index f8ab670..db274a4 100644
--- a/karmaworld/apps/notes/views.py
+++ b/karmaworld/apps/notes/views.py
@@ -28,12 +28,6 @@ from karmaworld.apps.notes.forms import NoteForm
 
 logger = logging.getLogger(__name__)
 
-PDF_MIMETYPES = (
-    'application/pdf',
-    'application/vnd.ms-powerpoint',
-    'application/vnd.openxmlformats-officedocument.presentationml.presentation'
-)
-
 THANKS_FIELD = 'thanks'
 USER_PROFILE_THANKS_FIELD = 'thanked_notes'
 FLAG_FIELD = 'flags'
@@ -49,7 +43,7 @@ class NoteDetailView(DetailView):
         """ Generate custom context for the page rendering a Note
             + if pdf, set the `pdf` flag
         """
-        if self.object.mimetype in PDF_MIMETYPES:
+        if self.object.is_pdf():
             kwargs['pdf_controls'] = True
 
         if self.request.user.is_authenticated():