Only convert non-PDFs to markdown; add a command to convert all non-PDF notes that have static HTML

author Jacob Hilker <hilker.j@gmail.com>

Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)

committer Jacob Hilker <hilker.j@gmail.com>

Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
author Jacob Hilker <hilker.j@gmail.com>
Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
committer Jacob Hilker <hilker.j@gmail.com>
Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py

index 2c2254310c39ad2bced4b1f5e8013e1e291bbc01..d9ba0e8428b426aab727eb92223cc504fac287c7 100644 (file)
--- a/karmaworld/apps/notes/gdrive.py
+++ b/karmaworld/apps/notes/gdrive.py
@@ -230,12 +230,14 @@ def convert_raw_document(raw_document, user=None):
  
      # Extract HTML from the appropriate place
      html = ''
+    convert_to_markdown = False
      if raw_document.mimetype == PDF_MIMETYPE:
          html = pdf2html(original_content)
      elif raw_document.mimetype in PPT_MIMETYPES:
          html = pdf2html(content_dict['pdf'])
      elif 'html' in content_dict and content_dict['html']:
          html = content_dict['html']
+        convert_to_markdown = True
      # cleanup the HTML
      html = note.filter_html(html)
  
@@ -244,15 +246,15 @@ def convert_raw_document(raw_document, user=None):
  
      note.text = content_dict['text']
  
+    if convert_to_markdown:
+        h = html2text.HTML2Text()
+        h.google_doc = True
+        h.escape_snob = True
+        h.unicode_snob = True
+        markdown = h.handle(html.decode('utf8', 'ignore'))
  
-    h = html2text.HTML2Text()
-    h.google_doc = True
-    h.escape_snob = True
-    h.unicode_snob = True
-    markdown = h.handle(html.decode('utf8', 'ignore'))
-
-    note_markdown = NoteMarkdown(note=note, markdown=markdown)
-    note_markdown.save()
+        note_markdown = NoteMarkdown(note=note, markdown=markdown)
+        note_markdown.save()
  
      note_details = extract_file_details(fp_file)
      if 'year' in note_details and note_details['year']:
diff --git a/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py

new file mode 100644 (file)

index 0000000..6d98008
--- /dev/null
+++ b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2012  FinalsClub Foundation
+
+import html2text
+from django.core.files.storage import default_storage
+from django.core.management.base import BaseCommand
+from karmaworld.apps.notes.models import Note, NoteMarkdown
+
+class Command(BaseCommand):
+    """ Command to process all notes and add a markdown version of the file
+        into the database """
+    args = 'none'
+    help = "Take all notes and use their html to create a markdown version of the document"
+
+    def handle(self, *args, **kwargs):
+        """ For every non-PDF note with static html, create or update its markdown using html2text """
+        notes = Note.objects.all()
+
+        converted_notes = 0
+        for note in notes:
+            if note.static_html and not note.is_pdf():
+                h = html2text.HTML2Text()
+                h.google_doc = True
+                h.escape_snob = True
+                h.unicode_snob = True
+
+                with default_storage.open(note.get_relative_s3_path(),'r') as html:
+                    markdown = h.handle(html.read().decode('utf8', 'ignore'))
+                    if note.has_markdown():
+                        note_markdown = note.notemarkdown
+                        note_markdown.markdown = markdown
+                    else:
+                        note_markdown = NoteMarkdown(note=note, markdown=markdown)
+                    note_markdown.save()
+                converted_notes += 1
+                print 'Processed {n}'.format(n=note)
+
+            """if not note.static_html:
+                # no HTML to fetch
+                continue
+            try:
+                h = html2text.HTML2Text()
+                h.escape_snob = True
+                h.unicode_snob = True
+                h.ignore_links = True
+                h.ignore_images = True
+                h.ignore_emphasis = True
+                # fetch data
+                with default_storage.open(note.get_relative_s3_path(),'r') as \
+                  html:
+                    note.text = h.handle(html.read())
+                note.save()
+                cleaned_notes += 1
+                print 'Processed {n}'.format(n=note)
+            except Exception, e:
+                print note
+                print e
+                continue"""
+        print 'Processed %s notes' % converted_notes
+
diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py

index 4a03516eb400b31e907c07a3fc8a9863637b5aac..3852bdd6c38f08e29c7e52fa5df69390d3382b13 100644 (file)
--- a/karmaworld/apps/notes/models.py
+++ b/karmaworld/apps/notes/models.py
@@ -204,6 +204,12 @@ class Note(Document):
          (UNKNOWN_FILE, 'Unknown file'),
      )
  
+    PDF_MIMETYPES = (
+      'application/pdf',
+      'application/vnd.ms-powerpoint',
+      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+    )
+
      file_type       = models.CharField(max_length=15,
                              choices=FILE_TYPE_CHOICES,
                              default=UNKNOWN_FILE,
@@ -408,6 +414,9 @@ class Note(Document):
      def has_markdown(self):
          return hasattr(self, "notemarkdown")
  
+    def is_pdf(self):
+        return self.mimetype in Note.PDF_MIMETYPES
+
  
  class NoteMarkdown(models.Model):
      note     = models.OneToOneField(Note, primary_key=True)
diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py

index f8ab6704b7f62a7a0c6bbde467ed1e88432c3540..db274a4d75d74cd549d0bbdc172ff7e8b453e3e8 100644 (file)
--- a/karmaworld/apps/notes/views.py
+++ b/karmaworld/apps/notes/views.py
@@ -28,12 +28,6 @@ from karmaworld.apps.notes.forms import NoteForm
  
  logger = logging.getLogger(__name__)
  
-PDF_MIMETYPES = (
-    'application/pdf',
-    'application/vnd.ms-powerpoint',
-    'application/vnd.openxmlformats-officedocument.presentationml.presentation'
-)
-
  THANKS_FIELD = 'thanks'
  USER_PROFILE_THANKS_FIELD = 'thanked_notes'
  FLAG_FIELD = 'flags'
@@ -49,7 +43,7 @@ class NoteDetailView(DetailView):
          """ Generate custom context for the page rendering a Note
              + if pdf, set the `pdf` flag
          """
-        if self.object.mimetype in PDF_MIMETYPES:
+        if self.object.is_pdf():
              kwargs['pdf_controls'] = True
  
          if self.request.user.is_authenticated():
author	Jacob Hilker <hilker.j@gmail.com>
	Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
committer	Jacob Hilker <hilker.j@gmail.com>
	Sun, 9 Mar 2014 23:03:57 +0000 (19:03 -0400)
karmaworld/apps/notes/gdrive.py		patch \| blob \| history
karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py	[new file with mode: 0644]	patch \| blob
karmaworld/apps/notes/models.py		patch \| blob \| history
karmaworld/apps/notes/views.py		patch \| blob \| history