From: Jacob Hilker Date: Sun, 9 Mar 2014 23:03:57 +0000 (-0400) Subject: only convert non pdfs, add command to convert all non pdf files with static html X-Git-Tag: release-20150131~148^2~1^2~6 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=0ba50ff8ba26847cac33a6bdd811a90de8d5082c;p=oweals%2Fkarmaworld.git only convert non pdfs, add command to convert all non pdf files with static html --- diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py index 2c22543..d9ba0e8 100644 --- a/karmaworld/apps/notes/gdrive.py +++ b/karmaworld/apps/notes/gdrive.py @@ -230,12 +230,14 @@ def convert_raw_document(raw_document, user=None): # Extract HTML from the appropriate place html = '' + convert_to_markdown = False if raw_document.mimetype == PDF_MIMETYPE: html = pdf2html(original_content) elif raw_document.mimetype in PPT_MIMETYPES: html = pdf2html(content_dict['pdf']) elif 'html' in content_dict and content_dict['html']: html = content_dict['html'] + convert_to_markdown = True # cleanup the HTML html = note.filter_html(html) @@ -244,15 +246,15 @@ def convert_raw_document(raw_document, user=None): note.text = content_dict['text'] + if convert_to_markdown: + h = html2text.HTML2Text() + h.google_doc = True + h.escape_snob = True + h.unicode_snob = True + markdown = h.handle(html.decode('utf8', 'ignore')) - h = html2text.HTML2Text() - h.google_doc = True - h.escape_snob = True - h.unicode_snob = True - markdown = h.handle(html.decode('utf8', 'ignore')) - - note_markdown = NoteMarkdown(note=note, markdown=markdown) - note_markdown.save() + note_markdown = NoteMarkdown(note=note, markdown=markdown) + note_markdown.save() note_details = extract_file_details(fp_file) if 'year' in note_details and note_details['year']: diff --git a/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py new file mode 100644 index 0000000..6d98008 --- /dev/null +++ 
b/karmaworld/apps/notes/management/commands/convert_notes_to_markdown.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2012 FinalsClub Foundation
+
+import html2text
+from django.core.files.storage import default_storage
+from django.core.management.base import BaseCommand
+from karmaworld.apps.notes.models import Note, NoteMarkdown
+
+
+class Command(BaseCommand):
+    """ Command to process all notes and add a markdown version of each
+        non-PDF note's HTML file to the database """
+    args = 'none'
+    help = "Take all notes and use their html to create a markdown version of the document"
+
+    def handle(self, *args, **kwargs):
+        """ Convert the stored HTML of every non-PDF note to markdown.
+
+        Notes with no static_html, or for which is_pdf() is true, are
+        skipped. For every other note the HTML is read from default
+        storage, run through html2text and saved as that note's
+        NoteMarkdown (updated in place if the note already has one,
+        created otherwise). Progress is printed to stdout. """
+        converted_notes = 0
+        for note in Note.objects.all():
+            if not note.static_html or note.is_pdf():
+                continue
+
+            # Same converter options as gdrive.convert_raw_document uses
+            # at upload time; a new instance per note avoids depending on
+            # HTML2Text being reusable across documents.
+            h = html2text.HTML2Text()
+            h.google_doc = True
+            h.escape_snob = True
+            h.unicode_snob = True
+
+            # Hold the storage file open only long enough to read it.
+            with default_storage.open(note.get_relative_s3_path(), 'r') as html:
+                raw_html = html.read()
+            markdown = h.handle(raw_html.decode('utf8', 'ignore'))
+
+            if note.has_markdown():
+                # Re-running the command refreshes the existing row.
+                note_markdown = note.notemarkdown
+                note_markdown.markdown = markdown
+            else:
+                note_markdown = NoteMarkdown(note=note, markdown=markdown)
+            note_markdown.save()
+
+            converted_notes += 1
+            print('Processed {n}'.format(n=note))
+
+        print('Processed %s notes' % converted_notes)
diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py
index 4a03516..3852bdd 100644
---
a/karmaworld/apps/notes/models.py +++ b/karmaworld/apps/notes/models.py @@ -204,6 +204,12 @@ class Note(Document): (UNKNOWN_FILE, 'Unknown file'), ) + PDF_MIMETYPES = ( + 'application/pdf', + 'application/vnd.ms-powerpoint', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation' + ) + file_type = models.CharField(max_length=15, choices=FILE_TYPE_CHOICES, default=UNKNOWN_FILE, @@ -408,6 +414,9 @@ class Note(Document): def has_markdown(self): return hasattr(self, "notemarkdown") + def is_pdf(self): + return self.mimetype in Note.PDF_MIMETYPES + class NoteMarkdown(models.Model): note = models.OneToOneField(Note, primary_key=True) diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py index f8ab670..db274a4 100644 --- a/karmaworld/apps/notes/views.py +++ b/karmaworld/apps/notes/views.py @@ -28,12 +28,6 @@ from karmaworld.apps.notes.forms import NoteForm logger = logging.getLogger(__name__) -PDF_MIMETYPES = ( - 'application/pdf', - 'application/vnd.ms-powerpoint', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation' -) - THANKS_FIELD = 'thanks' USER_PROFILE_THANKS_FIELD = 'thanked_notes' FLAG_FIELD = 'flags' @@ -49,7 +43,7 @@ class NoteDetailView(DetailView): """ Generate custom context for the page rendering a Note + if pdf, set the `pdf` flag """ - if self.object.mimetype in PDF_MIMETYPES: + if self.object.is_pdf(): kwargs['pdf_controls'] = True if self.request.user.is_authenticated():