From: Charles Connell Date: Fri, 3 Jan 2014 17:14:47 +0000 (-0500) Subject: Generate cleaner plaintext X-Git-Tag: release-20150131~320 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=55189e8bb719e79953d198270dbbf964d42bb94c;p=oweals%2Fkarmaworld.git Generate cleaner plaintext --- diff --git a/karmaworld/apps/notes/management/commands/strip_html_to_text.py b/karmaworld/apps/notes/management/commands/strip_html_to_text.py index bf32b31..8d42981 100644 --- a/karmaworld/apps/notes/management/commands/strip_html_to_text.py +++ b/karmaworld/apps/notes/management/commands/strip_html_to_text.py @@ -4,7 +4,7 @@ import html2text from django.core.management.base import BaseCommand -from apps.notes.models import Note +from karmaworld.apps.notes.models import Note class Command(BaseCommand): """ Command to process notes with html, and without text @@ -13,17 +13,21 @@ class Command(BaseCommand): help = "Take all notes with the .html property and use that to fill Note.text by stripping html" def handle(self, *args, **kwargs): - """ On all calls, clean all notes with html and not text using lxml """ + """ On all calls, clean all notes with html and not text using html2text """ notes = Note.objects.filter(html__isnull=False).filter(text__isnull=True) cleaned_notes = 0 for note in notes: try: h = html2text.HTML2Text() + h.escape_snob = True + h.unicode_snob = True h.ignore_links = True h.ignore_images = True + h.ignore_emphasis = True note.text = h.handle(note.html) note.save() cleaned_notes += 1 + print 'Processed {n}'.format(n=note) except Exception, e: print note print e