From 55189e8bb719e79953d198270dbbf964d42bb94c Mon Sep 17 00:00:00 2001 From: Charles Connell Date: Fri, 3 Jan 2014 12:14:47 -0500 Subject: [PATCH] Generate cleaner plaintext --- .../apps/notes/management/commands/strip_html_to_text.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/karmaworld/apps/notes/management/commands/strip_html_to_text.py b/karmaworld/apps/notes/management/commands/strip_html_to_text.py index bf32b31..8d42981 100644 --- a/karmaworld/apps/notes/management/commands/strip_html_to_text.py +++ b/karmaworld/apps/notes/management/commands/strip_html_to_text.py @@ -4,7 +4,7 @@ import html2text from django.core.management.base import BaseCommand -from apps.notes.models import Note +from karmaworld.apps.notes.models import Note class Command(BaseCommand): """ Command to process notes with html, and without text @@ -13,17 +13,21 @@ class Command(BaseCommand): help = "Take all notes with the .html property and use that to fill Note.text by stripping html" def handle(self, *args, **kwargs): - """ On all calls, clean all notes with html and not text using lxml """ + """ On all calls, clean all notes with html and not text using html2text """ notes = Note.objects.filter(html__isnull=False).filter(text__isnull=True) cleaned_notes = 0 for note in notes: try: h = html2text.HTML2Text() + h.escape_snob = True + h.unicode_snob = True h.ignore_links = True h.ignore_images = True + h.ignore_emphasis = True note.text = h.handle(note.html) note.save() cleaned_notes += 1 + print 'Processed {n}'.format(n=note) except Exception, e: print note print e -- 2.25.1