Better plain text output
author Charles Connell <charles@connells.org>
Thu, 2 Jan 2014 21:17:10 +0000 (16:17 -0500)
committer Charles Connell <charles@connells.org>
Thu, 2 Jan 2014 21:17:25 +0000 (16:17 -0500)
karmaworld/apps/notes/management/commands/strip_html_to_text.py

index 3c37b6294b87cfea37712756d39297ae6dd62f6e..bf32b3198a0a78f7a82ddb2eddfd7e4c4b0ed5c0 100644 (file)
@@ -3,7 +3,6 @@
 # Copyright (C) 2012  FinalsClub Foundation
 
 import html2text
-
 from django.core.management.base import BaseCommand
 from apps.notes.models import Note
 
@@ -18,9 +17,16 @@ class Command(BaseCommand):
         notes = Note.objects.filter(html__isnull=False).filter(text__isnull=True)
         cleaned_notes = 0
         for note in notes:
-            #TODO: find style tags and drop them and their contents first
-            note.text = html2text.html2text(note.html)
-            note.save()
-            cleaned_notes += 1
-        self.stdout.write('Processed %s notes' % cleaned_notes)
+            try:
+                h = html2text.HTML2Text()
+                h.ignore_links = True
+                h.ignore_images = True
+                note.text = h.handle(note.html)
+                note.save()
+                cleaned_notes += 1
+            except Exception, e:
+                print note
+                print e
+                continue
+        print 'Processed %s notes' % cleaned_notes