From 6fb64b36e92f52579ca5b310638f36d66cba35b1 Mon Sep 17 00:00:00 2001 From: Bryan Date: Sat, 18 Jan 2014 00:07:41 -0500 Subject: [PATCH] beautifulsoup no longer adds extraneous whitespace, so the formatting is no longer broken on pdf2html files. closes #290 --- karmaworld/apps/notes/gdrive.py | 4 ++-- karmaworld/apps/notes/models.py | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py index f39ab95..6ad9f38 100644 --- a/karmaworld/apps/notes/gdrive.py +++ b/karmaworld/apps/notes/gdrive.py @@ -225,8 +225,8 @@ def convert_raw_document(raw_document, user=None, session_key=None): html = pdf2html(content_dict['pdf']) elif 'html' in content_dict and content_dict['html']: html = content_dict['html'] - # cleanup the HTML - html = note.filter_html(html) + # cleanup the HTML + html = note.filter_html(html) # upload the HTML file to static host if it is not already there note.send_to_s3(html, do_save=False) diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py index 7ce782b..93ab3f3 100644 --- a/karmaworld/apps/notes/models.py +++ b/karmaworld/apps/notes/models.py @@ -285,17 +285,13 @@ class Note(Document): # if there was no HTML, return an empty string return '' - # TODO adding from_encoding (if known) will speed up the process - # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#encodings soup = BS(html) # Iterate through filters, applying all to the soup object. for soupfilter in ( self.sanitize_anchor_html, ): soup = soupfilter(soup) - # Return BeautifulSoup cleaned up HTML in UTF-8 - # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#output-encoding - return soup.prettify("utf-8") + return str(soup) def sanitize_anchor_html(self, soup): """ -- 2.25.1