From dd784436eb2d10d92b498078e3067573f5819d69 Mon Sep 17 00:00:00 2001 From: Charlie DeTar Date: Sat, 31 Jan 2015 16:27:06 -0700 Subject: [PATCH] WIP supporting data-uris --- karmaworld/apps/notes/gdrive.py | 21 +---- .../notes/migrations/0020_markdown_to_html.py | 3 +- karmaworld/apps/notes/sanitizer.py | 77 ++++++++++++++++++- karmaworld/apps/notes/tests.py | 17 +++- karmaworld/apps/notes/views.py | 2 - 5 files changed, 93 insertions(+), 27 deletions(-) diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py index c921c87..c6d6084 100644 --- a/karmaworld/apps/notes/gdrive.py +++ b/karmaworld/apps/notes/gdrive.py @@ -210,6 +210,7 @@ def convert_raw_document(raw_document, user=None): # Cache the uploaded file's URL note.gdrive_url = file_dict['alternateLink'] + note.text = content_dict['text'] # Extract HTML from the appropriate place html = '' @@ -220,25 +221,9 @@ def convert_raw_document(raw_document, user=None): html = pdf2html(content_dict['pdf']) elif 'html' in content_dict and content_dict['html']: html = content_dict['html'] - convert_to_markdown = True - # cleanup the HTML - html = sanitizer.sanitize_html(html) - html = sanitizer.set_canonical_rel(note.get_canonical_url()) - - # upload the HTML file to static host if it is not already there - note.send_to_s3(html, do_save=False) - - note.text = content_dict['text'] - - if convert_to_markdown: - h = html2text.HTML2Text() - h.google_doc = True - h.escape_snob = True - h.unicode_snob = True - markdown = h.handle(html.decode('utf8', 'ignore')) - note_markdown = NoteMarkdown(note=note, markdown=markdown) - note_markdown.save() + if html: + NoteMarkdown.objects.create(note=note, html=html) # If we know the user who uploaded this, # associate them with the note diff --git a/karmaworld/apps/notes/migrations/0020_markdown_to_html.py b/karmaworld/apps/notes/migrations/0020_markdown_to_html.py index c57881c..4efa2ac 100644 --- a/karmaworld/apps/notes/migrations/0020_markdown_to_html.py +++ 
b/karmaworld/apps/notes/migrations/0020_markdown_to_html.py @@ -5,6 +5,7 @@ from south.v2 import DataMigration from django.db import models import markdown from notes.models import NoteMarkdown +from notes.sanitizer import sanitize_html class Migration(DataMigration): @@ -14,7 +15,7 @@ class Migration(DataMigration): # Use orm.ModelName to refer to models in this application, # and orm['appname.ModelName'] for models in other applications. for notemarkdown in orm['notes.NoteMarkdown'].objects.exclude(markdown=""): - notemarkdown.html = NoteMarkdown.sanitize(markdown.markdown(notemarkdown.markdown)) + notemarkdown.html = sanitize_html(markdown.markdown(notemarkdown.markdown)) notemarkdown.save() def backwards(self, orm): diff --git a/karmaworld/apps/notes/sanitizer.py b/karmaworld/apps/notes/sanitizer.py index 885218e..895c622 100644 --- a/karmaworld/apps/notes/sanitizer.py +++ b/karmaworld/apps/notes/sanitizer.py @@ -1,4 +1,9 @@ +import re import bleach +import html5lib +from html5lib.constants import tokenTypes +from bleach.sanitizer import BleachSanitizer +from bleach import _render import bleach_whitelist from bs4 import BeautifulSoup @@ -7,15 +12,79 @@ def _canonical_link_predicate(tag): tag.has_attr('rel') and \ u'canonical' in tag['rel'] +class Sanitizer(BleachSanitizer): + """ + The default bleach clean method uses a sanitizer that handles disallowed + tags either by escaping or by stripping them. With the bad HTML + + + + if ``strip=False``, bleach will output: + + <script>alert('bad')</script> + + if ``strip=True``, bleach will output: + + alert('bad') + + But we want to strip both the tag and contents for certain tags like script + and style. This subclass does that. + + Also support data URIs. 
+ """ + allowed_elements = bleach_whitelist.markdown_tags + allowed_attributes = bleach_whitelist.markdown_attrs + suppressed_elements = ["script", "style"] + strip_disallowed_elements = True + strip_html_comments = True + + def __init__(self, *args, **kwargs): + self.suppressing = None + super(Sanitizer, self).__init__(*args, **kwargs) + + def sanitize_token(self, token): + extra_data = [] + # Allow data URIs of some types for images. Store them in 'extra_data' + # so we can append them to the result. + if token.get('name') == "img": + for (name, val) in token['data']: + if name == u"src": + if re.match("^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$", val): + extra_data.append((name, val)) + break + # Do sanitization on the parent. + result = super(Sanitizer, self).sanitize_token(token) + # Append extra data potentially including data URIs. + if extra_data: + if result['data']: + result['data'] += extra_data + else: + result['data'] = extra_data + print result + + # Suppress elements like script and style entirely. + if token.get('name') and token['name'] in self.suppressed_elements: + if token['type'] == tokenTypes['StartTag']: + self.suppressing = token['name'] + elif token['type'] == tokenTypes['EndTag'] and token['name'] == self.suppressing: + self.suppressing = False + if self.suppressing: + return {u'data': '', 'type': 2} + else: + return result + def sanitize_html(raw_html): """ Sanitize the given raw_html. 
""" # Strip tags to the few that we like - clean = bleach.clean(raw_html, - bleach_whitelist.markdown_tags, - bleach_whitelist.markdown_attrs, - strip=True) + parser = html5lib.HTMLParser(tokenizer=Sanitizer) + clean = _render(parser.parseFragment(raw_html)) + +# walker = html5lib.treewalkers.getTreeWalker('etree') +# stream = walker(parser.parseFragment(raw_html)) +# serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=False, omit_optional_tags=False) +# print unicode(serializer.render(stream)) # Set anchor tags' targets clean = bleach.linkify(clean, callbacks=[ diff --git a/karmaworld/apps/notes/tests.py b/karmaworld/apps/notes/tests.py index 08135c1..5695468 100644 --- a/karmaworld/apps/notes/tests.py +++ b/karmaworld/apps/notes/tests.py @@ -91,7 +91,6 @@ class TestNotes(TestCase): rich.save() self.assertHTMLEqual(rich.html, u""" - unsafe

Something

OK

& @@ -104,25 +103,39 @@ class TestSanitizer(TestCase): def test_clean(self): dirty = """ +

Something

OK

& ” This stuff That guy +
+

This should show up

+
""" self.assertHTMLEqual(sanitizer.sanitize_html(dirty), u""" - unsafe

Something

OK

& \u201d This stuff That guy +

This should show up

""") def test_canonical_rel(self): html = """

Hey there!

""" canonicalized = sanitizer.set_canonical_rel(html, "http://example.com") self.assertHTMLEqual(canonicalized, """

Hey there!

""") + + def test_data_uri(self): + #html = '' + #self.assertHTMLEqual(sanitizer.sanitize_html(html), "nothing") + + html = '' + self.assertHTMLEqual(sanitizer.sanitize_html(html), html) + + html = '' + self.assertHTMLEqual(sanitizer.sanitize_html(html), "") diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py index 19ba637..daf9d83 100644 --- a/karmaworld/apps/notes/views.py +++ b/karmaworld/apps/notes/views.py @@ -47,7 +47,6 @@ def note_page_context_helper(note, request, context): initial["html"] = note.notemarkdown.html except NoteMarkdown.DoesNotExist: pass - print initial context['note_edit_form'] = NoteForm(initial=initial) context['note_delete_form'] = NoteDeleteForm(initial={'note': note.id}) @@ -92,7 +91,6 @@ class NoteView(UpdateView): ) context['note_delete_form'] = NoteDeleteForm(initial={'note': self.object.id}) context['note_edit_form'] = context.get('form') - print context return context def get_initial(self, **kwargs): -- 2.25.1