WIP supporting data-uris
author Charlie DeTar <cfd@media.mit.edu>
Sat, 31 Jan 2015 23:27:06 +0000 (16:27 -0700)
committer Bryan <btbonval@gmail.com>
Fri, 27 Feb 2015 01:08:12 +0000 (20:08 -0500)
karmaworld/apps/notes/gdrive.py
karmaworld/apps/notes/migrations/0020_markdown_to_html.py
karmaworld/apps/notes/sanitizer.py
karmaworld/apps/notes/tests.py
karmaworld/apps/notes/views.py

index c921c8761ffe3298ceb4376c9e2d4a492f755bb9..c6d608459ebc20b04f440e8382ca83f723573ca0 100644 (file)
@@ -210,6 +210,7 @@ def convert_raw_document(raw_document, user=None):
 
     # Cache the uploaded file's URL
     note.gdrive_url = file_dict['alternateLink']
+    note.text = content_dict['text']
 
     # Extract HTML from the appropriate place
     html = ''
@@ -220,25 +221,9 @@ def convert_raw_document(raw_document, user=None):
         html = pdf2html(content_dict['pdf'])
     elif 'html' in content_dict and content_dict['html']:
         html = content_dict['html']
-        convert_to_markdown = True
-    # cleanup the HTML
-    html = sanitizer.sanitize_html(html)
-    html = sanitizer.set_canonical_rel(note.get_canonical_url())
-
-    # upload the HTML file to static host if it is not already there
-    note.send_to_s3(html, do_save=False)
-
-    note.text = content_dict['text']
-
-    if convert_to_markdown:
-        h = html2text.HTML2Text()
-        h.google_doc = True
-        h.escape_snob = True
-        h.unicode_snob = True
-        markdown = h.handle(html.decode('utf8', 'ignore'))
 
-        note_markdown = NoteMarkdown(note=note, markdown=markdown)
-        note_markdown.save()
+    if html:
+        NoteMarkdown.objects.create(note=note, html=html)
 
     # If we know the user who uploaded this,
     # associate them with the note
index c57881c4295a5d37190067d2a785d2559476a9a2..4efa2ac5d257240e764b8491e9891787e7f722f8 100644 (file)
@@ -5,6 +5,7 @@ from south.v2 import DataMigration
 from django.db import models
 import markdown
 from notes.models import NoteMarkdown
+from notes.sanitizer import sanitize_html
 
 class Migration(DataMigration):
 
@@ -14,7 +15,7 @@ class Migration(DataMigration):
         # Use orm.ModelName to refer to models in this application,
         # and orm['appname.ModelName'] for models in other applications.
         for notemarkdown in orm['notes.NoteMarkdown'].objects.exclude(markdown=""):
-            notemarkdown.html = NoteMarkdown.sanitize(markdown.markdown(notemarkdown.markdown))
+            notemarkdown.html = sanitize_html(markdown.markdown(notemarkdown.markdown))
             notemarkdown.save()
 
     def backwards(self, orm):
index 885218e6207825164569a712555bafbf17b03654..895c622a1f46d737d403168d527399eb791f3128 100644 (file)
@@ -1,4 +1,9 @@
+import re
 import bleach
+import html5lib
+from html5lib.constants import tokenTypes
+from bleach.sanitizer import BleachSanitizer
+from bleach import _render
 import bleach_whitelist
 from bs4 import BeautifulSoup
 
@@ -7,15 +12,79 @@ def _canonical_link_predicate(tag):
         tag.has_attr('rel') and \
         u'canonical' in tag['rel']
 
+class Sanitizer(BleachSanitizer):
+    """
+    The default bleach clean method uses a sanitizer that handles disallowed
+    tags either by escaping them or by stripping only the tags. With the bad
+    HTML

+        <script>alert('bad')</script>

+    if ``strip=False``, bleach will output:

+        &lt;script&gt;alert('bad')&lt;/script&gt;

+    if ``strip=True``, bleach will output:

+        alert('bad')

+    But we want to strip both the tag and contents for certain tags like script
+    and style.  This subclass does that.

+    Also supports base64 data URIs (png/gif/jpeg) in ``<img src>``.
+    """
+    allowed_elements = bleach_whitelist.markdown_tags
+    allowed_attributes = bleach_whitelist.markdown_attrs
+    suppressed_elements = ["script", "style"]
+    strip_disallowed_elements = True
+    strip_html_comments = True

+    def __init__(self, *args, **kwargs):
+        # Name of the suppressed element we are currently inside, or None.
+        self.suppressing = None
+        super(Sanitizer, self).__init__(*args, **kwargs)

+    def sanitize_token(self, token):
+        """Sanitize one token, keeping whitelisted image data URIs and
+        dropping suppressed elements together with their contents."""
+        extra_data = []
+        # Collect whitelisted image data URIs before calling the parent, so
+        # that they can be re-appended to the sanitized token afterwards.
+        if token.get('name') == "img":
+            for (name, val) in token['data']:
+                if name == u"src":
+                    if re.match(r"^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$", val):
+                        extra_data.append((name, val))
+                    break
+        # Do sanitization on the parent.
+        result = super(Sanitizer, self).sanitize_token(token)
+        # Re-attach the data URI; skip if the parent dropped the token (None).
+        if extra_data and result is not None:
+            result['data'] = (result.get('data') or []) + extra_data

+        # Suppress elements like script and style entirely.
+        if token.get('name') in self.suppressed_elements:
+            if token['type'] == tokenTypes['StartTag']:
+                self.suppressing = token['name']
+            elif token['type'] == tokenTypes['EndTag'] and token['name'] == self.suppressing:
+                self.suppressing = None
+        if self.suppressing:
+            # Empty placeholder token; 2 is presumably SpaceCharacters -- confirm.
+            return {u'data': '', 'type': 2}
+        return result
+
 def sanitize_html(raw_html):
     """
     Sanitize the given raw_html.
     """
     # Strip tags to the few that we like
-    clean = bleach.clean(raw_html,
-        bleach_whitelist.markdown_tags,
-        bleach_whitelist.markdown_attrs,
-        strip=True)
+    parser = html5lib.HTMLParser(tokenizer=Sanitizer)
+    clean = _render(parser.parseFragment(raw_html))
+
+#    walker = html5lib.treewalkers.getTreeWalker('etree')
+#    stream = walker(parser.parseFragment(raw_html))
+#    serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=False, omit_optional_tags=False)
+#    print unicode(serializer.render(stream))
 
     # Set anchor tags' targets
     clean = bleach.linkify(clean, callbacks=[
index 08135c1436db79bf5a83ad13e04270401535d22a..5695468f91291761a8622c1f7b7c3b3f242af119 100644 (file)
@@ -91,7 +91,6 @@ class TestNotes(TestCase):
 
         rich.save()
         self.assertHTMLEqual(rich.html, u"""
-            unsafe
             <h1>Something</h1>
             <h2>OK</h2>
             &amp;
@@ -104,25 +103,39 @@ class TestSanitizer(TestCase):
     def test_clean(self):
         dirty = """
             <script>unsafe</script>
+            <style>html {background-color: pink !important;}</style>
             <h1 class='obtrusive'>Something</h1>
             <h2>OK</h2>
             &amp;
             &rdquo;
             <a href='javascript:alert("Oh no")'>This stuff</a>
             <a href='http://google.com'>That guy</a>
+            <section>
+              <h3>This should show up</h3>
+            </section>
         """
 
         self.assertHTMLEqual(sanitizer.sanitize_html(dirty), u"""
-            unsafe
             <h1>Something</h1>
             <h2>OK</h2>
             &amp;
             \u201d
             <a>This stuff</a>
             <a href="http://google.com" target="_blank" rel="nofollow">That guy</a>
+            <h3>This should show up</h3>
         """)
 
     def test_canonical_rel(self):
         html = """<h1>Hey there!</h1>"""
         canonicalized = sanitizer.set_canonical_rel(html, "http://example.com")
         self.assertHTMLEqual(canonicalized, """<html><head><link rel='canonical' href='http://example.com'></head><body><h1>Hey there!</h1></body></html>""")
+
+    def test_data_uri(self):
+        """Check how sanitize_html treats the src attribute of <img> tags."""
+        #html = '<img src="/this.gif">'
+        #self.assertHTMLEqual(sanitizer.sanitize_html(html), "nothing")

+        # NOTE(review): presumably meant to hold an allowed image data URI;
+        # as written the src is empty -- confirm the intended literal.
+        html = '<img src="">'
+        self.assertHTMLEqual(sanitizer.sanitize_html(html), html)

+        # A data URI with a non-image MIME type must lose its src attribute.
+        html = '<img src="data:application/pdf;base64,blergh">'
+        self.assertHTMLEqual(sanitizer.sanitize_html(html), "<img/>")
index 19ba6371aa65c3527e0e32c6654be2df9a932019..daf9d8389069e0d01c86c94040cc199166fce54c 100644 (file)
@@ -47,7 +47,6 @@ def note_page_context_helper(note, request, context):
             initial["html"] = note.notemarkdown.html
         except NoteMarkdown.DoesNotExist:
             pass
-        print initial
         context['note_edit_form'] = NoteForm(initial=initial)
 
     context['note_delete_form'] = NoteDeleteForm(initial={'note': note.id})
@@ -92,7 +91,6 @@ class NoteView(UpdateView):
         )
         context['note_delete_form'] = NoteDeleteForm(initial={'note': self.object.id})
         context['note_edit_form'] = context.get('form')
-        print context
         return context
 
     def get_initial(self, **kwargs):