# Cache the uploaded file's URL
note.gdrive_url = file_dict['alternateLink']
+ note.text = content_dict['text']
# Extract HTML from the appropriate place
html = ''
html = pdf2html(content_dict['pdf'])
elif 'html' in content_dict and content_dict['html']:
html = content_dict['html']
- convert_to_markdown = True
- # cleanup the HTML
- html = sanitizer.sanitize_html(html)
- html = sanitizer.set_canonical_rel(note.get_canonical_url())
-
- # upload the HTML file to static host if it is not already there
- note.send_to_s3(html, do_save=False)
-
- note.text = content_dict['text']
-
- if convert_to_markdown:
- h = html2text.HTML2Text()
- h.google_doc = True
- h.escape_snob = True
- h.unicode_snob = True
- markdown = h.handle(html.decode('utf8', 'ignore'))
- note_markdown = NoteMarkdown(note=note, markdown=markdown)
- note_markdown.save()
+ if html:
+ NoteMarkdown.objects.create(note=note, html=html)
# If we know the user who uploaded this,
# associate them with the note
+import re
import bleach
+import html5lib
+from html5lib.constants import tokenTypes
+from bleach.sanitizer import BleachSanitizer
+from bleach import _render
import bleach_whitelist
from bs4 import BeautifulSoup
tag.has_attr('rel') and \
u'canonical' in tag['rel']
+class Sanitizer(BleachSanitizer):
+    """
+    Tokenizer-level sanitizer that drops some elements together with their
+    contents and keeps a small set of image data URIs.
+
+    The default bleach clean method uses a sanitizer that handles disallowed
+    tags either by escaping them or by stripping them. With the bad HTML
+
+        <script>alert('bad')</script>
+
+    if ``strip=False``, bleach will output:
+
+        &lt;script&gt;alert('bad')&lt;/script&gt;
+
+    if ``strip=True``, bleach will output:
+
+        alert('bad')
+
+    But we want to strip both the tag and contents for certain tags like script
+    and style. This subclass does that.
+
+    Also supports data URIs (base64-encoded png, gif and jpeg images) in the
+    ``src`` attribute of ``img`` tags.
+    """
+    allowed_elements = bleach_whitelist.markdown_tags
+    allowed_attributes = bleach_whitelist.markdown_attrs
+    # Elements that are removed along with everything between their start
+    # and end tags.
+    suppressed_elements = ["script", "style"]
+    strip_disallowed_elements = True
+    strip_html_comments = True
+    # The only data URIs that are kept on <img src>; compiled once because
+    # sanitize_token runs for every token.
+    _image_data_uri_re = re.compile(
+        r"^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$")
+
+    def __init__(self, *args, **kwargs):
+        # Name of the suppressed element we are currently inside, or None.
+        self.suppressing = None
+        super(Sanitizer, self).__init__(*args, **kwargs)
+
+    def sanitize_token(self, token):
+        """
+        Sanitize a single tokenizer token.
+
+        Returns whatever the parent sanitizer returns for ``token``, with an
+        allowed image data URI put back on ``img`` tags. While inside one of
+        ``suppressed_elements`` an empty placeholder token is returned
+        instead, so the element's contents never reach the output.
+        """
+        extra_data = []
+        # Allow data URIs of some types for images. Store them in 'extra_data'
+        # so we can append them to the result once the parent has run.
+        if token.get('name') == "img":
+            for (name, val) in token.get('data') or []:
+                if name == u"src":
+                    if self._image_data_uri_re.match(val):
+                        extra_data.append((name, val))
+                    # Only the first src attribute is considered.
+                    break
+        # Do sanitization on the parent.
+        result = super(Sanitizer, self).sanitize_token(token)
+        # Append extra data potentially including data URIs. Guard against
+        # the parent handing back no token at all, in which case there is
+        # nothing to attach the attribute to.
+        if extra_data and result is not None:
+            if result.get('data'):
+                result['data'] += extra_data
+            else:
+                result['data'] = extra_data
+
+        # Suppress elements like script and style entirely.
+        if token.get('name') and token['name'] in self.suppressed_elements:
+            if token['type'] == tokenTypes['StartTag']:
+                self.suppressing = token['name']
+            elif token['type'] == tokenTypes['EndTag'] and token['name'] == self.suppressing:
+                self.suppressing = None
+        if self.suppressing:
+            # Everything inside a suppressed element becomes an empty token.
+            # NOTE(review): 2 is presumably html5lib's SpaceCharacters token
+            # type -- confirm against html5lib.constants.tokenTypes.
+            return {u'data': '', 'type': 2}
+        return result
+
def sanitize_html(raw_html):
"""
Sanitize the given raw_html.
"""
# Strip tags to the few that we like
- clean = bleach.clean(raw_html,
- bleach_whitelist.markdown_tags,
- bleach_whitelist.markdown_attrs,
- strip=True)
+ parser = html5lib.HTMLParser(tokenizer=Sanitizer)
+ clean = _render(parser.parseFragment(raw_html))
+
+# walker = html5lib.treewalkers.getTreeWalker('etree')
+# stream = walker(parser.parseFragment(raw_html))
+# serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=False, omit_optional_tags=False)
+# print unicode(serializer.render(stream))
# Set anchor tags' targets
clean = bleach.linkify(clean, callbacks=[
rich.save()
self.assertHTMLEqual(rich.html, u"""
- unsafe
<h1>Something</h1>
<h2>OK</h2>
&
def test_clean(self):
# sanitize_html is expected to:
#   * remove script and style elements together with their contents,
#   * drop disallowed attributes (class) and javascript: hrefs,
#   * add target="_blank" rel="nofollow" to the remaining http link,
#   * remove a disallowed wrapper tag (section) while keeping its contents.
dirty = """
<script>unsafe</script>
+ <style>html {background-color: pink !important;}</style>
<h1 class='obtrusive'>Something</h1>
<h2>OK</h2>
&
”
<a href='javascript:alert("Oh no")'>This stuff</a>
<a href='http://google.com'>That guy</a>
+ <section>
+ <h3>This should show up</h3>
+ </section>
"""
# The expected output no longer contains the script body ("unsafe").
self.assertHTMLEqual(sanitizer.sanitize_html(dirty), u"""
- unsafe
<h1>Something</h1>
<h2>OK</h2>
&
\u201d
<a>This stuff</a>
<a href="http://google.com" target="_blank" rel="nofollow">That guy</a>
+ <h3>This should show up</h3>
""")
def test_canonical_rel(self):
# set_canonical_rel(html, url) is expected to wrap a bare fragment in
# html/head/body and add a <link rel='canonical'> pointing at the URL.
html = """<h1>Hey there!</h1>"""
canonicalized = sanitizer.set_canonical_rel(html, "http://example.com")
self.assertHTMLEqual(canonicalized, """<html><head><link rel='canonical' href='http://example.com'></head><body><h1>Hey there!</h1></body></html>""")
+
+ def test_data_uri(self):
# Checks how sanitize_html treats the src attribute of img tags.
+ #html = '<img src="/this.gif">'
+ #self.assertHTMLEqual(sanitizer.sanitize_html(html), "nothing")
+
# This img is expected to come back unchanged.
# NOTE(review): src is empty here; the test name suggests an image data
# URI was intended -- confirm against the original test file.
+ html = '<img src="">'
+ self.assertHTMLEqual(sanitizer.sanitize_html(html), html)
+
# A data URI that is not an image (application/pdf) must lose its src.
+ html = '<img src="data:application/pdf;base64,blergh">'
+ self.assertHTMLEqual(sanitizer.sanitize_html(html), "<img/>")