import bleach
import html5lib
from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
from bleach.sanitizer import BleachSanitizer
from bleach import _render
import bleach_whitelist
from bs4 import BeautifulSoup
+from PIL import Image
+from cStringIO import StringIO
+import base64
+import uuid
def _canonical_link_predicate(tag):
return tag.name == u'link' and \
But we want to strip both the tag and contents for certain tags like script
and style. This subclass does that.
- Also support data URI's.
+ Also support data URI's for some mimetypes (image/png, image/gif, image/jpeg)
"""
allowed_elements = bleach_whitelist.markdown_tags
allowed_attributes = bleach_whitelist.markdown_attrs
super(Sanitizer, self).__init__(*args, **kwargs)
def sanitize_token(self, token):
- extra_data = []
- # Allow data URIs of some types for images. Store them in 'extra_data'
- # so we can appendthem to the result.
- if token.get('name') == "img":
- for (name, val) in token['data']:
- if name == u"src":
- if re.match("^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$", val):
- extra_data.append((name, val))
- break
# Do sanitization on the parent.
result = super(Sanitizer, self).sanitize_token(token)
- # Append extra data potentially including data URI's.
- if extra_data:
- if result['data']:
- result['data'] += extra_data
- else:
- result['data'] = extra_data
- print result
# Suppress elements like script and style entirely.
if token.get('name') and token['name'] in self.suppressed_elements:
else:
return result
class DataUriReplacer(HTMLTokenizer, HTMLSanitizerMixin):
    """
    Convert any valid image data URIs to files, and upload them to s3. Replace
    the data URI with a link to the file in s3.

    Only ``img`` tokens whose ``src`` matches ``VALID_URI`` are rewritten;
    every other token is passed through untouched.
    """
    # Only base64-encoded png/gif/jpeg data URIs are converted.
    VALID_URI = "^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$"

    def sanitize_token(self, token):
        """
        Swap a data-URI ``src`` on an ``img`` token for the uploaded file URL.

        The token is modified in place and returned.  Attribute order is
        preserved; when an attribute is repeated, the first occurrence wins.

        Raises ValueError (from ``_upload_image``) if a matching data URI
        does not contain a usable image.
        """
        if token.get('name') != u"img":
            return token

        attrs = []
        seen = set()
        # Tokens without a 'data' entry simply have nothing to rewrite.
        for name, val in token.get('data') or ():
            if name in seen:
                # Keep the first occurrence of a duplicated attribute.
                continue
            seen.add(name)
            if name == 'src' and re.match(self.VALID_URI, val):
                val = self._upload_image(val)
            attrs.append((name, val))

        # Leave tokens without a src attribute exactly as they arrived.
        if 'src' in seen:
            token['data'] = attrs
        return token

    def _upload_image(self, data_uri):
        """
        Decode ``data_uri``, re-encode the image with PIL, store it in the
        default storage bucket under ``images/<uuid>.<format>`` and return
        the resulting URL.

        Raises ValueError if the payload is not valid base64 or is not an
        image PIL can read and re-save in the declared format.
        """
        # Function-scope imports kept from the original implementation.
        # NOTE(review): presumably this avoids needing configured Django
        # settings at module import time -- confirm before hoisting.
        from django.core.files.storage import default_storage
        from karmaworld.apps.notes.models import all_read_xml_acl
        from django.conf import settings

        header, payload = data_uri.split(";base64,", 1)
        # The header looks like "data:image/png"; drop the URI scheme so the
        # stored object gets a real Content-Type ("image/png").
        if header.startswith("data:"):
            mimetype = header[len("data:"):]
        else:
            mimetype = header
        fmt = mimetype.split("/")[1]

        try:
            raw = base64.b64decode(payload)
        except (TypeError, ValueError):
            # Python 2 raises TypeError on bad padding; Python 3 raises
            # binascii.Error, which is a ValueError subclass.
            raise ValueError("Bad image data URI")

        image_data = StringIO()
        try:
            # Round-trip through PIL so only data it accepts as an image of
            # the declared format is uploaded.
            image = Image.open(StringIO(raw))
            image.save(image_data, format=fmt)
        except IOError:
            raise ValueError("Bad image data URI")

        filepath = "images/{}.{}".format(uuid.uuid4(), fmt)
        new_key = default_storage.bucket.new_key(filepath)
        new_key.set_contents_from_string(image_data.getvalue(),
                                         {"Content-Type": mimetype})
        new_key.set_xml_acl(all_read_xml_acl)

        base_url = settings.S3_URL
        if base_url.startswith("//"):
            # Fully resolve the URL as https for happiness in all things.
            base_url = "https:" + base_url
        return base_url + filepath

    def __iter__(self):
        """Yield the tokenizer's tokens after data-URI replacement."""
        for token in HTMLTokenizer.__iter__(self):
            token = self.sanitize_token(token)
            if token:
                yield token
+
def sanitize_html(raw_html):
"""
Sanitize the given raw_html.
parser = html5lib.HTMLParser(tokenizer=Sanitizer)
clean = _render(parser.parseFragment(raw_html))
-# walker = html5lib.treewalkers.getTreeWalker('etree')
-# stream = walker(parser.parseFragment(raw_html))
-# serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=False, omit_optional_tags=False)
-# print unicode(serializer.render(stream))
-
# Set anchor tags' targets
clean = bleach.linkify(clean, callbacks=[
bleach.callbacks.nofollow,
bleach.callbacks.target_blank
- ])
+ ], tokenizer=Sanitizer)
+ return clean
+
def data_uris_to_s3(raw_html):
    """
    Parse raw_html as a fragment using DataUriReplacer as the tokenizer and
    return the rendered result.
    """
    replacing_parser = html5lib.HTMLParser(tokenizer=DataUriReplacer)
    fragment = replacing_parser.parseFragment(raw_html)
    return _render(fragment)
def set_canonical_rel(raw_html, href):
#!/usr/bin/env python
# -*- coding:utf8 -*-
# Copyright (C) 2012 FinalsClub Foundation
+import re
import datetime
from django.test import TestCase
+from bs4 import BeautifulSoup
from karmaworld.apps.notes.search import SearchIndex
from karmaworld.apps.notes.models import Note, NoteMarkdown
self.assertHTMLEqual(canonicalized, """<html><head><link rel='canonical' href='http://example.com'></head><body><h1>Hey there!</h1></body></html>""")
def test_data_uri(self):
- #html = '<img src="/this.gif">'
- #self.assertHTMLEqual(sanitizer.sanitize_html(html), "nothing")
-
+ # Strip out all data URIs.
html = '<img src="">'
- self.assertHTMLEqual(sanitizer.sanitize_html(html), html)
+ self.assertHTMLEqual(sanitizer.sanitize_html(html), "<img/>")
+ # Strip out non-image data URI's
html = '<img src="data:application/pdf;base64,blergh">'
self.assertHTMLEqual(sanitizer.sanitize_html(html), "<img/>")
+
class TestDataUriToS3(TestCase):
    """Check that data_uris_to_s3 swaps an image data URI for an http(s) URL."""

    # A 1x1 transparent GIF: a small payload that still matches the
    # replacer's VALID_URI pattern.  An empty src would never be rewritten.
    DATA_URI = ("data:image/gif;base64,"
                "R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7")

    def test_data_uri(self):
        # NOTE(review): data_uris_to_s3 writes through default_storage's
        # bucket, so this presumably needs working (or stubbed) S3 storage
        # in the test settings -- confirm.
        html = '<img src="{}">'.format(self.DATA_URI)
        s3ified = sanitizer.data_uris_to_s3(html)
        soup = BeautifulSoup(s3ified)
        regex = r'^https?://.*$'
        self.assertTrue(bool(re.match(regex, soup.img['src'])),
                        "{} does not match {}".format(s3ified, regex))