From 540600e7f7c6fd505756f01a8c50e91e8c181e0b Mon Sep 17 00:00:00 2001 From: Charlie DeTar Date: Sat, 31 Jan 2015 18:38:30 -0700 Subject: [PATCH] Translate data-uri's to files stored on Amazon s3 --- karmaworld/apps/notes/forms.py | 1 + karmaworld/apps/notes/gdrive.py | 1 + karmaworld/apps/notes/sanitizer.py | 89 ++++++++++++++----- karmaworld/apps/notes/tests.py | 19 +++- karmaworld/apps/notes/views.py | 2 +- .../apps/wysihtml5/static/wysihtml5/init.js | 8 +- .../wysihtml5/templates/wysihtml5/widget.html | 9 +- 7 files changed, 99 insertions(+), 30 deletions(-) diff --git a/karmaworld/apps/notes/forms.py b/karmaworld/apps/notes/forms.py index 8bd47bd..4bf660b 100644 --- a/karmaworld/apps/notes/forms.py +++ b/karmaworld/apps/notes/forms.py @@ -15,6 +15,7 @@ class NoteForm(ModelForm): def save(self, *args, **kwargs): # TODO: use transaction.atomic for this when we switch to Django 1.6+ + print self.cleaned_data instance = super(NoteForm, self).save(*args, **kwargs) instance.tags.set(*self.cleaned_data['tags']) if instance.is_hidden: diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py index c6d6084..e48b7b3 100644 --- a/karmaworld/apps/notes/gdrive.py +++ b/karmaworld/apps/notes/gdrive.py @@ -223,6 +223,7 @@ def convert_raw_document(raw_document, user=None): html = content_dict['html'] if html: + html = sanitizer.data_uris_to_s3(html) NoteMarkdown.objects.create(note=note, html=html) # If we know the user who uploaded this, diff --git a/karmaworld/apps/notes/sanitizer.py b/karmaworld/apps/notes/sanitizer.py index 895c622..162d256 100644 --- a/karmaworld/apps/notes/sanitizer.py +++ b/karmaworld/apps/notes/sanitizer.py @@ -2,10 +2,16 @@ import re import bleach import html5lib from html5lib.constants import tokenTypes +from html5lib.sanitizer import HTMLSanitizerMixin +from html5lib.tokenizer import HTMLTokenizer from bleach.sanitizer import BleachSanitizer from bleach import _render import bleach_whitelist from bs4 import BeautifulSoup +from PIL import Image +from cStringIO import StringIO +import base64 +import uuid def _canonical_link_predicate(tag): return tag.name == u'link' and \ @@ -30,7 +36,7 @@ class Sanitizer(BleachSanitizer): But we want to strip both the tag and contents for certain tags like script and style. This subclass does that. - Also support data URI's. + Also support data URI's for some mimetypes (image/png, image/gif, image/jpeg) """ allowed_elements = bleach_whitelist.markdown_tags allowed_attributes = bleach_whitelist.markdown_attrs @@ -43,24 +49,8 @@ class Sanitizer(BleachSanitizer): super(Sanitizer, self).__init__(*args, **kwargs) def sanitize_token(self, token): - extra_data = [] - # Allow data URIs of some types for images. Store them in 'extra_data' - # so we can appendthem to the result. - if token.get('name') == "img": - for (name, val) in token['data']: - if name == u"src": - if re.match("^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$", val): - extra_data.append((name, val)) - break # Do sanitization on the parent. result = super(Sanitizer, self).sanitize_token(token) - # Append extra data potentially including data URI's. - if extra_data: - if result['data']: - result['data'] += extra_data - else: - result['data'] = extra_data - print result # Suppress elements like script and style entirely. if token.get('name') and token['name'] in self.suppressed_elements: @@ -73,6 +63,59 @@ class Sanitizer(BleachSanitizer): else: return result +class DataUriReplacer(HTMLTokenizer, HTMLSanitizerMixin): + """ + Convert any valid image data URI's to files, and upload them to s3. Replace + the data URI with a link to the file in s3. + """ + VALID_URI = "^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$" + + def sanitize_token(self, token): + if token.get('name') == u"img": + attrs = dict([(name, val) for name, val in token['data'][::-1]]) + if 'src' in attrs: + src = attrs['src'] + if re.match(self.VALID_URI, src): + url = self._upload_image(src) + attrs['src'] = url + token['data'] = [(k,v) for k,v in attrs.iteritems()] + return token + + def _upload_image(self, data_uri): + from django.core.files.storage import default_storage + from karmaworld.apps.notes.models import all_read_xml_acl + from django.conf import settings + + mimetype, data = data_uri.split(";base64,") + sio = StringIO() + sio.write(base64.b64decode(data)) + sio.seek(0) + try: + image = Image.open(sio) + except IOError: + raise ValueError("Bad image data URI") + + fmt = mimetype.split("/")[1] + + image_data = StringIO() + image.save(image_data, format=fmt) + + filepath = "images/{}.{}".format(uuid.uuid4(), fmt) + new_key = default_storage.bucket.new_key(filepath) + new_key.set_contents_from_string(image_data.getvalue(), {"Content-Type": mimetype}) + new_key.set_xml_acl(all_read_xml_acl) + parts = [settings.S3_URL, filepath] + if parts[0].startswith("//"): + # Fully resolve the URL as https for happiness in all things. + parts.insert(0, "https:") + return "".join(parts) + + def __iter__(self): + for token in HTMLTokenizer.__iter__(self): + token = self.sanitize_token(token) + if token: + yield token + def sanitize_html(raw_html): """ Sanitize the given raw_html. @@ -81,16 +124,16 @@ def sanitize_html(raw_html): parser = html5lib.HTMLParser(tokenizer=Sanitizer) clean = _render(parser.parseFragment(raw_html)) -# walker = html5lib.treewalkers.getTreeWalker('etree') -# stream = walker(parser.parseFragment(raw_html)) -# serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=False, omit_optional_tags=False) -# print unicode(serializer.render(stream)) - # Set anchor tags' targets clean = bleach.linkify(clean, callbacks=[ bleach.callbacks.nofollow, bleach.callbacks.target_blank - ]) + ], tokenizer=Sanitizer) + return clean + +def data_uris_to_s3(raw_html): + parser = html5lib.HTMLParser(tokenizer=DataUriReplacer) + clean = _render(parser.parseFragment(raw_html)) return clean def set_canonical_rel(raw_html, href): diff --git a/karmaworld/apps/notes/tests.py b/karmaworld/apps/notes/tests.py index 5695468..e452420 100644 --- a/karmaworld/apps/notes/tests.py +++ b/karmaworld/apps/notes/tests.py @@ -1,8 +1,10 @@ #!/usr/bin/env python # -*- coding:utf8 -*- # Copyright (C) 2012 FinalsClub Foundation +import re import datetime from django.test import TestCase +from bs4 import BeautifulSoup from karmaworld.apps.notes.search import SearchIndex from karmaworld.apps.notes.models import Note, NoteMarkdown @@ -131,11 +133,20 @@ class TestSanitizer(TestCase): self.assertHTMLEqual(canonicalized, """

Hey there!

""") def test_data_uri(self): - #html = '' - #self.assertHTMLEqual(sanitizer.sanitize_html(html), "nothing") - + # Strip out all data URIs. html = '' - self.assertHTMLEqual(sanitizer.sanitize_html(html), html) + self.assertHTMLEqual(sanitizer.sanitize_html(html), "") + # Strip out non-image data URI's html = '' self.assertHTMLEqual(sanitizer.sanitize_html(html), "") + +class TestDataUriToS3(TestCase): + def test_data_uri(self): + html = '' + s3ified = sanitizer.data_uris_to_s3(html) + soup = BeautifulSoup(s3ified) + print s3ified + regex = r'^https?://.*$' + self.assertTrue(bool(re.match(regex, soup.img['src'])), + "{} does not match {}".format(s3ified, regex)) diff --git a/karmaworld/apps/notes/views.py b/karmaworld/apps/notes/views.py index daf9d83..0e38024 100644 --- a/karmaworld/apps/notes/views.py +++ b/karmaworld/apps/notes/views.py @@ -110,7 +110,7 @@ class NoteDeleteView(FormView): self.note = Note.objects.get(id=form.cleaned_data['note']) u = self.request.user # Ensure that the requesting user has permission to delete. - if (u.is_authenticated() and u.id == note.user_id) or u.is_staff: + if (u.is_authenticated() and u.id == self.note.user_id) or u.is_staff: self.note.is_hidden = True self.note.save() messages.success(self.request, 'The note "{0}" was deleted successfully.'.format(self.note.name)) diff --git a/karmaworld/apps/wysihtml5/static/wysihtml5/init.js b/karmaworld/apps/wysihtml5/static/wysihtml5/init.js index 18b8a57..7679bdb 100644 --- a/karmaworld/apps/wysihtml5/static/wysihtml5/init.js +++ b/karmaworld/apps/wysihtml5/static/wysihtml5/init.js @@ -3,7 +3,13 @@ function initWysihtml5(element) { toolbar: element.id + "-toolbar", parserRules: wysihtml5ParserRules }); - editor.on("change", function() { element.value = editor.value; }); + editor.on("load", function() { + }); + editor.on("change", function() { + console.log("EDITOR", editor.value); + element.value = editor.value; + console.log("ELEMENT", element.value); + }); return editor; } diff --git a/karmaworld/apps/wysihtml5/templates/wysihtml5/widget.html b/karmaworld/apps/wysihtml5/templates/wysihtml5/widget.html index 7a162b0..5d810ef 100644 --- a/karmaworld/apps/wysihtml5/templates/wysihtml5/widget.html +++ b/karmaworld/apps/wysihtml5/templates/wysihtml5/widget.html @@ -8,9 +8,16 @@ {% toolbar_button "insertOrderedList" "list-ol" "Ordered list" %} {% toolbar_button "createLink" "link" "Link" %} {% toolbar_button "removeLink" "unlink" "Unlink" %} + {% toolbar_button "insertImage" "image" "Image" %}
Link: - OK Cancel + OK  + Cancel +
+
+ Image: + OK Cancel
-- 2.25.1