Translate data-uri's to files stored on Amazon s3
author Charlie DeTar <cfd@media.mit.edu>
Sun, 1 Feb 2015 01:38:30 +0000 (18:38 -0700)
committer Bryan <btbonval@gmail.com>
Fri, 27 Feb 2015 01:08:12 +0000 (20:08 -0500)
karmaworld/apps/notes/forms.py
karmaworld/apps/notes/gdrive.py
karmaworld/apps/notes/sanitizer.py
karmaworld/apps/notes/tests.py
karmaworld/apps/notes/views.py
karmaworld/apps/wysihtml5/static/wysihtml5/init.js
karmaworld/apps/wysihtml5/templates/wysihtml5/widget.html

index 8bd47bd241e4915ff8c0f93860d27f63cd67b6aa..4bf660b529ec202be97d7fc099ce72df4d530e48 100644 (file)
@@ -15,6 +15,7 @@ class NoteForm(ModelForm):
 
     def save(self, *args, **kwargs):
         # TODO: use transaction.atomic for this when we switch to Django 1.6+
+        print self.cleaned_data
         instance = super(NoteForm, self).save(*args, **kwargs)
         instance.tags.set(*self.cleaned_data['tags'])
         if instance.is_hidden:
index c6d608459ebc20b04f440e8382ca83f723573ca0..e48b7b36cff0a8c28e7ce8310cdde952cf1d27dd 100644 (file)
@@ -223,6 +223,7 @@ def convert_raw_document(raw_document, user=None):
         html = content_dict['html']
 
     if html:
+        html = sanitizer.data_uris_to_s3(html)
         NoteMarkdown.objects.create(note=note, html=html)
 
     # If we know the user who uploaded this,
index 895c622a1f46d737d403168d527399eb791f3128..162d2569d40ad699d2305f3b64d60b258529e564 100644 (file)
@@ -2,10 +2,16 @@ import re
 import bleach
 import html5lib
 from html5lib.constants import tokenTypes
+from html5lib.sanitizer import HTMLSanitizerMixin
+from html5lib.tokenizer import HTMLTokenizer
 from bleach.sanitizer import BleachSanitizer
 from bleach import _render
 import bleach_whitelist
 from bs4 import BeautifulSoup
+from PIL import Image
+from cStringIO import StringIO
+import base64
+import uuid
 
 def _canonical_link_predicate(tag):
     return tag.name == u'link' and \
@@ -30,7 +36,7 @@ class Sanitizer(BleachSanitizer):
     But we want to strip both the tag and contents for certain tags like script
     and style.  This subclass does that.
 
-    Also support data URI's.
+    Also support data URI's for some mimetypes (image/png, image/gif, image/jpeg)
     """
     allowed_elements = bleach_whitelist.markdown_tags
     allowed_attributes = bleach_whitelist.markdown_attrs
@@ -43,24 +49,8 @@ class Sanitizer(BleachSanitizer):
         super(Sanitizer, self).__init__(*args, **kwargs)
 
     def sanitize_token(self, token):
-        extra_data = []
-        # Allow data URIs of some types for images. Store them in 'extra_data'
-        # so we can appendthem to the result.
-        if token.get('name') == "img":
-            for (name, val) in token['data']:
-                if name == u"src":
-                    if re.match("^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$", val):
-                        extra_data.append((name, val))
-                    break
         # Do sanitization on the parent.
         result = super(Sanitizer, self).sanitize_token(token)
-        # Append extra data potentially including data URI's.
-        if extra_data:
-            if result['data']:
-                result['data'] += extra_data
-            else:
-                result['data'] = extra_data
-        print result
 
         # Suppress elements like script and style entirely.
         if token.get('name') and token['name'] in self.suppressed_elements:
@@ -73,6 +63,59 @@ class Sanitizer(BleachSanitizer):
         else:
             return result
 
+class DataUriReplacer(HTMLTokenizer, HTMLSanitizerMixin):
+    """
+    Convert any valid image data URI's to files, and upload them to s3. Replace
+    the data URI with a link to the file in s3.
+    """
+    VALID_URI = "^data:image/(png|gif|jpeg);base64,[A-Za-z0-9+/=]+$"
+
+    def sanitize_token(self, token):
+        if token.get('name') == u"img":
+            attrs = dict([(name, val) for name, val in token['data'][::-1]])
+            if 'src' in attrs:
+                src = attrs['src']
+                if re.match(self.VALID_URI, src):
+                    url = self._upload_image(src)
+                    attrs['src'] = url
+                    token['data'] = [(k,v) for k,v in attrs.iteritems()]
+        return token
+
+    def _upload_image(self, data_uri):
+        from django.core.files.storage import default_storage
+        from karmaworld.apps.notes.models import all_read_xml_acl
+        from django.conf import settings
+
+        mimetype, data = data_uri.split(";base64,")
+        sio = StringIO()
+        sio.write(base64.b64decode(data))
+        sio.seek(0)
+        try:
+            image = Image.open(sio)
+        except IOError:
+            raise ValueError("Bad image data URI")
+
+        fmt = mimetype.split("/")[1]
+
+        image_data = StringIO()
+        image.save(image_data, format=fmt)
+
+        filepath = "images/{}.{}".format(uuid.uuid4(), fmt)
+        new_key = default_storage.bucket.new_key(filepath)
+        new_key.set_contents_from_string(image_data.getvalue(), {"Content-Type": mimetype})
+        new_key.set_xml_acl(all_read_xml_acl)
+        parts = [settings.S3_URL, filepath]
+        if parts[0].startswith("//"):
+            # Fully resolve the URL as https for happiness in all things.
+            parts.insert(0, "https:")
+        return "".join(parts)
+
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token:
+                yield token
+
 def sanitize_html(raw_html):
     """
     Sanitize the given raw_html.
@@ -81,16 +124,16 @@ def sanitize_html(raw_html):
     parser = html5lib.HTMLParser(tokenizer=Sanitizer)
     clean = _render(parser.parseFragment(raw_html))
 
-#    walker = html5lib.treewalkers.getTreeWalker('etree')
-#    stream = walker(parser.parseFragment(raw_html))
-#    serializer = html5lib.serializer.HTMLSerializer(quote_attr_values=False, omit_optional_tags=False)
-#    print unicode(serializer.render(stream))
-
     # Set anchor tags' targets
     clean = bleach.linkify(clean, callbacks=[
         bleach.callbacks.nofollow,
         bleach.callbacks.target_blank
-    ])
+    ], tokenizer=Sanitizer)
+    return clean
+
+def data_uris_to_s3(raw_html):
+    parser = html5lib.HTMLParser(tokenizer=DataUriReplacer)
+    clean = _render(parser.parseFragment(raw_html))
     return clean
 
 def set_canonical_rel(raw_html, href):
index 5695468f91291761a8622c1f7b7c3b3f242af119..e4524209141bd711f600610f61300f15fc032cc4 100644 (file)
@@ -1,8 +1,10 @@
 #!/usr/bin/env python
 # -*- coding:utf8 -*-
 # Copyright (C) 2012  FinalsClub Foundation
+import re
 import datetime
 from django.test import TestCase
+from bs4 import BeautifulSoup
 from karmaworld.apps.notes.search import SearchIndex
 
 from karmaworld.apps.notes.models import Note, NoteMarkdown
@@ -131,11 +133,20 @@ class TestSanitizer(TestCase):
         self.assertHTMLEqual(canonicalized, """<html><head><link rel='canonical' href='http://example.com'></head><body><h1>Hey there!</h1></body></html>""")
 
     def test_data_uri(self):
-        #html = '<img src="/this.gif">'
-        #self.assertHTMLEqual(sanitizer.sanitize_html(html), "nothing")
-
+        # Strip out all data URIs.
         html = '<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==">'
-        self.assertHTMLEqual(sanitizer.sanitize_html(html), html)
+        self.assertHTMLEqual(sanitizer.sanitize_html(html), "<img/>")
 
+        # Strip out non-image data URI's
         html = '<img src="data:application/pdf;base64,blergh">'
         self.assertHTMLEqual(sanitizer.sanitize_html(html), "<img/>")
+
+class TestDataUriToS3(TestCase):
+    def test_data_uri(self):
+        html = '<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==">'
+        s3ified = sanitizer.data_uris_to_s3(html)
+        soup = BeautifulSoup(s3ified)
+        print s3ified
+        regex = r'^https?://.*$'
+        self.assertTrue(bool(re.match(regex, soup.img['src'])),
+                "{} does not match {}".format(s3ified, regex))
index daf9d8389069e0d01c86c94040cc199166fce54c..0e380248825c143dad028a0869ff40cce81fc779 100644 (file)
@@ -110,7 +110,7 @@ class NoteDeleteView(FormView):
         self.note = Note.objects.get(id=form.cleaned_data['note'])
         u = self.request.user
         # Ensure that the requesting user has permission to delete.
-        if (u.is_authenticated() and u.id == note.user_id) or u.is_staff:
+        if (u.is_authenticated() and u.id == self.note.user_id) or u.is_staff:
             self.note.is_hidden = True
             self.note.save()
             messages.success(self.request, 'The note "{0}" was deleted successfully.'.format(self.note.name))
index 18b8a57b65abafc1897d199cab93f0a6f8368c04..7679bdb54877b3c0d8a92f4300c2aa499d5a9c1a 100644 (file)
@@ -3,7 +3,13 @@ function initWysihtml5(element) {
     toolbar: element.id + "-toolbar",
     parserRules: wysihtml5ParserRules
   });
-  editor.on("change", function() { element.value = editor.value; });
+  editor.on("load", function() {
+  });
+  editor.on("change", function() {
+    console.log("EDITOR", editor.value);
+    element.value = editor.value;
+    console.log("ELEMENT", element.value);
+  });
   return editor;
 }
 
index 7a162b00e6f3bef90414c46d788658740ababa8e..5d810ef1a71a2dcabfd63b9967a72d9e415efc6c 100644 (file)
@@ -8,9 +8,16 @@
   {% toolbar_button "insertOrderedList" "list-ol" "Ordered list" %}
   {% toolbar_button "createLink" "link" "Link" %}
   {% toolbar_button "removeLink" "unlink" "Unlink" %}
+  {% toolbar_button "insertImage" "image" "Image" %}
   <div data-wysihtml5-dialog="createLink" style="display: none;">
     Link: <input data-wysihtml5-dialog-field="href" value="http://">
-    <a data-wysihtml5-dialog-action="save" class='button'>OK</a>&nbsp;<a data-wysihtml5-dialog-action="cancel" class='button secondary'>Cancel</a>
+    <a data-wysihtml5-dialog-action="save" class='button'>OK</a>&nbsp;
+    <a data-wysihtml5-dialog-action="cancel" class='button secondary'>Cancel</a>
+  </div>
+  <div data-wysihtml5-dialog="insertImage" style="display: none;">
+    Image: <input data-wysihtml5-dialog-field="src" value="http://">
+    <a data-wysihtml5-dialog-action="save" class='button'>OK</a>&nbsp;
+    <a data-wysihtml5-dialog-action="cancel" class='button secondary'>Cancel</a>
   </div>
 </div>
 <textarea id="{{ attrs.id }}" name="{{ name }}" role='wysihtml5-rich-text' rows=10 cols=40>{{ value|safe }}</textarea>