static note download migration appears to work.
authorBryan <btbonval@gmail.com>
Fri, 20 Feb 2015 04:33:08 +0000 (23:33 -0500)
committerBryan <btbonval@gmail.com>
Fri, 27 Feb 2015 01:08:13 +0000 (20:08 -0500)
karmaworld/apps/notes/migrations/0021_get_html_from_s3.py

index a173e478c1d971769291c52f8432cab9a3dcb69f..dee227e806a90604fa98de838e419edbac5eb6c8 100644 (file)
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from karmaworld.apps.notes import sanitizer
 from south.utils import datetime_utils as datetime
 from south.db import db
 from south.v2 import DataMigration
@@ -14,10 +15,22 @@ class Migration(DataMigration):
         # Use orm.ModelName to refer to models in this application,
         # and orm['appname.ModelName'] for models in other applications.
 
-        # keep score
+        # keep score. save as lists for debugging purposes if needed.
         good = []
+        edit = []
+        nonedit = []
         bad = []
 
+        # at the time of migration, editable categories are limited to
+        EDITABLE_CATEGORIES = ('LECTURE_NOTES',)
+
+        # at the time of migration, translated PDFs were based on mimetypes
+        PDF_MIMETYPES = (
+          'application/pdf',
+          'application/vnd.ms-powerpoint',
+          'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+        )
+
         # find each Note without an html field, download its S3 html, and
         # store it in the local database.
         for note in orm['notes.Note'].objects.filter(notemarkdown__html__isnull=True):
@@ -29,12 +42,26 @@ class Migration(DataMigration):
             if key:
                 html = key.read()
 
+            # check the downloaded html
             if not html:
                 bad.append(note.slug)
                 continue
             else:
                 good.append(note.slug)
 
+            # clean the html in a consistent way with note uploads as of the
+            # time of this migration.
+            # handle embedded images from pdf2htmlEX or other sources
+            html = sanitizer.data_uris_to_s3(html)
+            if note.category in EDITABLE_CATEGORIES:
+                # make HTML editable
+                html = sanitizer.sanitize_html_to_editable(html)
+                edit.append(note)
+            else:
+                # clean up HTML without concern for editing
+                html = sanitizer.sanitize_html_preserve_formatting(html)
+                nonedit.append(note)
+
             # store the html in the corresponding NoteMarkdown object
             nmd = orm['notes.NoteMarkdown'].objects.get_or_create(note=note)[0]
             nmd.html = html
@@ -43,6 +70,8 @@ class Migration(DataMigration):
         # Display the score
         print "Migrated {0} notes and failed to migrate {1} notes.".format(
           len(good), len(bad))
+        print "Of good notes, {0} are editable and {1} are not.".format(
+          len(edit), len(nonedit))
 
         print "Failed list:"
         for slug in bad: