From 344bf28e32310026694cc98b0d757619c93bb338 Mon Sep 17 00:00:00 2001
From: Bryan <btbonval@gmail.com>
Date: Tue, 14 Jan 2014 03:27:57 -0500
Subject: [PATCH] initial attempt at #273 to replace HTML in database with
 static file HTML

---
 karmaworld/apps/notes/gdrive.py               | 34 ++++++--
 .../management/commands/process_note_html.py  | 24 ------
 karmaworld/apps/notes/models.py               | 81 +++++++++++++------
 karmaworld/secret/static_s3.py.example        |  1 -
 karmaworld/settings/prod.py                   | 15 ++--
 karmaworld/templates/notes/note_detail.html   | 10 ++-
 karmaworld/templates/notes/note_raw.html      |  5 --
 karmaworld/urls.py                            |  2 -
 reqs/common.txt                               |  1 -
 9 files changed, 94 insertions(+), 79 deletions(-)
 delete mode 100644 karmaworld/apps/notes/management/commands/process_note_html.py
 delete mode 100644 karmaworld/templates/notes/note_raw.html

diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py
index 42bc11a..a355f76 100644
--- a/karmaworld/apps/notes/gdrive.py
+++ b/karmaworld/apps/notes/gdrive.py
@@ -6,6 +6,7 @@ import datetime
 from django.contrib.auth.models import User
 from django.contrib.sessions.backends.db import SessionStore
 from django.core.exceptions import ObjectDoesNotExist
+from django.core.files.storage import default_storage
 import os
 import subprocess
 import tempfile
@@ -14,6 +15,7 @@ import magic
 import re
 import json
 import time
+from cStringIO import StringIO
 
 import httplib2
 from apiclient.discovery import build
@@ -217,16 +219,34 @@ def convert_raw_document(raw_document, user=None, session_key=None):
     # Cache the uploaded file's URL
     note.gdrive_url = file_dict['alternateLink']
 
+    # Extract HTML from the appropriate place
+    html = ''
     if raw_document.mimetype == PDF_MIMETYPE:
-        note.html = pdf2html(original_content)
-
+        html = pdf2html(original_content)
     elif raw_document.mimetype in PPT_MIMETYPES:
-        note.html = pdf2html(content_dict['pdf'])
-
+        html = pdf2html(content_dict['pdf'])
     elif 'html' in content_dict and content_dict['html']:
-        note.html = content_dict['html']
-        # before we save new html, sanitize a tags in note.html
-        note.sanitize_html(save=False)
+        html = content_dict['html']
+    # clean up the HTML
+    html = note.filter_html(html)
+
+    # upload the HTML file to static host if it is not already there
+    filepath = note.get_relative_s3_path()
+    if not default_storage.exists(filepath):
+        # This is a pretty ugly hackified answer to some s3boto shortcomings
+        # and some decent default settings chosen by django-storages.
+
+        # S3 upload wants a file-like object.
+        htmlflo = StringIO(html)
+        # Create the new key (key == filename in S3 bucket)
+        newkey = default_storage.bucket.new(filepath)
+        # Upload data!
+        newkey.send_file(htmlflo)
+        if not newkey.exists():
+            raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))
+        else:
+            # Mark this note as available from the static host
+            note.static_html = True
 
     note.text = content_dict['text']
 
diff --git a/karmaworld/apps/notes/management/commands/process_note_html.py b/karmaworld/apps/notes/management/commands/process_note_html.py
deleted file mode 100644
index 9f44198..0000000
--- a/karmaworld/apps/notes/management/commands/process_note_html.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:utf8 -*-
-# Copyright (C) 2012  FinalsClub Foundation
-
-from lxml.html import fromstring, tostring
-
-from django.core.management.base import BaseCommand
-from apps.notes.models import Note
-
-class Command(BaseCommand):
-    args = 'none'
-    help = "Process note.html and modify a tags to open in new window"
-
-    def add_target(self, tag):
-        tag.attrib['target'] = '_blank'
-
-    def handle(self, *args, **kwargs):
-        notes = Note.objects.filter(html__isnull=False)
-
-        for note in notes:
-            succ, data = note.sanitize_html()
-            if succ:
-                print "Note %s contained %s <a>s" % (note.id, data)
-
diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py
index 9cc402f..778c84e 100644
--- a/karmaworld/apps/notes/models.py
+++ b/karmaworld/apps/notes/models.py
@@ -25,7 +25,7 @@ from django.core.files.storage import FileSystemStorage
 from django.db import models
 from django.utils.text import slugify
 import django_filepicker
-from lxml.html import fromstring, tostring
+from bs4 import BeautifulSoup as BS
 from taggit.managers import TaggableManager
 
 from karmaworld.apps.courses.models import Course
@@ -55,7 +55,7 @@ class Document(models.Model):
     course          = models.ForeignKey(Course)
     tags            = TaggableManager(blank=True)
     name            = models.CharField(max_length=255, blank=True, null=True)
-    slug            = models.SlugField(max_length=255, null=True)
+    slug            = models.SlugField(max_length=255, unique=True)
 
     # license if different from default
     license         = models.ForeignKey(License, blank=True, null=True)
@@ -171,10 +171,12 @@ class Note(Document):
                             upload_to="notes/%Y/%m/%d/",
                             blank=True, null=True)
 
-    # Generated by Google Drive by saved locally
-    html            = models.TextField(blank=True, null=True)
+    # Generated by Google Drive but saved locally
     text            = models.TextField(blank=True, null=True)
+    static_html     = models.BooleanField(default=False)
 
+    # html is deprecated. delete once data is all sorted.
+    html            = models.TextField(blank=True, null=True)
 
     # Academic year of course
     year            = models.IntegerField(blank=True, null=True,\
@@ -204,6 +206,15 @@ class Note(Document):
         # gdrive_url might also fit the bill?
         return (self.fp_file, self.upstream_link)
 
+    def get_relative_s3_path(self):
+        """
+        returns s3 path relative to the appropriate bucket.
+        """
+        # Note.slug will be unique and brought in from RawDocument or created
+        # upon save() inside RawDocument.convert_to_note(). It makes for a good
+        # filename and it's pretty well guaranteed to be there.
+        return 'html/{0}.html'.format(self.slug)
+
     def get_absolute_url(self):
         """ Resolve note url, use 'note' route and slug if slug
             otherwise use note.id
@@ -215,29 +226,49 @@ class Note(Document):
             # return a url ending in id
             return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.id)
 
-    def sanitize_html(self, save=True):
-        """ if self contains html, find all <a> tags and add target=_blank
-            takes self
-            returns True/False on succ/fail and error or count
+    def filter_html(self, html):
         """
+        Apply all sanitizing filters to HTML.
+        Takes in HTML string and outputs HTML string.
+        """
+        # Fun fact: This could be made into a static method.
+        if not html or not len(html):
+            # if there was no HTML, return an empty string
+            return ''
+
+        # TODO adding from_encoding (if known) will speed up the process
+        # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#encodings
+        soup = BS(html)
+        # Iterate through filters, applying all to the soup object.
+        for soupfilter in (
+          self.sanitize_anchor_html,
+        ):
+            soup = soupfilter(soup)
+        # Return the cleaned-up HTML; prettify() without an encoding argument returns Unicode, not UTF-8 bytes
+        # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#output-encoding
+        return soup.prettify()
+
+    def sanitize_anchor_html(self, soup):
+        """
+        Filter the given BeautifulSoup obj by adding target=_blank to all
+        anchor tags.
+        Returns BeautifulSoup obj.
+        """
+        # Fun fact: This could be made into a static method.
+        # Find all a tags in the HTML
+        a_tags = soup.find_all('a')
+        if not a_tags or not len(a_tags):
+            # nothing to process.
+            return soup
+
         # build a tag sanitizer
-        def add_attribute_target(tag):
-            tag.attrib['target'] = '_blank'
-
-        # if no html, return false
-        if not self.html:
-            return False, "Note has no html"
-
-        _html = fromstring(self.html)
-        a_tags = _html.findall('.//a') # recursively find all a tags in document tree
-        # if there are a tags
-        if a_tags > 1:
-            #apply the add attribute function
-            map(add_attribute_target, a_tags)
-            self.html = tostring(_html)
-            if save:
-                self.save()
-            return True, len(a_tags)
+        def set_attribute_target(tag):
+            tag['target'] = '_blank'
+        # set all anchors to have target="_blank"
+        map(set_attribute_target, a_tags)
+
+        # return filtered soup
+        return soup
 
     def _update_parent_updated_at(self):
         """ update the parent Course.updated_at model
diff --git a/karmaworld/secret/static_s3.py.example b/karmaworld/secret/static_s3.py.example
index cb7cb20..ae053e3 100644
--- a/karmaworld/secret/static_s3.py.example
+++ b/karmaworld/secret/static_s3.py.example
@@ -8,4 +8,3 @@ AWS_ACCESS_KEY_ID = 'access_id'
 AWS_SECRET_ACCESS_KEY = 'access_key'
 AWS_STORAGE_BUCKET_NAME = 'bucket'
 S3_URL = 'http://%s.s3.amazonaws.com/' % AWS_STORAGE_BUCKET_NAME
-STATIC_URL = S3_URL
diff --git a/karmaworld/settings/prod.py b/karmaworld/settings/prod.py
index 1bd651f..c22f174 100644
--- a/karmaworld/settings/prod.py
+++ b/karmaworld/settings/prod.py
@@ -10,13 +10,7 @@ from S3 import CallingFormat
 
 from common import *
 
-
-from karmaworld.secret.static_s3 import DEFAULT_FILE_STORAGE
-from karmaworld.secret.static_s3 import AWS_ACCESS_KEY_ID
-from karmaworld.secret.static_s3 import AWS_SECRET_ACCESS_KEY
-from karmaworld.secret.static_s3 import AWS_STORAGE_BUCKET_NAME
-from karmaworld.secret.static_s3 import S3_URL
-from karmaworld.secret.static_s3 import STATIC_URL
+from karmaworld.secret.static_s3 import *
 
 from karmaworld.secret.db_settings import PROD_DB_NAME
 from karmaworld.secret.db_settings import PROD_DB_USERNAME
@@ -131,12 +125,12 @@ INSTALLED_APPS += (
 )
 
 # See: http://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html#settings
-STATICFILES_STORAGE = DEFAULT_FILE_STORAGE = 'storages.backends.s3boto.S3BotoStorage'
+# DEFAULT_FILE_STORAGE comes from karmaworld.secret.static_s3
+STATICFILES_STORAGE = DEFAULT_FILE_STORAGE
 
 # See: http://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html#settings
 AWS_CALLING_FORMAT = CallingFormat.SUBDOMAIN
 
-
 # AWS cache settings, don't change unless you know what you're doing:
 AWS_EXPIREY = 60 * 60 * 24 * 7
 AWS_HEADERS = {
@@ -145,7 +139,8 @@ AWS_HEADERS = {
 }
 
 # See: https://docs.djangoproject.com/en/dev/ref/settings/#static-url
-STATIC_URL = 'https://s3.amazonaws.com/%s/' % AWS_STORAGE_BUCKET_NAME
+# S3_URL comes from karmaworld.secret.static_s3
+STATIC_URL = S3_URL
 ########## END STORAGE CONFIGURATION
 
 
diff --git a/karmaworld/templates/notes/note_detail.html b/karmaworld/templates/notes/note_detail.html
index fc48a2d..5bcd9d7 100644
--- a/karmaworld/templates/notes/note_detail.html
+++ b/karmaworld/templates/notes/note_detail.html
@@ -115,17 +115,19 @@
 
       <div class="row">
         <div class="small-12 small-centered columns medium-12 large-12 body_copy">
-          {% if note.html %}
+          {% if note.static_html %}
             <div class="note-text">
-              <iframe style="border:none; width:100%; min-height: 1000px;" id="noteframe" src="/raw/{{ note.id }}"
+              <!-- Give crawlers a link to the iframe contents-->
+              <a href="{{ STATIC_URL }}{{ note.get_relative_s3_path }}" target="_blank">Open note in a new window or tab</a>.
+              <iframe style="border:none; width:100%; min-height: 1000px;" id="noteframe" src="{{ STATIC_URL }}{{ note.get_relative_s3_path }}"
                       onload="autoResize('noteframe'); {% if pdf_controls %} setupPdfViewer(); {% endif %}"> </iframe>
             </div> <!-- .note-text -->
 
-          {% else %} {# note.html #}
+          {% else %} {# note.static_html #}
             <div class="note-error">
               This document's content is currently unavailable. Please try again later.
             </div>
-          {% endif %} {# note.html #}
+          {% endif %} {# note.static_html #}
 
       {% endif %} {# note.filetype == 'pdf' #}
 
diff --git a/karmaworld/templates/notes/note_raw.html b/karmaworld/templates/notes/note_raw.html
deleted file mode 100644
index c22220c..0000000
--- a/karmaworld/templates/notes/note_raw.html
+++ /dev/null
@@ -1,5 +0,0 @@
-{% if note.html %}
-  {{ note.html|safe }}
-{% else %}
-  <h2>Error: Note has no html</h2>
-{% endif %}
diff --git a/karmaworld/urls.py b/karmaworld/urls.py
index b859767..f47fd3a 100644
--- a/karmaworld/urls.py
+++ b/karmaworld/urls.py
@@ -66,8 +66,6 @@ urlpatterns = patterns('',
     url(r'^accounts/', include('allauth.urls')),
     url(r'^accounts/profile/', ProfileView.as_view(), name='accounts_profile'),
 
-    # VIEW for viewing a Note's gdrive generated html, used as iframe
-    url(r'^raw/(?P<pk>\d+)$', RawNoteDetailView.as_view(), name='note_raw'),
     #url(r'^pdfview$', PDFView.as_view(), name='pdf'),
     url(r'^pdfview/(?P<pk>\d+)$', PDFView.as_view(), name='pdf'),
 
diff --git a/reqs/common.txt b/reqs/common.txt
index cd8c876..fff9644 100644
--- a/reqs/common.txt
+++ b/reqs/common.txt
@@ -8,7 +8,6 @@ oauth2client==1.0
 urllib3==1.5
 google-api-python-client==1.0
 django-grappelli==2.4.8
-lxml==3.1.0
 git+https://github.com/FinalsClub/django-taggit.git
 django-filepicker==0.1.5
 filemagic==1.6
-- 
2.25.1