From be5e062ad0d708515269fdbd4208fead3150710a Mon Sep 17 00:00:00 2001 From: Bryan Date: Tue, 14 Jan 2014 03:27:57 -0500 Subject: [PATCH] initial attempt at #273 to replace HTML in database with static file HTML --- karmaworld/apps/notes/gdrive.py | 34 ++++++-- .../management/commands/process_note_html.py | 24 ------ karmaworld/apps/notes/models.py | 81 +++++++++++++------ karmaworld/secret/static_s3.py.example | 1 - karmaworld/settings/prod.py | 15 ++-- karmaworld/templates/notes/note_detail.html | 10 ++- karmaworld/templates/notes/note_raw.html | 5 -- karmaworld/urls.py | 2 - reqs/common.txt | 1 - 9 files changed, 94 insertions(+), 79 deletions(-) delete mode 100644 karmaworld/apps/notes/management/commands/process_note_html.py delete mode 100644 karmaworld/templates/notes/note_raw.html diff --git a/karmaworld/apps/notes/gdrive.py b/karmaworld/apps/notes/gdrive.py index 42bc11a..a355f76 100644 --- a/karmaworld/apps/notes/gdrive.py +++ b/karmaworld/apps/notes/gdrive.py @@ -6,6 +6,7 @@ import datetime from django.contrib.auth.models import User from django.contrib.sessions.backends.db import SessionStore from django.core.exceptions import ObjectDoesNotExist +from django.core.files.storage import default_storage import os import subprocess import tempfile @@ -14,6 +15,7 @@ import magic import re import json import time +from cStringIO import StringIO import httplib2 from apiclient.discovery import build @@ -217,16 +219,34 @@ def convert_raw_document(raw_document, user=None, session_key=None): # Cache the uploaded file's URL note.gdrive_url = file_dict['alternateLink'] + # Extract HTML from the appropriate place + html = '' if raw_document.mimetype == PDF_MIMETYPE: - note.html = pdf2html(original_content) - + html = pdf2html(original_content) elif raw_document.mimetype in PPT_MIMETYPES: - note.html = pdf2html(content_dict['pdf']) - + html = pdf2html(content_dict['pdf']) elif 'html' in content_dict and content_dict['html']: - note.html = content_dict['html'] - # 
before we save new html, sanitize a tags in note.html - note.sanitize_html(save=False) + html = content_dict['html'] + # cleanup the HTML + html = note.filter_html(html) + + # upload the HTML file to static host if it is not already there + filepath = note.get_relative_s3_path() + if not default_storage.exists(filepath): + # This is a pretty ugly hackified answer to some s3boto shortcomings + # and some decent default settings chosen by django-storages. + + # S3 upload wants a file-like object. + htmlflo = StringIO(html) + # Create the new key (key == filename in S3 bucket) + newkey = default_storage.bucket.new(filepath) + # Upload data! + newkey.send_file(htmlflo) + if not newkey.exists(): + raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey))) + else: + # Mark this note as available from the static host + note.static_html = True note.text = content_dict['text'] diff --git a/karmaworld/apps/notes/management/commands/process_note_html.py b/karmaworld/apps/notes/management/commands/process_note_html.py deleted file mode 100644 index 9f44198..0000000 --- a/karmaworld/apps/notes/management/commands/process_note_html.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf8 -*- -# Copyright (C) 2012 FinalsClub Foundation - -from lxml.html import fromstring, tostring - -from django.core.management.base import BaseCommand -from apps.notes.models import Note - -class Command(BaseCommand): - args = 'none' - help = "Process note.html and modify a tags to open in new window" - - def add_target(self, tag): - tag.attrib['target'] = '_blank' - - def handle(self, *args, **kwargs): - notes = Note.objects.filter(html__isnull=False) - - for note in notes: - succ, data = note.sanitize_html() - if succ: - print "Note %s contained %s s" % (note.id, data) - diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py index d284977..722db8e 100644 --- a/karmaworld/apps/notes/models.py +++ b/karmaworld/apps/notes/models.py @@ 
-25,7 +25,7 @@ from django.core.files.storage import FileSystemStorage from django.db import models from django.template import defaultfilters import django_filepicker -from lxml.html import fromstring, tostring +from bs4 import BeautifulSoup as BS from taggit.managers import TaggableManager from karmaworld.apps.courses.models import Course @@ -55,7 +55,7 @@ class Document(models.Model): course = models.ForeignKey(Course) tags = TaggableManager(blank=True) name = models.CharField(max_length=255, blank=True, null=True) - slug = models.SlugField(max_length=255, null=True) + slug = models.SlugField(max_length=255, unique=True) # license if different from default license = models.ForeignKey(License, blank=True, null=True) @@ -171,10 +171,12 @@ class Note(Document): upload_to="notes/%Y/%m/%d/", blank=True, null=True) - # Generated by Google Drive by saved locally - html = models.TextField(blank=True, null=True) + # Generated by Google Drive but saved locally text = models.TextField(blank=True, null=True) + static_html = models.BooleanField(default=False) + # html is deprecated. delete once data is all sorted. + html = models.TextField(blank=True, null=True) # Academic year of course year = models.IntegerField(blank=True, null=True,\ @@ -204,6 +206,15 @@ class Note(Document): # gdrive_url might also fit the bill? return (self.fp_file, self.upstream_link) + def get_relative_s3_path(self): + """ + returns s3 path relative to the appropriate bucket. + """ + # Note.slug will be unique and brought in from RawDocument or created + # upon save() inside RawDocument.convert_to_note(). It makes for a good + # filename and it's pretty well guaranteed to be there. 
+ return 'html/{0}.html'.format(self.slug) + def get_absolute_url(self): """ Resolve note url, use 'note' route and slug if slug otherwise use note.id @@ -215,29 +226,49 @@ class Note(Document): # return a url ending in id return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.id) - def sanitize_html(self, save=True): - """ if self contains html, find all tags and add target=_blank - takes self - returns True/False on succ/fail and error or count + def filter_html(self, html): """ + Apply all sanitizing filters to HTML. + Takes in HTML string and outputs HTML string. + """ + # Fun fact: This could be made into a static method. + if not html or not len(html): + # if there was no HTML, return an empty string + return '' + + # TODO adding from_encoding (if known) will speed up the process + # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#encodings + soup = BS(html) + # Iterate through filters, applying all to the soup object. + for soupfilter in ( + self.sanitize_anchor_html, + ): + soup = soupfilter(soup) + # Return BeautifulSoup cleaned up HTML in UTF-8 + # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#output-encoding + return soup.prettify() + + def sanitize_anchor_html(self, soup): + """ + Filter the given BeautifulSoup obj by adding target=_blank to all + anchor tags. + Returns BeautifulSoup obj. + """ + # Fun fact: This could be made into a static method. + # Find all a tags in the HTML + a_tags = soup.find_all('a') + if not a_tags or not len(a_tags): + # nothing to process. 
+ return soup + # build a tag sanitizer - def add_attribute_target(tag): - tag.attrib['target'] = '_blank' - - # if no html, return false - if not self.html: - return False, "Note has no html" - - _html = fromstring(self.html) - a_tags = _html.findall('.//a') # recursively find all a tags in document tree - # if there are a tags - if a_tags > 1: - #apply the add attribute function - map(add_attribute_target, a_tags) - self.html = tostring(_html) - if save: - self.save() - return True, len(a_tags) + def set_attribute_target(tag): + tag['target'] = '_blank' + # set all anchors to have target="_blank" + map(set_attribute_target, a_tags) + + # return filtered soup + return soup def _update_parent_updated_at(self): """ update the parent Course.updated_at model diff --git a/karmaworld/secret/static_s3.py.example b/karmaworld/secret/static_s3.py.example index cb7cb20..ae053e3 100644 --- a/karmaworld/secret/static_s3.py.example +++ b/karmaworld/secret/static_s3.py.example @@ -8,4 +8,3 @@ AWS_ACCESS_KEY_ID = 'access_id' AWS_SECRET_ACCESS_KEY = 'access_key' AWS_STORAGE_BUCKET_NAME = 'bucket' S3_URL = 'http://%s.s3.amazonaws.com/' % AWS_STORAGE_BUCKET_NAME -STATIC_URL = S3_URL diff --git a/karmaworld/settings/prod.py b/karmaworld/settings/prod.py index 1bd651f..c22f174 100644 --- a/karmaworld/settings/prod.py +++ b/karmaworld/settings/prod.py @@ -10,13 +10,7 @@ from S3 import CallingFormat from common import * - -from karmaworld.secret.static_s3 import DEFAULT_FILE_STORAGE -from karmaworld.secret.static_s3 import AWS_ACCESS_KEY_ID -from karmaworld.secret.static_s3 import AWS_SECRET_ACCESS_KEY -from karmaworld.secret.static_s3 import AWS_STORAGE_BUCKET_NAME -from karmaworld.secret.static_s3 import S3_URL -from karmaworld.secret.static_s3 import STATIC_URL +from karmaworld.secret.static_s3 import * from karmaworld.secret.db_settings import PROD_DB_NAME from karmaworld.secret.db_settings import PROD_DB_USERNAME @@ -131,12 +125,12 @@ INSTALLED_APPS += ( ) # See: 
http://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html#settings -STATICFILES_STORAGE = DEFAULT_FILE_STORAGE = 'storages.backends.s3boto.S3BotoStorage' +# DEFAULT_FILE_STORAGE comes from karmaworld.secret.static_s3 +STATICFILES_STORAGE = DEFAULT_FILE_STORAGE # See: http://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html#settings AWS_CALLING_FORMAT = CallingFormat.SUBDOMAIN - # AWS cache settings, don't change unless you know what you're doing: AWS_EXPIREY = 60 * 60 * 24 * 7 AWS_HEADERS = { @@ -145,7 +139,8 @@ AWS_HEADERS = { } # See: https://docs.djangoproject.com/en/dev/ref/settings/#static-url -STATIC_URL = 'https://s3.amazonaws.com/%s/' % AWS_STORAGE_BUCKET_NAME +# S3_URL comes from karmaworld.secret.static_s3 +STATIC_URL = S3_URL ########## END STORAGE CONFIGURATION diff --git a/karmaworld/templates/notes/note_detail.html b/karmaworld/templates/notes/note_detail.html index fc48a2d..5bcd9d7 100644 --- a/karmaworld/templates/notes/note_detail.html +++ b/karmaworld/templates/notes/note_detail.html @@ -115,17 +115,19 @@
- {% if note.html %} + {% if note.static_html %}
-
- {% else %} {# note.html #} + {% else %} {# note.static_html #}
This document's content is currently unavailable. Please try again later.
- {% endif %} {# note.html #} + {% endif %} {# note.static_html #} {% endif %} {# note.filetype == 'pdf' #} diff --git a/karmaworld/templates/notes/note_raw.html b/karmaworld/templates/notes/note_raw.html deleted file mode 100644 index c22220c..0000000 --- a/karmaworld/templates/notes/note_raw.html +++ /dev/null @@ -1,5 +0,0 @@ -{% if note.html %} - {{ note.html|safe }} -{% else %} -

Error: Note has no html

-{% endif %} diff --git a/karmaworld/urls.py b/karmaworld/urls.py index 1718cfc..9ecd1d9 100644 --- a/karmaworld/urls.py +++ b/karmaworld/urls.py @@ -66,8 +66,6 @@ urlpatterns = patterns('', url(r'^accounts/', include('allauth.urls')), url(r'^accounts/profile/', ProfileView.as_view(), name='accounts_profile'), - # VIEW for viewing a Note's gdrive generated html, used as iframe - url(r'^raw/(?P<pk>\d+)$', RawNoteDetailView.as_view(), name='note_raw'), #url(r'^pdfview$', PDFView.as_view(), name='pdf'), url(r'^pdfview/(?P<pk>\d+)$', PDFView.as_view(), name='pdf'), diff --git a/reqs/common.txt b/reqs/common.txt index 90fac64..0b707d3 100644 --- a/reqs/common.txt +++ b/reqs/common.txt @@ -8,7 +8,6 @@ oauth2client==1.0 urllib3==1.5 google-api-python-client==1.0 django-grappelli==2.4.3 -lxml==3.1.0 git+https://github.com/FinalsClub/django-taggit.git django-filepicker==0.1.5 filemagic==1.6 -- 2.25.1