from django.contrib.auth.models import User
from django.contrib.sessions.backends.db import SessionStore
from django.core.exceptions import ObjectDoesNotExist
+from django.core.files.storage import default_storage
import os
import subprocess
import tempfile
import re
import json
import time
+from cStringIO import StringIO
import httplib2
from apiclient.discovery import build
# Cache the uploaded file's URL
note.gdrive_url = file_dict['alternateLink']
+ # Extract HTML from the appropriate place
+ html = ''
if raw_document.mimetype == PDF_MIMETYPE:
- note.html = pdf2html(original_content)
-
+ html = pdf2html(original_content)
elif raw_document.mimetype in PPT_MIMETYPES:
- note.html = pdf2html(content_dict['pdf'])
-
+ html = pdf2html(content_dict['pdf'])
elif 'html' in content_dict and content_dict['html']:
- note.html = content_dict['html']
- # before we save new html, sanitize a tags in note.html
- note.sanitize_html(save=False)
+ html = content_dict['html']
+ # cleanup the HTML
+ html = note.filter_html(html)
+
# Upload the HTML file to the static host if it is not already there.
filepath = note.get_relative_s3_path()
if not default_storage.exists(filepath):
    # This is a pretty ugly hackified answer to some s3boto shortcomings
    # and some decent default settings chosen by django-storages: the
    # boto bucket is used directly instead of default_storage.save().

    # cStringIO cannot hold unicode containing non-ASCII characters, so
    # hand it UTF-8 encoded bytes instead.
    if isinstance(html, unicode):
        html = html.encode('utf-8')
    # S3 upload wants a file-like object.
    htmlflo = StringIO(html)
    # Create the new key (key == filename in S3 bucket). boto's Bucket
    # exposes new_key(); it has no method called new().
    newkey = default_storage.bucket.new_key(filepath)
    # Upload data! set_contents_from_file() is boto's public upload entry
    # point for file-like objects.
    newkey.set_contents_from_file(htmlflo)
    if not newkey.exists():
        raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))
# Mark this note as available from the static host. Reaching this line
# means the document was either already present or was uploaded and
# verified above.
# NOTE(review): previously the flag was not set when the document already
# existed on the host; confirm that marking it in that case is desired.
note.static_html = True
note.text = content_dict['text']
+++ /dev/null
-#!/usr/bin/env python
-# -*- coding:utf8 -*-
-# Copyright (C) 2012 FinalsClub Foundation
-
-from lxml.html import fromstring, tostring
-
-from django.core.management.base import BaseCommand
-from apps.notes.models import Note
-
-class Command(BaseCommand):
- args = 'none'
- help = "Process note.html and modify a tags to open in new window"
-
- def add_target(self, tag):
- tag.attrib['target'] = '_blank'
-
- def handle(self, *args, **kwargs):
- notes = Note.objects.filter(html__isnull=False)
-
- for note in notes:
- succ, data = note.sanitize_html()
- if succ:
- print "Note %s contained %s <a>s" % (note.id, data)
-
from django.db import models
from django.utils.text import slugify
import django_filepicker
-from lxml.html import fromstring, tostring
+from bs4 import BeautifulSoup as BS
from taggit.managers import TaggableManager
from karmaworld.apps.courses.models import Course
course = models.ForeignKey(Course)
tags = TaggableManager(blank=True)
name = models.CharField(max_length=255, blank=True, null=True)
- slug = models.SlugField(max_length=255, null=True)
+ slug = models.SlugField(max_length=255, unique=True)
# license if different from default
license = models.ForeignKey(License, blank=True, null=True)
upload_to="notes/%Y/%m/%d/",
blank=True, null=True)
- # Generated by Google Drive by saved locally
- html = models.TextField(blank=True, null=True)
+ # Generated by Google Drive but saved locally
text = models.TextField(blank=True, null=True)
+ static_html = models.BooleanField(default=False)
+ # html is deprecated. delete once data is all sorted.
+ html = models.TextField(blank=True, null=True)
# Academic year of course
year = models.IntegerField(blank=True, null=True,\
# gdrive_url might also fit the bill?
return (self.fp_file, self.upstream_link)
def get_relative_s3_path(self):
    """
    Build the S3 key for this note's HTML, relative to the bucket root.
    """
    # Note.slug is unique and is either brought in from the RawDocument or
    # created upon save() inside RawDocument.convert_to_note(), so it makes
    # a good filename and it's pretty well guaranteed to be there.
    filename = '{0}.html'.format(self.slug)
    return 'html/' + filename
+
def get_absolute_url(self):
""" Resolve note url, use 'note' route and slug if slug
otherwise use note.id
# return a url ending in id
return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.id)
- def sanitize_html(self, save=True):
- """ if self contains html, find all <a> tags and add target=_blank
- takes self
- returns True/False on succ/fail and error or count
def filter_html(self, html):
    """
    Apply all sanitizing filters to HTML.

    Takes in an HTML string and returns the filtered HTML as a string.
    Returns an empty string when there is no HTML to filter.
    """
    # Fun fact: This could be made into a static method.
    if not html:
        # if there was no HTML, return an empty string
        return ''

    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser library happens to be installed, so the resulting markup could
    # differ between machines. 'html.parser' ships with Python itself.
    # TODO adding from_encoding (if known) will speed up the process
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#encodings
    soup = BS(html, 'html.parser')
    # Iterate through filters, applying all to the soup object.
    for soupfilter in (
        self.sanitize_anchor_html,
    ):
        soup = soupfilter(soup)
    # Serialize with decode() rather than prettify(): prettify() inserts
    # newlines and indentation, which changes how inline content renders
    # and is only meant for eyeballing document structure.
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#output
    return soup.decode()
+
def sanitize_anchor_html(self, soup):
    """
    Filter the given BeautifulSoup obj by adding target=_blank to all
    anchor tags.

    The soup is modified in place; the same object is returned so that
    filters can be chained.
    """
    # Fun fact: This could be made into a static method.
    # Use an explicit loop instead of map(): setting the attribute is a
    # side effect, and map() is lazy on Python 3, where it would silently
    # leave every anchor untouched.
    for tag in soup.find_all('a'):
        # set the anchor to open in a new window or tab
        tag['target'] = '_blank'

    # return filtered soup
    return soup
def _update_parent_updated_at(self):
""" update the parent Course.updated_at model
AWS_SECRET_ACCESS_KEY = 'access_key'
AWS_STORAGE_BUCKET_NAME = 'bucket'
S3_URL = 'http://%s.s3.amazonaws.com/' % AWS_STORAGE_BUCKET_NAME
-STATIC_URL = S3_URL
from common import *
-
-from karmaworld.secret.static_s3 import DEFAULT_FILE_STORAGE
-from karmaworld.secret.static_s3 import AWS_ACCESS_KEY_ID
-from karmaworld.secret.static_s3 import AWS_SECRET_ACCESS_KEY
-from karmaworld.secret.static_s3 import AWS_STORAGE_BUCKET_NAME
-from karmaworld.secret.static_s3 import S3_URL
-from karmaworld.secret.static_s3 import STATIC_URL
+from karmaworld.secret.static_s3 import *
from karmaworld.secret.db_settings import PROD_DB_NAME
from karmaworld.secret.db_settings import PROD_DB_USERNAME
)
# See: http://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html#settings
-STATICFILES_STORAGE = DEFAULT_FILE_STORAGE = 'storages.backends.s3boto.S3BotoStorage'
+# DEFAULT_FILE_STORAGE comes from karmaworld.secret.static_s3
+STATICFILES_STORAGE = DEFAULT_FILE_STORAGE
# See: http://django-storages.readthedocs.org/en/latest/backends/amazon-S3.html#settings
AWS_CALLING_FORMAT = CallingFormat.SUBDOMAIN
-
# AWS cache settings, don't change unless you know what you're doing:
AWS_EXPIREY = 60 * 60 * 24 * 7
AWS_HEADERS = {
}
# See: https://docs.djangoproject.com/en/dev/ref/settings/#static-url
-STATIC_URL = 'https://s3.amazonaws.com/%s/' % AWS_STORAGE_BUCKET_NAME
+# S3_URL comes from karmaworld.secret.static_s3
+STATIC_URL = S3_URL
########## END STORAGE CONFIGURATION
<div class="row">
<div class="small-12 small-centered columns medium-12 large-12 body_copy">
- {% if note.html %}
+ {% if note.static_html %}
<div class="note-text">
- <iframe style="border:none; width:100%; min-height: 1000px;" id="noteframe" src="/raw/{{ note.id }}"
+ <!-- Give crawlers a link to the iframe contents-->
+ <a href="{{ STATIC_URL }}{{ note.get_relative_s3_path }}" target="_blank">Open note in a new window or tab</a>.
+ <iframe style="border:none; width:100%; min-height: 1000px;" id="noteframe" src="{{ STATIC_URL }}{{ note.get_relative_s3_path }}"
onload="autoResize('noteframe'); {% if pdf_controls %} setupPdfViewer(); {% endif %}"> </iframe>
</div> <!-- .note-text -->
- {% else %} {# note.html #}
+ {% else %} {# note.static_html #}
<div class="note-error">
This document's content is currently unavailable. Please try again later.
</div>
- {% endif %} {# note.html #}
+ {% endif %} {# note.static_html #}
{% endif %} {# note.filetype == 'pdf' #}
+++ /dev/null
-{% if note.html %}
- {{ note.html|safe }}
-{% else %}
- <h2>Error: Note has no html</h2>
-{% endif %}
url(r'^accounts/', include('allauth.urls')),
url(r'^accounts/profile/', ProfileView.as_view(), name='accounts_profile'),
- # VIEW for viewing a Note's gdrive generated html, used as iframe
- url(r'^raw/(?P<pk>\d+)$', RawNoteDetailView.as_view(), name='note_raw'),
#url(r'^pdfview$', PDFView.as_view(), name='pdf'),
url(r'^pdfview/(?P<pk>\d+)$', PDFView.as_view(), name='pdf'),
urllib3==1.5
google-api-python-client==1.0
django-grappelli==2.4.8
-lxml==3.1.0
git+https://github.com/FinalsClub/django-taggit.git
django-filepicker==0.1.5
filemagic==1.6