WIP: Note editing, markdown to html
[oweals/karmaworld.git] / karmaworld / apps / notes / models.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 """
6     Models for the notes django app.
7     Contains only the minimum for handling files and their representation
8 """
9 import datetime
10 import traceback
11 import logging
12 from allauth.account.signals import user_logged_in
13 from django.contrib.auth.models import User
14 from django.contrib.sites.models import Site
15 from django.core.urlresolvers import reverse
16 from django.utils.safestring import mark_safe
17 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
18 from django.core.files.storage import default_storage
19 from django.db.models import SET_NULL
20 from django.db.models.signals import post_save, post_delete, pre_save
21 from django.dispatch import receiver
22 from karmaworld.apps.users.models import NoteKarmaEvent, GenericKarmaEvent
23 from karmaworld.utils.filepicker import encode_fp_policy, sign_fp_policy
24 import os
25 import time
26 import urllib
27
28 from django.conf import settings
29 from django.core.files import File
30 from django.core.files.storage import FileSystemStorage
31 from django.db import models
32 from django.utils.text import slugify
33 import django_filepicker
34 from bs4 import BeautifulSoup as BS
35 from taggit.managers import TaggableManager
36 import bleach
37 import bleach_whitelist
38 import markdown
39
40 from karmaworld.apps.courses.models import Course
41 from karmaworld.apps.licenses.models import License
42 from karmaworld.apps.notes.search import SearchIndex
43 from karmaworld.settings.manual_unique_together import auto_add_check_unique_together
44
# Filepicker.io API key. Required: raises KeyError at import time if unset,
# which fails fast rather than at first upload.
FILEPICKER_API_KEY = os.environ['FILEPICKER_API_KEY']

# Session key under which anonymous uploads' Filepicker URLs are stashed
# until the uploader logs in (see find_orphan_notes below).
ANONYMOUS_UPLOAD_URLS = 'anonymous_upload_urls'
# Number of "thanks" a note needs before Mechanical Turk keyword work kicks in.
KEYWORD_MTURK_THRESHOLD = 3

logger = logging.getLogger(__name__)
# Local filesystem storage rooted at MEDIA_ROOT.
# NOTE(review): `fs` appears unused within this module -- confirm external
# users before removing.
fs = FileSystemStorage(location=settings.MEDIA_ROOT)

# Dictionary for S3 upload headers
s3_upload_headers = {
    'Content-Type': 'text/html',
}
57
58 # This is a bit hacky, but nothing else works. Grabbed this from a proper
59 # file configured via S3 management console.
60 # https://github.com/FinalsClub/karmaworld/issues/273#issuecomment-32572169
61 all_read_xml_acl = '<?xml version="1.0" encoding="UTF-8"?>\n<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="Group"><URI>http://acs.amazonaws.com/groups/global/AllUsers</URI></Grantee><Permission>READ</Permission></Grant></AccessControlList></AccessControlPolicy>'
62
63
class Document(models.Model):
    """
    An Abstract Base Class representing a document intended to be subclassed.

    Provides course association, tagging, name/slug handling, licensing,
    uploader metadata and a Filepicker-backed file field for subclasses.
    """
    course          = models.ForeignKey(Course)
    tags            = TaggableManager(blank=True)
    name            = models.CharField(max_length=255, blank=True, null=True)
    slug            = models.SlugField(max_length=255, unique=True)

    # Closed set of document categories used by the `category` field.
    LECTURE_NOTES = 'LECTURE_NOTES'
    STUDY_GUIDE = 'STUDY_GUIDE'
    SYLLABUS = 'SYLLABUS'
    ASSIGNMENT = 'ASSIGNMENT'
    OTHER = 'OTHER'
    NOTE_CATEGORIES = (
        (LECTURE_NOTES, 'Lecture Notes'),
        (STUDY_GUIDE, 'Study Guide'),
        (SYLLABUS, 'Syllabus'),
        (ASSIGNMENT, 'Assignment'),
        (OTHER, 'Other'),
    )
    category = models.CharField(max_length=50, choices=NOTE_CATEGORIES, blank=True, null=True)

    # license if different from default
    license         = models.ForeignKey(License, blank=True, null=True)

    # provide an upstream file link
    upstream_link   = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # metadata relevant to the Upload process
    user            = models.ForeignKey(User, blank=True, null=True, on_delete=SET_NULL)
    ip              = models.GenericIPAddressField(blank=True, null=True,
                        help_text=u"IP address of the uploader")
    # NOTE(review): naive UTC timestamp via datetime.utcnow; if settings.USE_TZ
    # is enabled this should become a timezone-aware callable -- confirm.
    uploaded_at     = models.DateTimeField(null=True, default=datetime.datetime.utcnow)

    # if True, NEVER show this file
    # WARNING: This may throw an error on migration
    is_hidden       = models.BooleanField(default=False)

    ###
    # Everything Filepicker, now in one small area

    # Allow pick (choose files), store (upload to S3), read (from FP repo),
    # stat (status of FP repo files) for 1 year (current time + 365 * 24 * 3600
    # seconds). Generated one time, at class definition upon import. So the
    # server will need to be rebooted at least one time each year or this will
    # go stale.
    fp_policy_json = '{{"expiry": {0}, "call": ["pick","store","read","stat"]}}'
    fp_policy_json = fp_policy_json.format(int(time.time() + 31536000))
    fp_policy      = encode_fp_policy(fp_policy_json)
    fp_signature   = sign_fp_policy(fp_policy)

    # Hack because mimetypes conflict with extensions, but there is no way to
    # disable mimetypes.
    # https://github.com/Ink/django-filepicker/issues/22
    django_filepicker.forms.FPFieldMixin.default_mimetypes = ''
    # Now let django-filepicker do the heavy lifting. Sort of. Look at all those
    # parameters!
    fp_file = django_filepicker.models.FPFileField(
                # FPFileField settings
                apikey=FILEPICKER_API_KEY,
                services='COMPUTER,DROPBOX,URL,GOOGLE_DRIVE,EVERNOTE,GMAIL,BOX,FACEBOOK,FLICKR,PICASA,IMAGE_SEARCH,WEBCAM,FTP',
                additional_params={
                    'data-fp-multiple': 'true',
                    'data-fp-folders': 'true',
                    'data-fp-button-class':
                      'inline-button important add-note-btn',
                    'data-fp-button-text': 'Add Notes',
                    'data-fp-extensions':
                      '.pdf,.doc,.docx,.txt,.html,.rtf,.odt,.png,.jpg,.jpeg,.ppt,.pptx',
                    'data-fp-store-location': 'S3',
                    'data-fp-policy': fp_policy,
                    'data-fp-signature': fp_signature,
                    'type': 'filepicker',
                    'onchange': "got_file(event)",
                },
                # FileField settings
                null=True, blank=True,
                upload_to='nil', # field ignored because S3, but required.
                verbose_name='', # prevent a label from showing up
                )
    mimetype = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        abstract = True
        ordering = ['-uploaded_at']

    def _generate_unique_slug(self):
        """
        Set self.slug to a slug of the name; on collision, append pieces of
        uploaded_at (month/day/microsecond) to disambiguate.
        """
        _slug = slugify(unicode(self.name))
        # exists() issues a cheap EXISTS query instead of fetching every
        # colliding row only to test the queryset's truthiness.
        if self.__class__.objects.filter(slug=_slug).exists():
            _slug = u"{0}-{1}-{2}-{3}".format(
                    _slug, self.uploaded_at.month,
                    self.uploaded_at.day, self.uploaded_at.microsecond)
        self.slug = _slug

    def _get_fpf(self):
        """
        Memoized FilepickerFile getter. Returns FilepickerFile.
        """
        if not hasattr(self, 'cached_fpf'):
            # Fetch additional_params containing signature, etc
            aps = self.fp_file.field.additional_params
            self.cached_fpf = django_filepicker.utils.FilepickerFile(self.fp_file.name, aps)
        return self.cached_fpf

    def get_fp_url(self):
        """
        Returns the Filepicker URL for reading the upstream document.
        """
        return self._get_fpf().get_url()

    def get_file(self):
        """
        Downloads the file from filepicker.io and returns a Django File
        wrapper object.
        """
        return self._get_fpf().get_file()

    def save(self, *args, **kwargs):
        """Generate a slug from the name on first save, then defer to Django."""
        if self.name and not self.slug:
            self._generate_unique_slug()
        super(Document, self).save(*args, **kwargs)
195
196
class NoteManager(models.Manager):
    """Manager adding natural-key support so Note fixtures can be restored."""

    def get_by_natural_key(self, fp_file, upstream_link):
        """Return the single Note matching both the Filepicker and upstream URLs."""
        return self.get(fp_file=fp_file, upstream_link=upstream_link)
204
205
class Note(Document):
    """
    A django model representing an uploaded file and associated metadata.
    """
    objects = NoteManager()

    # Mimetypes treated as PDF-like for display purposes.
    PDF_MIMETYPES = (
      'application/pdf',
      'application/vnd.ms-powerpoint',
      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )

    # Cache the Google drive file link
    gdrive_url      = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # Generated by Google Drive but saved locally
    text            = models.TextField(blank=True, null=True)

    # Number of times this note has been flagged as abusive/spam.
    flags           = models.IntegerField(default=0,null=False)

    # Social media tracking
    tweeted         = models.BooleanField(default=False)
    thanks          = models.PositiveIntegerField(default=0)

    class Meta:
        unique_together = ('fp_file', 'upstream_link')
        ordering = ['-uploaded_at']

    def __unicode__(self):
        return u"Note at {0} (from {1}) ({2})".format(self.fp_file, self.upstream_link, self.id)

    def natural_key(self):
        """
        A Note is uniquely defined by both the Filepicker link and the upstream
        link. The Filepicker link should be unique by itself, but it may be
        null in the database, so the upstream link component should resolve
        those cases.
        """
        # gdrive_url might also fit the bill?
        return (self.fp_file, self.upstream_link)

    def get_relative_s3_path(self):
        """
        returns s3 path relative to the appropriate bucket.
        """
        # Note.slug will be unique and brought in from RawDocument or created
        # upon save() inside RawDocument.convert_to_note(). It makes for a good
        # filename and its pretty well guaranteed to be there.
        return 'html/{0}.html'.format(self.slug)

    def send_to_s3(self, html, do_save=True):
        """
        Push the given HTML up to S3 for this Note.
        Set do_save to False if the note will be saved outside this call.
        """
        # do nothing if HTML is empty.
        if not html:
            return
        # upload the HTML file to static host only if it is not already there
        filepath = self.get_relative_s3_path()
        if default_storage.exists(filepath):
            return
        # This is a pretty ugly hackified answer to some s3boto shortcomings
        # and some decent default settings chosen by django-storages.

        # Create the new key (key == filename in S3 bucket)
        newkey = default_storage.bucket.new_key(filepath)
        # Upload data!
        newkey.set_contents_from_string(html, headers=s3_upload_headers)
        if not newkey.exists():
            raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))

        # set the permissions for everyone to read.
        newkey.set_xml_acl(all_read_xml_acl)

    def update_note_on_s3(self, html):
        """
        Overwrite this note's existing HTML document on S3.
        Does nothing if the HTML is empty or the S3 object does not exist yet.
        """
        if not html:
            return
        # if it's not already there then bail out
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # logger.warn is deprecated; warning() is the canonical spelling.
            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
            return

        key = default_storage.bucket.get_key(filepath)
        key.set_contents_from_string(html, headers=s3_upload_headers)
        key.set_xml_acl(all_read_xml_acl)

    def remaining_thanks_for_mturk(self):
        """Thanks still needed before the Mechanical Turk threshold is met."""
        return KEYWORD_MTURK_THRESHOLD - self.thanks

    def total_thanks_for_mturk(self):
        """Total thanks required to hit the Mechanical Turk threshold."""
        return KEYWORD_MTURK_THRESHOLD

    def _school_slug(self):
        """
        Slug of the school for this note's course, falling back to the
        department's school when the course has no direct school link.
        """
        if self.course.school:
            return self.course.school.slug
        return self.course.department.school.slug

    def _resolve_url(self, route_name):
        """
        Reverse route_name for this note: use the slug when set, otherwise
        the primary key. Shared by the three get_absolute_*_url methods.
        """
        identifier = self.slug if self.slug else self.id
        return reverse(route_name,
                       args=[self._school_slug(), self.course.slug, identifier])

    def get_absolute_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        # Bug fix: the id fallback previously assumed course.school was set
        # and crashed for department-only courses; _school_slug handles both.
        return self._resolve_url('note_detail')

    def get_absolute_keywords_url(self):
        """ Resolve note keywords url via slug if set, otherwise note.id. """
        return self._resolve_url('note_keywords')

    def get_absolute_quiz_url(self):
        """ Resolve note quiz url via slug if set, otherwise note.id. """
        return self._resolve_url('note_quiz')

    def filter_html(self, html):
        """
        Apply all sanitizing filters to HTML.
        Takes in HTML string and outputs HTML string.
        """
        # Fun fact: This could be made into a static method.
        if not html:
            # if there was no HTML, return an empty string
            return ''

        soup = BS(html)
        # Iterate through filters, applying all to the soup object.
        for soupfilter in (
            self.sanitize_anchor_html,
            self.set_canonical_link,
        ):
            soup = soupfilter(soup)
        return str(soup)

    def sanitize_anchor_html(self, soup):
        """
        Filter the given BeautifulSoup obj by adding target=_blank to all
        anchor tags.
        Returns BeautifulSoup obj.
        """
        # Fun fact: This could be made into a static method.
        # A plain loop replaces the previous side-effecting map() call,
        # which would silently do nothing under Python 3's lazy map.
        for anchor in soup.find_all('a'):
            anchor['target'] = '_blank'

        # return filtered soup
        return soup

    @staticmethod
    def canonical_link_predicate(tag):
        """True for <link rel="canonical" ...> tags."""
        return tag.name == u'link' and \
            tag.has_attr('rel') and \
            u'canonical' in tag['rel']

    def set_canonical_link(self, soup):
        """
        Filter the given BeautifulSoup obj by adding
        <link rel="canonical" href="note.get_absolute_url" />
        to the document head.
        Returns BeautifulSoup obj.
        """
        domain = Site.objects.all()[0].domain
        note_full_href = 'http://' + domain + self.get_absolute_url()
        canonical_tags = soup.find_all(self.canonical_link_predicate)
        if canonical_tags:
            # Rewrite any existing canonical links in place.
            for tag in canonical_tags:
                tag['href'] = note_full_href
        else:
            new_tag = soup.new_tag('link', rel='canonical', href=note_full_href)
            head = soup.find('head')
            # Robustness: HTML fragments may lack a <head>; previously this
            # raised AttributeError instead of skipping gracefully.
            if head is not None:
                head.append(new_tag)

        # return filtered soup
        return soup

    def _update_parent_updated_at(self):
        """ update the parent Course.updated_at model
            with the latest uploaded_at """
        self.course.updated_at = self.uploaded_at
        self.course.save()

    def save(self, *args, **kwargs):
        """Propagate a newer uploaded_at to the parent course, then save."""
        if self.uploaded_at and self.uploaded_at > self.course.updated_at:
            self._update_parent_updated_at()
        super(Note, self).save(*args, **kwargs)

    def has_markdown(self):
        """True when a related NoteMarkdown row exists for this note."""
        return hasattr(self, "notemarkdown")

    def is_pdf(self):
        """True when the mimetype is one rendered via the PDF pipeline."""
        return self.mimetype in Note.PDF_MIMETYPES
427
428
class NoteMarkdown(models.Model):
    """
    Markdown source and rendered, sanitized HTML for a Note (one-to-one).
    """
    note     = models.OneToOneField(Note, primary_key=True)
    markdown = models.TextField(blank=True, null=True)
    html     = models.TextField(blank=True, null=True)

    @classmethod
    def sanitize(cls, html):
        """Strip any tags/attributes outside the markdown whitelist."""
        return bleach.clean(html,
                bleach_whitelist.markdown_tags,
                bleach_whitelist.markdown_attrs,
                strip=True)

    def save(self, *args, **kwargs):
        """Render markdown to HTML when HTML is absent; always sanitize HTML."""
        if self.markdown and not self.html:
            self.html = markdown.markdown(self.markdown)
        # Bug fix: only sanitize when HTML is present. Previously a row with
        # neither markdown nor html passed None to bleach.clean, which raises.
        if self.html:
            self.html = NoteMarkdown.sanitize(self.html)
        super(NoteMarkdown, self).save(*args, **kwargs)
446
447 auto_add_check_unique_together(Note)
448
449
def update_note_counts(note_instance):
    """
    Refresh the cached thank/note counts on the note's course and school.
    Does nothing when the parent course is already gone (cascade delete).
    """
    try:
        course = note_instance.course
    except Course.DoesNotExist:
        # Cascade delete in progress: there is no course left to update.
        return

    course.update_thank_count()
    course.update_note_count()
    # A course references its school directly or through its department.
    school = course.school or course.department.school
    if school:
        school.update_note_count()
465
@receiver(pre_save, sender=Note, weak=False)
def note_pre_save_receiver(sender, **kwargs):
    """
    Attach the current database state of a Note onto the instance being
    saved (as .old_instance) so note_save_receiver can diff old vs new.
    New notes have no prior row, so the attribute is simply left unset.
    """
    if 'instance' not in kwargs:
        return
    instance = kwargs['instance']
    try:
        instance.old_instance = Note.objects.get(id=instance.id)
    except ObjectDoesNotExist:
        # First save of a brand-new note: nothing to remember.
        pass
478
@receiver(post_save, sender=Note, weak=False)
def note_save_receiver(sender, **kwargs):
    """
    After a Note is saved: refresh denormalized counts, then mirror the
    change into the external search index. Index failures are logged and
    swallowed so a search outage never blocks a save.
    """
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    update_note_counts(note)

    try:
        search_index = SearchIndex()
        if kwargs['created']:
            search_index.add_note(note)
        else:
            # old_instance was stashed on the note by note_pre_save_receiver.
            search_index.update_note(note, note.old_instance)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())
496
497
@receiver(post_delete, sender=Note, weak=False)
def note_delete_receiver(sender, **kwargs):
    """
    After a Note is deleted: refresh course/school counts, drop the note
    from the external search index (best effort), and record a karma event
    for the uploader when one is known.
    """
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    # Update course and school counts of how many notes they have.
    update_note_counts(note)

    # Remove document from search index; log-and-continue on failure.
    try:
        SearchIndex().remove_note(note)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())

    if note.user:
        GenericKarmaEvent.create_event(note.user, note.name,
                                       GenericKarmaEvent.NOTE_DELETED)
517
518
class UserUploadMapping(models.Model):
    """
    Associates a Filepicker upload URL with a user account.

    Written by find_orphan_notes when an anonymous upload cannot be matched
    to exactly one Note at login time; presumably consumed later to credit
    the uploader once the Note exists -- confirm against the upload views.
    """
    user = models.ForeignKey(User)
    fp_file = models.CharField(max_length=255)

    class Meta:
        unique_together = ('user', 'fp_file')
525
526
@receiver(user_logged_in, weak=True)
def find_orphan_notes(sender, **kwargs):
    """
    On login, claim any notes the user uploaded anonymously this session.

    Filepicker URLs of anonymous uploads are stashed in the session under
    ANONYMOUS_UPLOAD_URLS. Each URL matching exactly one Note is credited
    to the user (with an upload karma event); otherwise a UserUploadMapping
    row records the association for later resolution.
    """
    user = kwargs['user']
    session = kwargs['request'].session
    for uploaded_note_url in session.get(ANONYMOUS_UPLOAD_URLS, []):
        try:
            note = Note.objects.get(fp_file=uploaded_note_url)
            note.user = user
            note.save()
            NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # Note missing or ambiguous: remember the mapping instead.
            # objects.create() already persists the row; the previous extra
            # mapping.save() call issued a redundant second write.
            UserUploadMapping.objects.create(fp_file=uploaded_note_url, user=user)
541