df33bfc22d45059a41a1e08b8725944f4310d373
[oweals/karmaworld.git] / karmaworld / apps / notes / models.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 """
6     Models for the notes django app.
7     Contains only the minimum for handling files and their representation
8 """
9 import datetime
10 import traceback
11 import logging
12 from allauth.account.signals import user_logged_in
13 from django.contrib.auth.models import User
14 from django.contrib.sites.models import Site
15 from django.core.urlresolvers import reverse
16 from django.utils.safestring import mark_safe
17 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
18 from django.core.files.storage import default_storage
19 from django.db.models import SET_NULL
20 from django.db.models.signals import post_save, post_delete, pre_save
21 from django.dispatch import receiver
22 from karmaworld.apps.users.models import NoteKarmaEvent, GenericKarmaEvent
23 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
24 from karmaworld.utils.filepicker import encode_fp_policy, sign_fp_policy
25 import os
26 import time
27 import urllib
28
29 from django.conf import settings
30 from django.core.files import File
31 from django.core.files.storage import FileSystemStorage
32 from django.db import models
33 from django.utils.text import slugify
34 import django_filepicker
35 from bs4 import BeautifulSoup as BS
36 from taggit.managers import TaggableManager
37
38 from karmaworld.apps.courses.models import Course
39 from karmaworld.apps.licenses.models import License
40 from karmaworld.apps.notes.search import SearchIndex
41 from karmaworld.settings.manual_unique_together import auto_add_check_unique_together
42
43 ANONYMOUS_UPLOAD_URLS = 'anonymous_upload_urls'
44
45 logger = logging.getLogger(__name__)
46 fs = FileSystemStorage(location=settings.MEDIA_ROOT)
47
48 # Dictionary for S3 upload headers
49 s3_upload_headers = {
50     'Content-Type': 'text/html',
51 }
52
53 # This is a bit hacky, but nothing else works. Grabbed this from a proper
54 # file configured via S3 management console.
55 # https://github.com/FinalsClub/karmaworld/issues/273#issuecomment-32572169
56 all_read_xml_acl = '<?xml version="1.0" encoding="UTF-8"?>\n<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="Group"><URI>http://acs.amazonaws.com/groups/global/AllUsers</URI></Grantee><Permission>READ</Permission></Grant></AccessControlList></AccessControlPolicy>'
57
58
class Document(models.Model):
    """
    An Abstract Base Class representing a document intended to be subclassed.

    Holds the fields and Filepicker plumbing shared by all uploaded document
    types (e.g. Note). Abstract: no table of its own (see Meta.abstract).
    """
    course          = models.ForeignKey(Course)
    tags            = TaggableManager(blank=True)
    name            = models.CharField(max_length=255, blank=True, null=True)
    # URL-safe identifier; generated from name on first save() if missing.
    slug            = models.SlugField(max_length=255, unique=True)

    # license if different from default
    license         = models.ForeignKey(License, blank=True, null=True)

    # provide an upstream file link
    upstream_link   = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # metadata relevant to the Upload process
    user            = models.ForeignKey(User, blank=True, null=True, on_delete=SET_NULL)
    ip              = models.GenericIPAddressField(blank=True, null=True,
                        help_text=u"IP address of the uploader")
    # default is the callable utcnow itself, so it is evaluated per-save.
    uploaded_at     = models.DateTimeField(null=True, default=datetime.datetime.utcnow)


    # if True, NEVER show this file
    # WARNING: This may throw an error on migration
    is_hidden       = models.BooleanField(default=False)

    ###
    # Everything Filepicker, now in one small area

    # Allow pick (choose files), store (upload to S3), read (from FP repo),
    # stat (status of FP repo files) for 1 year (current time + 365 * 24 * 3600
    # seconds). Generated one time, at class definition upon import. So the
    # server will need to be rebooted at least one time each year or this will
    # go stale.
    fp_policy_json = '{{"expiry": {0}, "call": ["pick","store","read","stat"]}}'
    fp_policy_json = fp_policy_json.format(int(time.time() + 31536000))
    fp_policy      = encode_fp_policy(fp_policy_json)
    fp_signature   = sign_fp_policy(fp_policy)

    # Hack because mimetypes conflict with extensions, but there is no way to
    # disable mimetypes.
    # https://github.com/Ink/django-filepicker/issues/22
    # NOTE(review): this mutates django_filepicker globally as a side effect
    # of importing this module.
    django_filepicker.forms.FPFieldMixin.default_mimetypes = ''
    # Now let django-filepicker do the heavy lifting. Sort of. Look at all those
    # parameters!
    fp_file = django_filepicker.models.FPFileField(
                # FPFileField settings
                apikey=FILEPICKER_API_KEY,
                services='COMPUTER,DROPBOX,URL,GOOGLE_DRIVE,EVERNOTE,GMAIL,BOX,FACEBOOK,FLICKR,PICASA,IMAGE_SEARCH,WEBCAM,FTP',
                additional_params={
                    'data-fp-multiple': 'true',
                    'data-fp-folders': 'true',
                    'data-fp-button-class':
                      'inline-button important add-note-btn',
                    'data-fp-button-text': 'Add Notes',
                    'data-fp-extensions':
                      '.pdf,.doc,.docx,.txt,.html,.rtf,.odt,.png,.jpg,.jpeg,.ppt,.pptx',
                    'data-fp-store-location': 'S3',
                    'data-fp-policy': fp_policy,
                    'data-fp-signature': fp_signature,
                    'type': 'filepicker',
                    'onchange': "got_file(event)",
                },
                # FileField settings
                null=True, blank=True,
                upload_to='nil', # field ignored because S3, but required.
                verbose_name='', # prevent a label from showing up
                )
    mimetype = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        abstract = True
        ordering = ['-uploaded_at']

    def _generate_unique_slug(self):
        """ generate a unique slug based on name and uploaded_at  """
        _slug = slugify(unicode(self.name))
        klass = self.__class__
        collision = klass.objects.filter(slug=_slug)
        if collision:
            # disambiguate with date parts of uploaded_at; microsecond makes
            # a same-day collision very unlikely (not guaranteed unique).
            _slug = u"{0}-{1}-{2}-{3}".format(
                    _slug, self.uploaded_at.month,
                    self.uploaded_at.day, self.uploaded_at.microsecond)
        self.slug = _slug

    def _get_fpf(self):
        """
        Memoized FilepickerFile getter. Returns FilepickerFile.
        """
        if not hasattr(self, 'cached_fpf'):
            # Fetch additional_params containing signature, etc
            aps = self.fp_file.field.additional_params
            self.cached_fpf = django_filepicker.utils.FilepickerFile(self.fp_file.name, aps)
        return self.cached_fpf

    def get_fp_url(self):
        """
        Returns the Filepicker URL for reading the upstream document.
        """
        fpf = self._get_fpf()
        # Return proper URL for reading
        return fpf.get_url()

    def get_file(self):
        """
        Downloads the file from filepicker.io and returns a Django File wrapper
        object.
        """
        # Fetch FilepickerFile
        fpf = self._get_fpf()
        # Return Django File
        return fpf.get_file()

    def save(self, *args, **kwargs):
        """Generate a slug from name on first save, then persist."""
        if self.name and not self.slug:
            self._generate_unique_slug()
        super(Document, self).save(*args, **kwargs)
176
177
class NoteManager(models.Manager):
    """ Handle restoring data. """
    def get_by_natural_key(self, fp_file, upstream_link):
        """
        Return the single Note identified by its natural key: the pair of
        Filepicker URL and upstream URL.
        """
        lookup = {'fp_file': fp_file, 'upstream_link': upstream_link}
        return self.get(**lookup)
185
186
class Note(Document):
    """
    A django model representing an uploaded file and associated metadata.
    """
    objects = NoteManager()

    # FIXME: refactor file choices after FP.io integration
    UNKNOWN_FILE = '???'
    FILE_TYPE_CHOICES = (
        ('doc', 'MS Word compatible file (.doc, .docx, .rtf, .odf)'),
        ('img', 'Scan or picture of notes'),
        ('pdf', 'PDF file'),
        ('ppt', 'Powerpoint'),
        ('txt', 'Text'),
        (UNKNOWN_FILE, 'Unknown file'),
    )

    # Mimetypes treated as PDF-like for display purposes (see is_pdf()).
    PDF_MIMETYPES = (
      'application/pdf',
      'application/vnd.ms-powerpoint',
      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )

    file_type       = models.CharField(max_length=15,
                            choices=FILE_TYPE_CHOICES,
                            default=UNKNOWN_FILE,
                            blank=True, null=True)

    # Cache the Google drive file link
    gdrive_url      = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # Upload files to MEDIA_ROOT/notes/YEAR/MONTH/DAY, 2012/10/30/filename
    pdf_file       = models.FileField(
                            storage=fs,
                            upload_to="notes/%Y/%m/%d/",
                            blank=True, null=True)

    # Generated by Google Drive but saved locally
    text            = models.TextField(blank=True, null=True)
    static_html     = models.BooleanField(default=False)

    # html is deprecated. delete once data is all sorted.
    html            = models.TextField(blank=True, null=True)

    # Academic year of course.
    # FIX: the default must be a callable so it is evaluated when each Note
    # is created. Previously it was datetime.datetime.utcnow().year, which
    # was computed once at import time and went stale if the server process
    # ran across a year boundary.
    year            = models.IntegerField(blank=True, null=True,
                        default=lambda: datetime.datetime.utcnow().year)

    # Number of times this note has been flagged as abusive/spam.
    flags           = models.IntegerField(default=0,null=False)

    # Social media tracking
    tweeted         = models.BooleanField(default=False)
    thanks          = models.PositiveIntegerField(default=0)

    class Meta:
        unique_together = ('fp_file', 'upstream_link')
        ordering = ['-uploaded_at']

    def __unicode__(self):
        return u"Note at {0} (from {1}) ({2})".format(self.fp_file, self.upstream_link, self.id)

    def natural_key(self):
        """
        A Note is uniquely defined by both the Filepicker link and the upstream
        link. The Filepicker link should be unique by itself, but it may be
        null in the database, so the upstream link component should resolve
        those cases.
        """
        # gdrive_url might also fit the bill?
        return (self.fp_file, self.upstream_link)

    def get_relative_s3_path(self):
        """
        returns s3 path relative to the appropriate bucket.
        """
        # Note.slug will be unique and brought in from RawDocument or created
        # upon save() inside RawDocument.convert_to_note(). It makes for a good
        # filename and its pretty well guaranteed to be there.
        return 'html/{0}.html'.format(self.slug)

    def send_to_s3(self, html, do_save=True):
        """
        Push the given HTML up to S3 for this Note.
        Set do_save to False if the note will be saved outside this call.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # do nothing if already uploaded.
        # Maybe run checksums if possible to confirm its really done?
        # (but then you gotta wonder was the original correct or is the new
        # one correct)
        if self.static_html:
            return
        # upload the HTML file to static host if it is not already there
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # This is a pretty ugly hackified answer to some s3boto shortcomings
            # and some decent default settings chosen by django-storages.

            # Create the new key (key == filename in S3 bucket)
            newkey = default_storage.bucket.new_key(filepath)
            # Upload data!
            newkey.set_contents_from_string(html, headers=s3_upload_headers)
            if not newkey.exists():
                raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))

            # set the permissions for everyone to read.
            newkey.set_xml_acl(all_read_xml_acl)

        # If the code reaches here, either:
        # filepath exists on S3 but static_html is not marked.
        # or
        # file was just uploaded successfully to filepath
        # Regardless, set note as uploaded.
        self.static_html = True
        if do_save:
            self.save()

    def update_note_on_s3(self, html):
        """
        Overwrite the already-uploaded S3 HTML for this Note with new HTML.
        No-op (with a warning) if the S3 object does not exist yet.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # if it's not already there then bail out
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # FIX: Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
            return

        key = default_storage.bucket.get_key(filepath)
        key.set_contents_from_string(html, headers=s3_upload_headers)
        key.set_xml_acl(all_read_xml_acl)

    def get_absolute_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        if self.slug is not None:
            # return a url ending in slug
            if self.course.school:
                return reverse('note_detail', args=[self.course.school.slug, self.course.slug, self.slug])
            else:
                return reverse('note_detail', args=[self.course.department.school.slug, self.course.slug, self.slug])
        else:
            # return a url ending in id
            return reverse('note_detail', args=[self.course.school.slug, self.course.slug, self.id])

    def get_absolute_keywords_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        if self.slug is not None:
            # return a url ending in slug
            if self.course.school:
                return reverse('note_keywords', args=[self.course.school.slug, self.course.slug, self.slug])
            else:
                return reverse('note_keywords', args=[self.course.department.school.slug, self.course.slug, self.slug])
        else:
            # return a url ending in id
            return reverse('note_keywords', args=[self.course.school.slug, self.course.slug, self.id])


    def filter_html(self, html):
        """
        Apply all sanitizing filters to HTML.
        Takes in HTML string and outputs HTML string.
        """
        # Fun fact: This could be made into a static method.
        if not html or not len(html):
            # if there was no HTML, return an empty string
            return ''

        soup = BS(html)
        # Iterate through filters, applying all to the soup object.
        for soupfilter in (
            self.sanitize_anchor_html,
            self.set_canonical_link,
        ):
            soup = soupfilter(soup)
        return str(soup)

    def sanitize_anchor_html(self, soup):
        """
        Filter the given BeautifulSoup obj by adding target=_blank to all
        anchor tags.
        Returns BeautifulSoup obj.
        """
        # Fun fact: This could be made into a static method.
        # Find all a tags in the HTML
        a_tags = soup.find_all('a')
        if not a_tags or not len(a_tags):
            # nothing to process.
            return soup

        # build a tag sanitizer
        def set_attribute_target(tag):
            tag['target'] = '_blank'
        # set all anchors to have target="_blank"
        map(set_attribute_target, a_tags)

        # return filtered soup
        return soup

    @staticmethod
    def canonical_link_predicate(tag):
        """True for <link rel="canonical" ...> tags (BS find_all predicate)."""
        return tag.name == u'link' and \
            tag.has_attr('rel') and \
            u'canonical' in tag['rel']

    def set_canonical_link(self, soup):
        """
        Filter the given BeautifulSoup obj by adding
        <link rel="canonical" href="note.get_absolute_url" />
        to the document head.
        Returns BeautifulSoup obj.
        """
        domain = Site.objects.all()[0].domain
        note_full_href = 'http://' + domain + self.get_absolute_url()
        canonical_tags = soup.find_all(self.canonical_link_predicate)
        if canonical_tags:
            # rewrite any existing canonical links to point at this note
            for tag in canonical_tags:
                tag['href'] = note_full_href
        else:
            new_tag = soup.new_tag('link', rel='canonical', href=note_full_href)
            head = soup.find('head')
            head.append(new_tag)

        # return filtered soup
        return soup

    def _update_parent_updated_at(self):
        """ update the parent Course.updated_at model
            with the latest uploaded_at """
        self.course.updated_at = self.uploaded_at
        self.course.save()

    def save(self, *args, **kwargs):
        """Propagate uploaded_at to the parent Course before saving."""
        if self.uploaded_at and self.uploaded_at > self.course.updated_at:
            self._update_parent_updated_at()
        super(Note, self).save(*args, **kwargs)

    def has_markdown(self):
        """True if a NoteMarkdown row exists for this Note."""
        return hasattr(self, "notemarkdown")

    def is_pdf(self):
        """True if this Note's mimetype is rendered via the PDF viewer."""
        return self.mimetype in Note.PDF_MIMETYPES
434
435
class NoteMarkdown(models.Model):
    # One-to-one extension of Note carrying an optional markdown rendition.
    # primary_key=True: the Note's id doubles as this table's primary key,
    # and the reverse accessor is note.notemarkdown (see Note.has_markdown).
    note     = models.OneToOneField(Note, primary_key=True)
    markdown = models.TextField(blank=True, null=True)
439
440 auto_add_check_unique_together(Note)
441
442
def update_note_counts(note_instance):
    """Refresh denormalized note counts on the course and its school.

    Does nothing when the Note's course no longer exists (i.e. this call
    came from a cascade delete of the Course itself).
    """
    try:
        # Accessing the relation raises if the course row is already gone.
        note_instance.course
    except Course.DoesNotExist:
        # cascade delete in progress; nothing left to update
        return

    course = note_instance.course
    course.update_note_count()
    if course.school:
        course.school.update_note_count()
    elif course.department.school:
        course.department.school.update_note_count()
457
@receiver(pre_save, sender=Note, weak=False)
def note_pre_save_receiver(sender, **kwargs):
    """Stash the current database state of the Note on the instance itself.

    The saved copy (instance.old_instance) is consumed by the post_save
    handler to diff old vs. new values. On a brand-new Note the lookup
    misses and no attribute is set.
    """
    if 'instance' not in kwargs:
        return

    instance = kwargs['instance']
    try:
        instance.old_instance = Note.objects.get(id=instance.id)
    except ObjectDoesNotExist:
        # new Note: there is no prior state to remember
        pass
470
@receiver(post_save, sender=Note, weak=False)
def note_save_receiver(sender, **kwargs):
    """Keep counts and the search index in sync after a Note is saved."""
    if 'instance' not in kwargs:
        return

    note = kwargs['instance']
    created = kwargs['created']

    # New notes bump the course/school tallies.
    if created:
        update_note_counts(note)

    # Index errors must never break the save itself; log and move on.
    try:
        index = SearchIndex()
        if created:
            index.add_note(note)
        else:
            index.update_note(note, note.old_instance)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())
488
489
@receiver(post_delete, sender=Note, weak=False)
def note_delete_receiver(sender, **kwargs):
    """Clean up counts, search index, and karma after a Note is deleted."""
    if 'instance' not in kwargs:
        return

    note = kwargs['instance']

    # Update course and school counts of how
    # many notes they have
    update_note_counts(note)

    # Remove document from search index; log failures rather than raising.
    try:
        SearchIndex().remove_note(note)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())

    # Credit/penalize the uploader for the deletion, if one is known.
    if note.user:
        GenericKarmaEvent.create_event(note.user, note.name, GenericKarmaEvent.NOTE_DELETED)
509
510
class UserUploadMapping(models.Model):
    # Remembers that a logged-in user claims an anonymously-uploaded file
    # (identified by its Filepicker URL) whose Note did not exist yet at
    # login time. Consumed elsewhere to attach the Note to the user later.
    user = models.ForeignKey(User)
    fp_file = models.CharField(max_length=255)

    class Meta:
        # one claim per (user, file) pair
        unique_together = ('user', 'fp_file')
517
518
@receiver(user_logged_in, weak=True)
def find_orphan_notes(sender, **kwargs):
    """On login, claim notes that were uploaded anonymously in this session.

    The session stores the Filepicker URLs of anonymous uploads under
    ANONYMOUS_UPLOAD_URLS. For each URL, attach the matching Note to the
    user and grant upload karma. If no unique Note exists (not processed
    yet, or ambiguous), record a UserUploadMapping so the upload can be
    associated with the user later.
    """
    user = kwargs['user']
    s = kwargs['request'].session
    uploaded_note_urls = s.get(ANONYMOUS_UPLOAD_URLS, [])
    for uploaded_note_url in uploaded_note_urls:
        try:
            note = Note.objects.get(fp_file=uploaded_note_url)
            note.user = user
            note.save()
            NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # FIX: get_or_create instead of create — create() raised an
            # IntegrityError on the (user, fp_file) unique_together when the
            # same session URL list was processed on a repeat login. Also
            # drops the redundant mapping.save(): create()/get_or_create()
            # already persist the row.
            UserUploadMapping.objects.get_or_create(
                fp_file=uploaded_note_url, user=user)
533