45c4caaac162247d3da29767ad52686342958467
[oweals/karmaworld.git] / karmaworld / apps / notes / models.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 """
6     Models for the notes django app.
7     Contains only the minimum for handling files and their representation
8 """
9 import datetime
10 import traceback
11 import logging
12 from allauth.account.signals import user_logged_in
13 from django.contrib.auth.models import User
14 from django.contrib.sites.models import Site
15 from django.utils.safestring import mark_safe
16 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
17 from django.core.files.storage import default_storage
18 from django.db.models import SET_NULL
19 from django.db.models.signals import post_save, post_delete, pre_save
20 from django.dispatch import receiver
21 from karmaworld.apps.users.models import NoteKarmaEvent, GenericKarmaEvent
22 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
23 from karmaworld.utils.filepicker import encode_fp_policy, sign_fp_policy
24 import os
25 import time
26 import urllib
27
28 from django.conf import settings
29 from django.core.files import File
30 from django.core.files.storage import FileSystemStorage
31 from django.db import models
32 from django.utils.text import slugify
33 import django_filepicker
34 from bs4 import BeautifulSoup as BS
35 from taggit.managers import TaggableManager
36
37 from karmaworld.apps.courses.models import Course
38 from karmaworld.apps.licenses.models import License
39 from karmaworld.apps.notes.search import SearchIndex
40 from karmaworld.settings.manual_unique_together import auto_add_check_unique_together
41
42 ANONYMOUS_UPLOAD_URLS = 'anonymous_upload_urls'
43
44 logger = logging.getLogger(__name__)
45 fs = FileSystemStorage(location=settings.MEDIA_ROOT)
46
47 # Dictionary for S3 upload headers
48 s3_upload_headers = {
49     'Content-Type': 'text/html',
50 }
51
52 # This is a bit hacky, but nothing else works. Grabbed this from a proper
53 # file configured via S3 management console.
54 # https://github.com/FinalsClub/karmaworld/issues/273#issuecomment-32572169
55 all_read_xml_acl = '<?xml version="1.0" encoding="UTF-8"?>\n<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="Group"><URI>http://acs.amazonaws.com/groups/global/AllUsers</URI></Grantee><Permission>READ</Permission></Grant></AccessControlList></AccessControlPolicy>'
56
57
class Document(models.Model):
    """
    An Abstract Base Class representing a document intended to be subclassed.

    Provides shared upload metadata (user, ip, uploaded_at), licensing,
    unique slug generation, and the Filepicker upload field used by
    subclasses such as Note.
    """
    course          = models.ForeignKey(Course)
    tags            = TaggableManager(blank=True)
    name            = models.CharField(max_length=255, blank=True, null=True)
    slug            = models.SlugField(max_length=255, unique=True)

    # license if different from default
    license         = models.ForeignKey(License, blank=True, null=True)

    # provide an upstream file link
    upstream_link   = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # metadata relevant to the Upload process
    user            = models.ForeignKey(User, blank=True, null=True, on_delete=SET_NULL)
    ip              = models.GenericIPAddressField(blank=True, null=True,
                        help_text=u"IP address of the uploader")
    # Callable default: evaluated per-row at creation time.
    uploaded_at     = models.DateTimeField(null=True, default=datetime.datetime.utcnow)


    # if True, NEVER show this file
    # WARNING: This may throw an error on migration
    is_hidden       = models.BooleanField(default=False)

    ###
    # Everything Filepicker, now in one small area

    # Allow pick (choose files), store (upload to S3), read (from FP repo),
    # stat (status of FP repo files) for 1 year (current time + 365 * 24 * 3600
    # seconds). Generated one time, at class definition upon import. So the
    # server will need to be rebooted at least one time each year or this will
    # go stale.
    fp_policy_json = '{{"expiry": {0}, "call": ["pick","store","read","stat"]}}'
    fp_policy_json = fp_policy_json.format(int(time.time() + 31536000))
    fp_policy      = encode_fp_policy(fp_policy_json)
    fp_signature   = sign_fp_policy(fp_policy)

    # Hack because mimetypes conflict with extensions, but there is no way to
    # disable mimetypes.
    # https://github.com/Ink/django-filepicker/issues/22
    django_filepicker.forms.FPFieldMixin.default_mimetypes = ''
    # Now let django-filepicker do the heavy lifting. Sort of. Look at all those
    # parameters!
    fp_file = django_filepicker.models.FPFileField(
                # FPFileField settings
                apikey=FILEPICKER_API_KEY,
                services='COMPUTER,DROPBOX,URL,GOOGLE_DRIVE,EVERNOTE,GMAIL,BOX,FACEBOOK,FLICKR,PICASA,IMAGE_SEARCH,WEBCAM,FTP',
                additional_params={
                    'data-fp-multiple': 'true',
                    'data-fp-folders': 'true',
                    'data-fp-button-class':
                      'add-note-btn small-10 columns large-4',
                    'data-fp-button-text':
                      mark_safe("<i class='fa fa-arrow-circle-o-up'></i> add notes"),
                    'data-fp-drag-class':
                      'dragdrop show-for-medium-up large-7 columns',
                    'data-fp-drag-text': 'Drop Some Knowledge',
                    'data-fp-extensions':
                      '.pdf,.doc,.docx,.txt,.html,.rtf,.odt,.png,.jpg,.jpeg,.ppt,.pptx',
                    'data-fp-store-location': 'S3',
                    'data-fp-policy': fp_policy,
                    'data-fp-signature': fp_signature,
                    'onchange': "got_file(event)",
                },
                # FileField settings
                null=True, blank=True,
                upload_to='nil', # field ignored because S3, but required.
                verbose_name='', # prevent a label from showing up
                )
    mimetype = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        abstract = True
        ordering = ['-uploaded_at']

    def _generate_unique_slug(self):
        """ generate a unique slug based on name and uploaded_at  """
        _slug = slugify(unicode(self.name))
        klass = self.__class__
        # exists() asks the database a cheap yes/no question instead of
        # materializing whole rows just to test for a collision.
        if klass.objects.filter(slug=_slug).exists():
            # Disambiguate with pieces of the upload timestamp; the
            # microsecond component makes a further collision very unlikely.
            _slug = u"{0}-{1}-{2}-{3}".format(
                    _slug, self.uploaded_at.month,
                    self.uploaded_at.day, self.uploaded_at.microsecond)
        self.slug = _slug

    def _get_fpf(self):
        """
        Memoized FilepickerFile getter. Returns FilepickerFile.
        """
        if not hasattr(self, 'cached_fpf'):
            # Fetch additional_params containing signature, etc
            aps = self.fp_file.field.additional_params
            self.cached_fpf = django_filepicker.utils.FilepickerFile(self.fp_file.name, aps)
        return self.cached_fpf

    def get_fp_url(self):
        """
        Returns the Filepicker URL for reading the upstream document,
        or None when no Filepicker file is attached.
        """
        # Fetch FilepickerFile
        if not self.fp_file.name:
            return None

        fpf = self._get_fpf()
        # Return proper URL for reading
        return fpf.get_url()

    def get_file(self):
        """
        Downloads the file from filepicker.io and returns a Django File wrapper
        object.
        """
        # Fetch FilepickerFile
        fpf = self._get_fpf()
        # Return Django File
        return fpf.get_file()

    def save(self, *args, **kwargs):
        # Generate the slug exactly once: the first time a named document
        # is saved without one.
        if self.name and not self.slug:
            self._generate_unique_slug()
        super(Document, self).save(*args, **kwargs)
182
183
class NoteManager(models.Manager):
    """Custom manager that supports natural-key deserialization of Notes."""

    def get_by_natural_key(self, fp_file, upstream_link):
        """Fetch the single Note identified by its Filepicker URL together
        with its upstream URL (the pair forms the natural key)."""
        return self.get(fp_file=fp_file, upstream_link=upstream_link)
191
192
class Note(Document):
    """
    A django model representing an uploaded file and associated metadata.

    Inherits upload metadata and Filepicker integration from Document, and
    adds file typing, converted-HTML handling (stored on S3), search-index
    hooks (via signal receivers below), and social/karma counters.
    """
    objects = NoteManager()

    # FIXME: refactor file choices after FP.io integration
    UNKNOWN_FILE = '???'
    FILE_TYPE_CHOICES = (
        ('doc', 'MS Word compatible file (.doc, .docx, .rtf, .odf)'),
        ('img', 'Scan or picture of notes'),
        ('pdf', 'PDF file'),
        ('ppt', 'Powerpoint'),
        ('txt', 'Text'),
        (UNKNOWN_FILE, 'Unknown file'),
    )

    file_type       = models.CharField(max_length=15,
                            choices=FILE_TYPE_CHOICES,
                            default=UNKNOWN_FILE,
                            blank=True, null=True)

    # Cache the Google drive file link
    gdrive_url      = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # Upload files to MEDIA_ROOT/notes/YEAR/MONTH/DAY, 2012/10/30/filename
    pdf_file       = models.FileField(
                            storage=fs,
                            upload_to="notes/%Y/%m/%d/",
                            blank=True, null=True)

    # Generated by Google Drive but saved locally
    text            = models.TextField(blank=True, null=True)
    static_html     = models.BooleanField(default=False)

    # html is deprecated. delete once data is all sorted.
    html            = models.TextField(blank=True, null=True)

    # Academic year of course.
    # BUG FIX: the default used to be `datetime.datetime.utcnow().year`,
    # which was evaluated once at import time and went stale after New Year
    # until the process restarted. A callable defers evaluation to each row
    # creation instead.
    # NOTE(review): if the migration tool cannot freeze a lambda, hoist this
    # into a named module-level function.
    year            = models.IntegerField(blank=True, null=True,
                        default=lambda: datetime.datetime.utcnow().year)

    # Number of times this note has been flagged as abusive/spam.
    flags           = models.IntegerField(default=0, null=False)

    # Social media tracking
    tweeted         = models.BooleanField(default=False)
    thanks          = models.PositiveIntegerField(default=0)

    class Meta:
        unique_together = ('fp_file', 'upstream_link')
        ordering = ['-uploaded_at']

    def __unicode__(self):
        return u"Note at {0} (from {1})".format(self.fp_file, self.upstream_link)

    def natural_key(self):
        """
        A Note is uniquely defined by both the Filepicker link and the upstream
        link. The Filepicker link should be unique by itself, but it may be
        null in the database, so the upstream link component should resolve
        those cases.
        """
        # gdrive_url might also fit the bill?
        return (self.fp_file, self.upstream_link)

    def get_relative_s3_path(self):
        """
        Returns this note's HTML path relative to the appropriate S3 bucket.
        """
        # Note.slug will be unique and brought in from RawDocument or created
        # upon save() inside RawDocument.convert_to_note(). It makes for a good
        # filename and its pretty well guaranteed to be there.
        return 'html/{0}.html'.format(self.slug)

    def send_to_s3(self, html, do_save=True):
        """
        Push the given HTML up to S3 for this Note.
        Set do_save to False if the note will be saved outside this call.
        No-op when html is empty or the note is already marked uploaded.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # do nothing if already uploaded.
        # Maybe run checksums if possible to confirm its really done?
        # (but then you gotta wonder was the original correct or is the new
        # one correct)
        if self.static_html:
            return
        # upload the HTML file to static host if it is not already there
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # This is a pretty ugly hackified answer to some s3boto shortcomings
            # and some decent default settings chosen by django-storages.

            # Create the new key (key == filename in S3 bucket)
            newkey = default_storage.bucket.new_key(filepath)
            # Upload data!
            newkey.set_contents_from_string(html, headers=s3_upload_headers)
            if not newkey.exists():
                raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))

            # set the permissions for everyone to read.
            newkey.set_xml_acl(all_read_xml_acl)

        # If the code reaches here, either:
        # filepath exists on S3 but static_html is not marked.
        # or
        # file was just uploaded successfully to filepath
        # Regardless, set note as uploaded.
        self.static_html = True
        if do_save:
            self.save()

    def update_note_on_s3(self, html):
        """
        Overwrite the existing S3 HTML object for this note with new HTML.
        No-op when html is empty; bails with a warning if the object does
        not already exist on S3.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # if it's not already there then bail out
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # logger.warning: warn() is a deprecated alias of warning().
            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
            return

        key = default_storage.bucket.get_key(filepath)
        key.set_contents_from_string(html, headers=s3_upload_headers)
        key.set_xml_acl(all_read_xml_acl)

    def get_absolute_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        if self.slug is not None:
            # return a url ending in slug
            return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.slug)
        else:
            # return a url ending in id
            return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.id)

    def filter_html(self, html):
        """
        Apply all sanitizing filters to HTML.
        Takes in HTML string and outputs HTML string.
        """
        # Fun fact: This could be made into a static method.
        if not html or not len(html):
            # if there was no HTML, return an empty string
            return ''

        soup = BS(html)
        # Iterate through filters, applying all to the soup object.
        for soupfilter in (
            self.sanitize_anchor_html,
            self.set_canonical_link,
        ):
            soup = soupfilter(soup)
        return str(soup)

    def sanitize_anchor_html(self, soup):
        """
        Filter the given BeautifulSoup obj by adding target=_blank to all
        anchor tags.
        Returns BeautifulSoup obj.
        """
        # Fun fact: This could be made into a static method.
        # Find all a tags in the HTML
        a_tags = soup.find_all('a')
        if not a_tags or not len(a_tags):
            # nothing to process.
            return soup

        # build a tag sanitizer
        def set_attribute_target(tag):
            tag['target'] = '_blank'
        # set all anchors to have target="_blank"
        # (Python 2 map() is eager, so this runs the mutation immediately.)
        map(set_attribute_target, a_tags)

        # return filtered soup
        return soup

    @staticmethod
    def canonical_link_predicate(tag):
        # True for <link rel="canonical" ...> tags.
        return tag.name == u'link' and \
            tag.has_attr('rel') and \
            u'canonical' in tag['rel']

    def set_canonical_link(self, soup):
        """
        Filter the given BeautifulSoup obj by adding
        <link rel="canonical" href="note.get_absolute_url" />
        to the document head.
        Returns BeautifulSoup obj.
        """
        domain = Site.objects.all()[0].domain
        note_full_href = 'http://' + domain + self.get_absolute_url()
        canonical_tags = soup.find_all(self.canonical_link_predicate)
        if canonical_tags:
            for tag in canonical_tags:
                tag['href'] = note_full_href
        else:
            new_tag = soup.new_tag('link', rel='canonical', href=note_full_href)
            # NOTE(review): assumes the document has a <head>; head would be
            # None for fragment HTML and append() would raise. Confirm inputs
            # are full documents.
            head = soup.find('head')
            head.append(new_tag)

        # return filtered soup
        return soup

    def _update_parent_updated_at(self):
        """ update the parent Course.updated_at model
            with the latest uploaded_at """
        self.course.updated_at = self.uploaded_at
        self.course.save()

    def save(self, *args, **kwargs):
        # Bubble the newest upload time up to the parent course before
        # persisting this note.
        if self.uploaded_at and self.uploaded_at > self.course.updated_at:
            self._update_parent_updated_at()
        super(Note, self).save(*args, **kwargs)
410
411
412 auto_add_check_unique_together(Note)
413
414
def update_note_counts(note_instance):
    """Refresh the cached note counts on the note's course and school.

    Silently does nothing when the course row no longer exists, i.e. the
    note is going away as part of a cascade delete.
    """
    try:
        course = note_instance.course
    except Course.DoesNotExist:
        # Cascade delete in progress; there is no course left to update.
        return
    course.update_note_count()
    course.school.update_note_count()
426
@receiver(pre_save, sender=Note, weak=False)
def note_pre_save_receiver(sender, **kwargs):
    """Stash the pre-save database state of the given Note on the instance
    itself (as ``old_instance``) so the post_save receiver can diff the new
    state against it when updating the search index."""
    if 'instance' not in kwargs:
        return

    try:
        kwargs['instance'].old_instance = Note.objects.get(id=kwargs['instance'].id)
    except ObjectDoesNotExist:
        # Brand new Note: there is no previous state to stash.
        pass
439
@receiver(post_save, sender=Note, weak=False)
def note_save_receiver(sender, **kwargs):
    """After a Note is saved: on creation, refresh course/school note
    counts; always sync the note to the IndexDen search index (add on
    create, otherwise update using the pre-save snapshot stashed by
    note_pre_save_receiver)."""
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    if kwargs['created']:
        update_note_counts(note)

    try:
        index = SearchIndex()
        if kwargs['created']:
            index.add_note(note)
        else:
            index.update_note(note, note.old_instance)
    except Exception:
        # Search indexing is best-effort; log and never let it break a save.
        logger.error("Error with IndexDen:\n" + traceback.format_exc())
457
458
@receiver(post_delete, sender=Note, weak=False)
def note_delete_receiver(sender, **kwargs):
    """After a Note is deleted: refresh course/school note counts, drop the
    note from the search index, and record a karma event for the uploader
    (if any)."""
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    # Update course and school counts of how many notes they have.
    # (Consistency fix: use the already-bound `note` rather than
    # re-indexing kwargs.)
    update_note_counts(note)

    # Remove document from search index; best-effort, log on failure.
    try:
        index = SearchIndex()
        index.remove_note(note)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())

    if note.user:
        GenericKarmaEvent.create_event(note.user, note.name, GenericKarmaEvent.NOTE_DELETED)
478
479
class UserUploadMapping(models.Model):
    """
    Maps a Filepicker file URL to a user, recorded when an upload made
    anonymously could not be attached to a Note at login time (see
    find_orphan_notes).
    """
    # The user who claimed the upload.
    user = models.ForeignKey(User)
    # Filepicker URL of the uploaded file.
    fp_file = models.CharField(max_length=255)

    class Meta:
        unique_together = ('user', 'fp_file')
486
487
@receiver(user_logged_in, weak=True)
def find_orphan_notes(sender, **kwargs):
    """On login, claim notes that were uploaded anonymously during this
    session (their Filepicker URLs are tracked in the session under
    ANONYMOUS_UPLOAD_URLS). For each URL: if exactly one matching Note
    exists, attach the user and award upload karma; otherwise record a
    UserUploadMapping so the upload can be attached later."""
    user = kwargs['user']
    session = kwargs['request'].session
    uploaded_note_urls = session.get(ANONYMOUS_UPLOAD_URLS, [])
    for uploaded_note_url in uploaded_note_urls:
        try:
            note = Note.objects.get(fp_file=uploaded_note_url)
            note.user = user
            note.save()
            NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # Note not processed yet (or ambiguous): remember the mapping.
            # objects.create() already persists the row; the redundant
            # follow-up save() was removed.
            UserUploadMapping.objects.create(fp_file=uploaded_note_url, user=user)
502