df33bfc22d45059a41a1e08b8725944f4310d373
[oweals/karmaworld.git] / karmaworld / apps / notes / models.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 """
6     Models for the notes django app.
7     Contains only the minimum for handling files and their representation
8 """
9 import datetime
10 import traceback
11 import logging
12 from allauth.account.signals import user_logged_in
13 from django.contrib.auth.models import User
14 from django.contrib.sites.models import Site
15 from django.core.urlresolvers import reverse
16 from django.utils.safestring import mark_safe
17 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
18 from django.core.files.storage import default_storage
19 from django.db.models import SET_NULL
20 from django.db.models.signals import post_save, post_delete, pre_save
21 from django.dispatch import receiver
22 from karmaworld.apps.users.models import NoteKarmaEvent, GenericKarmaEvent
23 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
24 from karmaworld.utils.filepicker import encode_fp_policy, sign_fp_policy
25 import os
26 import time
27 import urllib
28
29 from django.conf import settings
30 from django.core.files import File
31 from django.core.files.storage import FileSystemStorage
32 from django.db import models
33 from django.utils.text import slugify
34 import django_filepicker
35 from bs4 import BeautifulSoup as BS
36 from taggit.managers import TaggableManager
37
38 from karmaworld.apps.courses.models import Course
39 from karmaworld.apps.licenses.models import License
40 from karmaworld.apps.notes.search import SearchIndex
41 from karmaworld.settings.manual_unique_together import auto_add_check_unique_together
42
43 ANONYMOUS_UPLOAD_URLS = 'anonymous_upload_urls'
44
45 logger = logging.getLogger(__name__)
46 fs = FileSystemStorage(location=settings.MEDIA_ROOT)
47
48 # Dictionary for S3 upload headers
49 s3_upload_headers = {
50     'Content-Type': 'text/html',
51 }
52
53 # This is a bit hacky, but nothing else works. Grabbed this from a proper
54 # file configured via S3 management console.
55 # https://github.com/FinalsClub/karmaworld/issues/273#issuecomment-32572169
56 all_read_xml_acl = '<?xml version="1.0" encoding="UTF-8"?>\n<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="Group"><URI>http://acs.amazonaws.com/groups/global/AllUsers</URI></Grantee><Permission>READ</Permission></Grant></AccessControlList></AccessControlPolicy>'
57
58
class Document(models.Model):
    """
    An Abstract Base Class representing a document intended to be subclassed.

    Holds the fields and Filepicker plumbing shared by all uploaded document
    types (e.g. Note). Abstract: no table of its own (see Meta.abstract).
    """
    course          = models.ForeignKey(Course)
    tags            = TaggableManager(blank=True)
    name            = models.CharField(max_length=255, blank=True, null=True)
    # URL-safe identifier; generated from name on first save() if missing.
    slug            = models.SlugField(max_length=255, unique=True)

    # license if different from default
    license         = models.ForeignKey(License, blank=True, null=True)

    # provide an upstream file link
    upstream_link   = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # metadata relevant to the Upload process
    user            = models.ForeignKey(User, blank=True, null=True, on_delete=SET_NULL)
    ip              = models.GenericIPAddressField(blank=True, null=True,
                        help_text=u"IP address of the uploader")
    # default is the callable utcnow itself, so it is evaluated per-save.
    uploaded_at     = models.DateTimeField(null=True, default=datetime.datetime.utcnow)


    # if True, NEVER show this file
    # WARNING: This may throw an error on migration
    is_hidden       = models.BooleanField(default=False)

    ###
    # Everything Filepicker, now in one small area

    # Allow pick (choose files), store (upload to S3), read (from FP repo),
    # stat (status of FP repo files) for 1 year (current time + 365 * 24 * 3600
    # seconds). Generated one time, at class definition upon import. So the
    # server will need to be rebooted at least one time each year or this will
    # go stale.
    fp_policy_json = '{{"expiry": {0}, "call": ["pick","store","read","stat"]}}'
    fp_policy_json = fp_policy_json.format(int(time.time() + 31536000))
    fp_policy      = encode_fp_policy(fp_policy_json)
    fp_signature   = sign_fp_policy(fp_policy)

    # Hack because mimetypes conflict with extensions, but there is no way to
    # disable mimetypes.
    # https://github.com/Ink/django-filepicker/issues/22
    # NOTE(review): this mutates django_filepicker globally as a side effect
    # of importing this module.
    django_filepicker.forms.FPFieldMixin.default_mimetypes = ''
    # Now let django-filepicker do the heavy lifting. Sort of. Look at all those
    # parameters!
    fp_file = django_filepicker.models.FPFileField(
                # FPFileField settings
                apikey=FILEPICKER_API_KEY,
                services='COMPUTER,DROPBOX,URL,GOOGLE_DRIVE,EVERNOTE,GMAIL,BOX,FACEBOOK,FLICKR,PICASA,IMAGE_SEARCH,WEBCAM,FTP',
                additional_params={
                    'data-fp-multiple': 'true',
                    'data-fp-folders': 'true',
                    'data-fp-button-class':
                      'inline-button important add-note-btn',
                    'data-fp-button-text': 'Add Notes',
                    'data-fp-extensions':
                      '.pdf,.doc,.docx,.txt,.html,.rtf,.odt,.png,.jpg,.jpeg,.ppt,.pptx',
                    'data-fp-store-location': 'S3',
                    'data-fp-policy': fp_policy,
                    'data-fp-signature': fp_signature,
                    'type': 'filepicker',
                    'onchange': "got_file(event)",
                },
                # FileField settings
                null=True, blank=True,
                upload_to='nil', # field ignored because S3, but required.
                verbose_name='', # prevent a label from showing up
                )
    mimetype = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        abstract = True
        ordering = ['-uploaded_at']

    def _generate_unique_slug(self):
        """ generate a unique slug based on name and uploaded_at  """
        _slug = slugify(unicode(self.name))
        klass = self.__class__
        collision = klass.objects.filter(slug=_slug)
        if collision:
            # disambiguate with date parts of uploaded_at; microsecond makes
            # a same-day collision very unlikely (not guaranteed unique).
            _slug = u"{0}-{1}-{2}-{3}".format(
                    _slug, self.uploaded_at.month,
                    self.uploaded_at.day, self.uploaded_at.microsecond)
        self.slug = _slug

    def _get_fpf(self):
        """
        Memoized FilepickerFile getter. Returns FilepickerFile.
        """
        if not hasattr(self, 'cached_fpf'):
            # Fetch additional_params containing signature, etc
            aps = self.fp_file.field.additional_params
            self.cached_fpf = django_filepicker.utils.FilepickerFile(self.fp_file.name, aps)
        return self.cached_fpf

    def get_fp_url(self):
        """
        Returns the Filepicker URL for reading the upstream document.
        """
        fpf = self._get_fpf()
        # Return proper URL for reading
        return fpf.get_url()

    def get_file(self):
        """
        Downloads the file from filepicker.io and returns a Django File wrapper
        object.
        """
        # Fetch FilepickerFile
        fpf = self._get_fpf()
        # Return Django File
        return fpf.get_file()

    def save(self, *args, **kwargs):
        """Generate a slug from name on first save, then persist."""
        if self.name and not self.slug:
            self._generate_unique_slug()
        super(Document, self).save(*args, **kwargs)
176
177
class NoteManager(models.Manager):
    """ Handle restoring data. """
    def get_by_natural_key(self, fp_file, upstream_link):
        """
        Return the single Note identified by its natural key: the pair of
        Filepicker URL and upstream URL.
        """
        lookup = {'fp_file': fp_file, 'upstream_link': upstream_link}
        return self.get(**lookup)
185
186
class Note(Document):
    """
    A django model representing an uploaded file and associated metadata.
    """
    objects = NoteManager()

    # FIXME: refactor file choices after FP.io integration
    UNKNOWN_FILE = '???'
    FILE_TYPE_CHOICES = (
        ('doc', 'MS Word compatible file (.doc, .docx, .rtf, .odf)'),
        ('img', 'Scan or picture of notes'),
        ('pdf', 'PDF file'),
        ('ppt', 'Powerpoint'),
        ('txt', 'Text'),
        (UNKNOWN_FILE, 'Unknown file'),
    )

    # Mimetypes treated as PDF-like for display purposes (see is_pdf()).
    PDF_MIMETYPES = (
      'application/pdf',
      'application/vnd.ms-powerpoint',
      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )

    file_type       = models.CharField(max_length=15,
                            choices=FILE_TYPE_CHOICES,
                            default=UNKNOWN_FILE,
                            blank=True, null=True)

    # Cache the Google drive file link
    gdrive_url      = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # Upload files to MEDIA_ROOT/notes/YEAR/MONTH/DAY, 2012/10/30/filename
    pdf_file       = models.FileField(
                            storage=fs,
                            upload_to="notes/%Y/%m/%d/",
                            blank=True, null=True)

    # Generated by Google Drive but saved locally
    text            = models.TextField(blank=True, null=True)
    static_html     = models.BooleanField(default=False)

    # html is deprecated. delete once data is all sorted.
    html            = models.TextField(blank=True, null=True)

    # Academic year of course.
    # FIX: the default must be a callable so it is evaluated when each Note
    # is created. Previously it was datetime.datetime.utcnow().year, which
    # was computed once at import time and went stale if the server process
    # ran across a year boundary.
    year            = models.IntegerField(blank=True, null=True,
                        default=lambda: datetime.datetime.utcnow().year)

    # Number of times this note has been flagged as abusive/spam.
    flags           = models.IntegerField(default=0,null=False)

    # Social media tracking
    tweeted         = models.BooleanField(default=False)
    thanks          = models.PositiveIntegerField(default=0)

    class Meta:
        unique_together = ('fp_file', 'upstream_link')
        ordering = ['-uploaded_at']

    def __unicode__(self):
        return u"Note at {0} (from {1}) ({2})".format(self.fp_file, self.upstream_link, self.id)

    def natural_key(self):
        """
        A Note is uniquely defined by both the Filepicker link and the upstream
        link. The Filepicker link should be unique by itself, but it may be
        null in the database, so the upstream link component should resolve
        those cases.
        """
        # gdrive_url might also fit the bill?
        return (self.fp_file, self.upstream_link)

    def get_relative_s3_path(self):
        """
        returns s3 path relative to the appropriate bucket.
        """
        # Note.slug will be unique and brought in from RawDocument or created
        # upon save() inside RawDocument.convert_to_note(). It makes for a good
        # filename and its pretty well guaranteed to be there.
        return 'html/{0}.html'.format(self.slug)

    def send_to_s3(self, html, do_save=True):
        """
        Push the given HTML up to S3 for this Note.
        Set do_save to False if the note will be saved outside this call.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # do nothing if already uploaded.
        # Maybe run checksums if possible to confirm its really done?
        # (but then you gotta wonder was the original correct or is the new
        # one correct)
        if self.static_html:
            return
        # upload the HTML file to static host if it is not already there
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # This is a pretty ugly hackified answer to some s3boto shortcomings
            # and some decent default settings chosen by django-storages.

            # Create the new key (key == filename in S3 bucket)
            newkey = default_storage.bucket.new_key(filepath)
            # Upload data!
            newkey.set_contents_from_string(html, headers=s3_upload_headers)
            if not newkey.exists():
                raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))

            # set the permissions for everyone to read.
            newkey.set_xml_acl(all_read_xml_acl)

        # If the code reaches here, either:
        # filepath exists on S3 but static_html is not marked.
        # or
        # file was just uploaded successfully to filepath
        # Regardless, set note as uploaded.
        self.static_html = True
        if do_save:
            self.save()

    def update_note_on_s3(self, html):
        """
        Overwrite the already-uploaded S3 HTML for this Note with new HTML.
        No-op (with a warning) if the S3 object does not exist yet.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # if it's not already there then bail out
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # FIX: Logger.warn is a deprecated alias of Logger.warning.
            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
            return

        key = default_storage.bucket.get_key(filepath)
        key.set_contents_from_string(html, headers=s3_upload_headers)
        key.set_xml_acl(all_read_xml_acl)

    def get_absolute_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        if self.slug is not None:
            # return a url ending in slug
            if self.course.school:
                return reverse('note_detail', args=[self.course.school.slug, self.course.slug, self.slug])
            else:
                return reverse('note_detail', args=[self.course.department.school.slug, self.course.slug, self.slug])
        else:
            # return a url ending in id
            return reverse('note_detail', args=[self.course.school.slug, self.course.slug, self.id])

    def get_absolute_keywords_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        if self.slug is not None:
            # return a url ending in slug
            if self.course.school:
                return reverse('note_keywords', args=[self.course.school.slug, self.course.slug, self.slug])
            else:
                return reverse('note_keywords', args=[self.course.department.school.slug, self.course.slug, self.slug])
        else:
            # return a url ending in id
            return reverse('note_keywords', args=[self.course.school.slug, self.course.slug, self.id])


    def filter_html(self, html):
        """
        Apply all sanitizing filters to HTML.
        Takes in HTML string and outputs HTML string.
        """
        # Fun fact: This could be made into a static method.
        if not html or not len(html):
            # if there was no HTML, return an empty string
            return ''

        soup = BS(html)
        # Iterate through filters, applying all to the soup object.
        for soupfilter in (
            self.sanitize_anchor_html,
            self.set_canonical_link,
        ):
            soup = soupfilter(soup)
        return str(soup)

    def sanitize_anchor_html(self, soup):
        """
        Filter the given BeautifulSoup obj by adding target=_blank to all
        anchor tags.
        Returns BeautifulSoup obj.
        """
        # Fun fact: This could be made into a static method.
        # Find all a tags in the HTML
        a_tags = soup.find_all('a')
        if not a_tags or not len(a_tags):
            # nothing to process.
            return soup

        # build a tag sanitizer
        def set_attribute_target(tag):
            tag['target'] = '_blank'
        # set all anchors to have target="_blank"
        map(set_attribute_target, a_tags)

        # return filtered soup
        return soup

    @staticmethod
    def canonical_link_predicate(tag):
        """True for <link rel="canonical" ...> tags (BS find_all predicate)."""
        return tag.name == u'link' and \
            tag.has_attr('rel') and \
            u'canonical' in tag['rel']

    def set_canonical_link(self, soup):
        """
        Filter the given BeautifulSoup obj by adding
        <link rel="canonical" href="note.get_absolute_url" />
        to the document head.
        Returns BeautifulSoup obj.
        """
        domain = Site.objects.all()[0].domain
        note_full_href = 'http://' + domain + self.get_absolute_url()
        canonical_tags = soup.find_all(self.canonical_link_predicate)
        if canonical_tags:
            # rewrite any existing canonical links to point at this note
            for tag in canonical_tags:
                tag['href'] = note_full_href
        else:
            new_tag = soup.new_tag('link', rel='canonical', href=note_full_href)
            head = soup.find('head')
            head.append(new_tag)

        # return filtered soup
        return soup

    def _update_parent_updated_at(self):
        """ update the parent Course.updated_at model
            with the latest uploaded_at """
        self.course.updated_at = self.uploaded_at
        self.course.save()

    def save(self, *args, **kwargs):
        """Propagate uploaded_at to the parent Course before saving."""
        if self.uploaded_at and self.uploaded_at > self.course.updated_at:
            self._update_parent_updated_at()
        super(Note, self).save(*args, **kwargs)

    def has_markdown(self):
        """True if a NoteMarkdown row exists for this Note."""
        return hasattr(self, "notemarkdown")

    def is_pdf(self):
        """True if this Note's mimetype is rendered via the PDF viewer."""
        return self.mimetype in Note.PDF_MIMETYPES
434
435
class NoteMarkdown(models.Model):
    # One-to-one extension of Note carrying an optional markdown rendition.
    # primary_key=True: the Note's id doubles as this table's primary key,
    # and the reverse accessor is note.notemarkdown (see Note.has_markdown).
    note     = models.OneToOneField(Note, primary_key=True)
    markdown = models.TextField(blank=True, null=True)
439
440 auto_add_check_unique_together(Note)
441
442
def update_note_counts(note_instance):
    """Refresh denormalized note counts on the course and its school.

    Does nothing when the Note's course no longer exists (i.e. this call
    came from a cascade delete of the Course itself).
    """
    try:
        # Accessing the relation raises if the course row is already gone.
        note_instance.course
    except Course.DoesNotExist:
        # cascade delete in progress; nothing left to update
        return

    course = note_instance.course
    course.update_note_count()
    if course.school:
        course.school.update_note_count()
    elif course.department.school:
        course.department.school.update_note_count()
457
@receiver(pre_save, sender=Note, weak=False)
def note_pre_save_receiver(sender, **kwargs):
    """Stash the current database state of the Note on the instance itself.

    The saved copy (instance.old_instance) is consumed by the post_save
    handler to diff old vs. new values. On a brand-new Note the lookup
    misses and no attribute is set.
    """
    if 'instance' not in kwargs:
        return

    instance = kwargs['instance']
    try:
        instance.old_instance = Note.objects.get(id=instance.id)
    except ObjectDoesNotExist:
        # new Note: there is no prior state to remember
        pass
470
@receiver(post_save, sender=Note, weak=False)
def note_save_receiver(sender, **kwargs):
    """Keep counts and the search index in sync after a Note is saved."""
    if 'instance' not in kwargs:
        return

    note = kwargs['instance']
    created = kwargs['created']

    # New notes bump the course/school tallies.
    if created:
        update_note_counts(note)

    # Index errors must never break the save itself; log and move on.
    try:
        index = SearchIndex()
        if created:
            index.add_note(note)
        else:
            index.update_note(note, note.old_instance)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())
488
489
@receiver(post_delete, sender=Note, weak=False)
def note_delete_receiver(sender, **kwargs):
    """Clean up counts, search index, and karma after a Note is deleted."""
    if 'instance' not in kwargs:
        return

    note = kwargs['instance']

    # Update course and school counts of how
    # many notes they have
    update_note_counts(note)

    # Remove document from search index; log failures rather than raising.
    try:
        SearchIndex().remove_note(note)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())

    # Credit/penalize the uploader for the deletion, if one is known.
    if note.user:
        GenericKarmaEvent.create_event(note.user, note.name, GenericKarmaEvent.NOTE_DELETED)
509
510
class UserUploadMapping(models.Model):
    # Remembers that a logged-in user claims an anonymously-uploaded file
    # (identified by its Filepicker URL) whose Note did not exist yet at
    # login time. Consumed elsewhere to attach the Note to the user later.
    user = models.ForeignKey(User)
    fp_file = models.CharField(max_length=255)

    class Meta:
        # one claim per (user, file) pair
        unique_together = ('user', 'fp_file')
517
518
@receiver(user_logged_in, weak=True)
def find_orphan_notes(sender, **kwargs):
    """On login, claim notes that were uploaded anonymously in this session.

    The session stores the Filepicker URLs of anonymous uploads under
    ANONYMOUS_UPLOAD_URLS. For each URL, attach the matching Note to the
    user and grant upload karma. If no unique Note exists (not processed
    yet, or ambiguous), record a UserUploadMapping so the upload can be
    associated with the user later.
    """
    user = kwargs['user']
    s = kwargs['request'].session
    uploaded_note_urls = s.get(ANONYMOUS_UPLOAD_URLS, [])
    for uploaded_note_url in uploaded_note_urls:
        try:
            note = Note.objects.get(fp_file=uploaded_note_url)
            note.user = user
            note.save()
            NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # FIX: get_or_create instead of create — create() raised an
            # IntegrityError on the (user, fp_file) unique_together when the
            # same session URL list was processed on a repeat login. Also
            # drops the redundant mapping.save(): create()/get_or_create()
            # already persist the row.
            UserUploadMapping.objects.get_or_create(
                fp_file=uploaded_note_url, user=user)
533