WIP: Note editing, markdown to html
[oweals/karmaworld.git] / karmaworld / apps / notes / models.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 """
6     Models for the notes django app.
7     Contains only the minimum for handling files and their representation
8 """
9 import datetime
10 import traceback
11 import logging
12 from allauth.account.signals import user_logged_in
13 from django.contrib.auth.models import User
14 from django.contrib.sites.models import Site
15 from django.core.urlresolvers import reverse
16 from django.utils.safestring import mark_safe
17 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
18 from django.core.files.storage import default_storage
19 from django.db.models import SET_NULL
20 from django.db.models.signals import post_save, post_delete, pre_save
21 from django.dispatch import receiver
22 from karmaworld.apps.users.models import NoteKarmaEvent, GenericKarmaEvent
23 from karmaworld.utils.filepicker import encode_fp_policy, sign_fp_policy
24 import os
25 import time
26 import urllib
27
28 from django.conf import settings
29 from django.core.files import File
30 from django.core.files.storage import FileSystemStorage
31 from django.db import models
32 from django.utils.text import slugify
33 import django_filepicker
34 from bs4 import BeautifulSoup as BS
35 from taggit.managers import TaggableManager
36 import bleach
37 import bleach_whitelist
38 import markdown
39
40 from karmaworld.apps.courses.models import Course
41 from karmaworld.apps.licenses.models import License
42 from karmaworld.apps.notes.search import SearchIndex
43 from karmaworld.settings.manual_unique_together import auto_add_check_unique_together
44
# Filepicker.io API key. Required: raises KeyError at import time if unset,
# which fails fast rather than at first upload.
FILEPICKER_API_KEY = os.environ['FILEPICKER_API_KEY']

# Session key under which anonymous uploads' Filepicker URLs are stashed
# until the uploader logs in (see find_orphan_notes below).
ANONYMOUS_UPLOAD_URLS = 'anonymous_upload_urls'
# Number of "thanks" a note needs before Mechanical Turk keyword work kicks in.
KEYWORD_MTURK_THRESHOLD = 3

logger = logging.getLogger(__name__)
# Local filesystem storage rooted at MEDIA_ROOT.
# NOTE(review): `fs` appears unused within this module -- confirm external
# users before removing.
fs = FileSystemStorage(location=settings.MEDIA_ROOT)

# Dictionary for S3 upload headers
s3_upload_headers = {
    'Content-Type': 'text/html',
}
57
58 # This is a bit hacky, but nothing else works. Grabbed this from a proper
59 # file configured via S3 management console.
60 # https://github.com/FinalsClub/karmaworld/issues/273#issuecomment-32572169
61 all_read_xml_acl = '<?xml version="1.0" encoding="UTF-8"?>\n<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="Group"><URI>http://acs.amazonaws.com/groups/global/AllUsers</URI></Grantee><Permission>READ</Permission></Grant></AccessControlList></AccessControlPolicy>'
62
63
class Document(models.Model):
    """
    An Abstract Base Class representing a document intended to be subclassed.

    Provides course association, tagging, name/slug handling, licensing,
    uploader metadata and a Filepicker-backed file field for subclasses.
    """
    course          = models.ForeignKey(Course)
    tags            = TaggableManager(blank=True)
    name            = models.CharField(max_length=255, blank=True, null=True)
    slug            = models.SlugField(max_length=255, unique=True)

    # Closed set of document categories used by the `category` field.
    LECTURE_NOTES = 'LECTURE_NOTES'
    STUDY_GUIDE = 'STUDY_GUIDE'
    SYLLABUS = 'SYLLABUS'
    ASSIGNMENT = 'ASSIGNMENT'
    OTHER = 'OTHER'
    NOTE_CATEGORIES = (
        (LECTURE_NOTES, 'Lecture Notes'),
        (STUDY_GUIDE, 'Study Guide'),
        (SYLLABUS, 'Syllabus'),
        (ASSIGNMENT, 'Assignment'),
        (OTHER, 'Other'),
    )
    category = models.CharField(max_length=50, choices=NOTE_CATEGORIES, blank=True, null=True)

    # license if different from default
    license         = models.ForeignKey(License, blank=True, null=True)

    # provide an upstream file link
    upstream_link   = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # metadata relevant to the Upload process
    user            = models.ForeignKey(User, blank=True, null=True, on_delete=SET_NULL)
    ip              = models.GenericIPAddressField(blank=True, null=True,
                        help_text=u"IP address of the uploader")
    # NOTE(review): naive UTC timestamp via datetime.utcnow; if settings.USE_TZ
    # is enabled this should become a timezone-aware callable -- confirm.
    uploaded_at     = models.DateTimeField(null=True, default=datetime.datetime.utcnow)

    # if True, NEVER show this file
    # WARNING: This may throw an error on migration
    is_hidden       = models.BooleanField(default=False)

    ###
    # Everything Filepicker, now in one small area

    # Allow pick (choose files), store (upload to S3), read (from FP repo),
    # stat (status of FP repo files) for 1 year (current time + 365 * 24 * 3600
    # seconds). Generated one time, at class definition upon import. So the
    # server will need to be rebooted at least one time each year or this will
    # go stale.
    fp_policy_json = '{{"expiry": {0}, "call": ["pick","store","read","stat"]}}'
    fp_policy_json = fp_policy_json.format(int(time.time() + 31536000))
    fp_policy      = encode_fp_policy(fp_policy_json)
    fp_signature   = sign_fp_policy(fp_policy)

    # Hack because mimetypes conflict with extensions, but there is no way to
    # disable mimetypes.
    # https://github.com/Ink/django-filepicker/issues/22
    django_filepicker.forms.FPFieldMixin.default_mimetypes = ''
    # Now let django-filepicker do the heavy lifting. Sort of. Look at all those
    # parameters!
    fp_file = django_filepicker.models.FPFileField(
                # FPFileField settings
                apikey=FILEPICKER_API_KEY,
                services='COMPUTER,DROPBOX,URL,GOOGLE_DRIVE,EVERNOTE,GMAIL,BOX,FACEBOOK,FLICKR,PICASA,IMAGE_SEARCH,WEBCAM,FTP',
                additional_params={
                    'data-fp-multiple': 'true',
                    'data-fp-folders': 'true',
                    'data-fp-button-class':
                      'inline-button important add-note-btn',
                    'data-fp-button-text': 'Add Notes',
                    'data-fp-extensions':
                      '.pdf,.doc,.docx,.txt,.html,.rtf,.odt,.png,.jpg,.jpeg,.ppt,.pptx',
                    'data-fp-store-location': 'S3',
                    'data-fp-policy': fp_policy,
                    'data-fp-signature': fp_signature,
                    'type': 'filepicker',
                    'onchange': "got_file(event)",
                },
                # FileField settings
                null=True, blank=True,
                upload_to='nil', # field ignored because S3, but required.
                verbose_name='', # prevent a label from showing up
                )
    mimetype = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        abstract = True
        ordering = ['-uploaded_at']

    def _generate_unique_slug(self):
        """
        Set self.slug to a slug of the name; on collision, append pieces of
        uploaded_at (month/day/microsecond) to disambiguate.
        """
        _slug = slugify(unicode(self.name))
        # exists() issues a cheap EXISTS query instead of fetching every
        # colliding row only to test the queryset's truthiness.
        if self.__class__.objects.filter(slug=_slug).exists():
            _slug = u"{0}-{1}-{2}-{3}".format(
                    _slug, self.uploaded_at.month,
                    self.uploaded_at.day, self.uploaded_at.microsecond)
        self.slug = _slug

    def _get_fpf(self):
        """
        Memoized FilepickerFile getter. Returns FilepickerFile.
        """
        if not hasattr(self, 'cached_fpf'):
            # Fetch additional_params containing signature, etc
            aps = self.fp_file.field.additional_params
            self.cached_fpf = django_filepicker.utils.FilepickerFile(self.fp_file.name, aps)
        return self.cached_fpf

    def get_fp_url(self):
        """
        Returns the Filepicker URL for reading the upstream document.
        """
        return self._get_fpf().get_url()

    def get_file(self):
        """
        Downloads the file from filepicker.io and returns a Django File
        wrapper object.
        """
        return self._get_fpf().get_file()

    def save(self, *args, **kwargs):
        """Generate a slug from the name on first save, then defer to Django."""
        if self.name and not self.slug:
            self._generate_unique_slug()
        super(Document, self).save(*args, **kwargs)
195
196
class NoteManager(models.Manager):
    """Manager adding natural-key support so Note fixtures can be restored."""

    def get_by_natural_key(self, fp_file, upstream_link):
        """Return the single Note matching both the Filepicker and upstream URLs."""
        return self.get(fp_file=fp_file, upstream_link=upstream_link)
204
205
class Note(Document):
    """
    A django model representing an uploaded file and associated metadata.
    """
    objects = NoteManager()

    # Mimetypes treated as PDF-like for display purposes.
    PDF_MIMETYPES = (
      'application/pdf',
      'application/vnd.ms-powerpoint',
      'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )

    # Cache the Google drive file link
    gdrive_url      = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # Generated by Google Drive but saved locally
    text            = models.TextField(blank=True, null=True)

    # Number of times this note has been flagged as abusive/spam.
    flags           = models.IntegerField(default=0,null=False)

    # Social media tracking
    tweeted         = models.BooleanField(default=False)
    thanks          = models.PositiveIntegerField(default=0)

    class Meta:
        unique_together = ('fp_file', 'upstream_link')
        ordering = ['-uploaded_at']

    def __unicode__(self):
        return u"Note at {0} (from {1}) ({2})".format(self.fp_file, self.upstream_link, self.id)

    def natural_key(self):
        """
        A Note is uniquely defined by both the Filepicker link and the upstream
        link. The Filepicker link should be unique by itself, but it may be
        null in the database, so the upstream link component should resolve
        those cases.
        """
        # gdrive_url might also fit the bill?
        return (self.fp_file, self.upstream_link)

    def get_relative_s3_path(self):
        """
        returns s3 path relative to the appropriate bucket.
        """
        # Note.slug will be unique and brought in from RawDocument or created
        # upon save() inside RawDocument.convert_to_note(). It makes for a good
        # filename and its pretty well guaranteed to be there.
        return 'html/{0}.html'.format(self.slug)

    def send_to_s3(self, html, do_save=True):
        """
        Push the given HTML up to S3 for this Note.
        Set do_save to False if the note will be saved outside this call.
        """
        # do nothing if HTML is empty.
        if not html:
            return
        # upload the HTML file to static host only if it is not already there
        filepath = self.get_relative_s3_path()
        if default_storage.exists(filepath):
            return
        # This is a pretty ugly hackified answer to some s3boto shortcomings
        # and some decent default settings chosen by django-storages.

        # Create the new key (key == filename in S3 bucket)
        newkey = default_storage.bucket.new_key(filepath)
        # Upload data!
        newkey.set_contents_from_string(html, headers=s3_upload_headers)
        if not newkey.exists():
            raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))

        # set the permissions for everyone to read.
        newkey.set_xml_acl(all_read_xml_acl)

    def update_note_on_s3(self, html):
        """
        Overwrite this note's existing HTML document on S3.
        Does nothing if the HTML is empty or the S3 object does not exist yet.
        """
        if not html:
            return
        # if it's not already there then bail out
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # logger.warn is deprecated; warning() is the canonical spelling.
            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
            return

        key = default_storage.bucket.get_key(filepath)
        key.set_contents_from_string(html, headers=s3_upload_headers)
        key.set_xml_acl(all_read_xml_acl)

    def remaining_thanks_for_mturk(self):
        """Thanks still needed before the Mechanical Turk threshold is met."""
        return KEYWORD_MTURK_THRESHOLD - self.thanks

    def total_thanks_for_mturk(self):
        """Total thanks required to hit the Mechanical Turk threshold."""
        return KEYWORD_MTURK_THRESHOLD

    def _school_slug(self):
        """
        Slug of the school for this note's course, falling back to the
        department's school when the course has no direct school link.
        """
        if self.course.school:
            return self.course.school.slug
        return self.course.department.school.slug

    def _resolve_url(self, route_name):
        """
        Reverse route_name for this note: use the slug when set, otherwise
        the primary key. Shared by the three get_absolute_*_url methods.
        """
        identifier = self.slug if self.slug else self.id
        return reverse(route_name,
                       args=[self._school_slug(), self.course.slug, identifier])

    def get_absolute_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        # Bug fix: the id fallback previously assumed course.school was set
        # and crashed for department-only courses; _school_slug handles both.
        return self._resolve_url('note_detail')

    def get_absolute_keywords_url(self):
        """ Resolve note keywords url via slug if set, otherwise note.id. """
        return self._resolve_url('note_keywords')

    def get_absolute_quiz_url(self):
        """ Resolve note quiz url via slug if set, otherwise note.id. """
        return self._resolve_url('note_quiz')

    def filter_html(self, html):
        """
        Apply all sanitizing filters to HTML.
        Takes in HTML string and outputs HTML string.
        """
        # Fun fact: This could be made into a static method.
        if not html:
            # if there was no HTML, return an empty string
            return ''

        soup = BS(html)
        # Iterate through filters, applying all to the soup object.
        for soupfilter in (
            self.sanitize_anchor_html,
            self.set_canonical_link,
        ):
            soup = soupfilter(soup)
        return str(soup)

    def sanitize_anchor_html(self, soup):
        """
        Filter the given BeautifulSoup obj by adding target=_blank to all
        anchor tags.
        Returns BeautifulSoup obj.
        """
        # Fun fact: This could be made into a static method.
        # A plain loop replaces the previous side-effecting map() call,
        # which would silently do nothing under Python 3's lazy map.
        for anchor in soup.find_all('a'):
            anchor['target'] = '_blank'

        # return filtered soup
        return soup

    @staticmethod
    def canonical_link_predicate(tag):
        """True for <link rel="canonical" ...> tags."""
        return tag.name == u'link' and \
            tag.has_attr('rel') and \
            u'canonical' in tag['rel']

    def set_canonical_link(self, soup):
        """
        Filter the given BeautifulSoup obj by adding
        <link rel="canonical" href="note.get_absolute_url" />
        to the document head.
        Returns BeautifulSoup obj.
        """
        domain = Site.objects.all()[0].domain
        note_full_href = 'http://' + domain + self.get_absolute_url()
        canonical_tags = soup.find_all(self.canonical_link_predicate)
        if canonical_tags:
            # Rewrite any existing canonical links in place.
            for tag in canonical_tags:
                tag['href'] = note_full_href
        else:
            new_tag = soup.new_tag('link', rel='canonical', href=note_full_href)
            head = soup.find('head')
            # Robustness: HTML fragments may lack a <head>; previously this
            # raised AttributeError instead of skipping gracefully.
            if head is not None:
                head.append(new_tag)

        # return filtered soup
        return soup

    def _update_parent_updated_at(self):
        """ update the parent Course.updated_at model
            with the latest uploaded_at """
        self.course.updated_at = self.uploaded_at
        self.course.save()

    def save(self, *args, **kwargs):
        """Propagate a newer uploaded_at to the parent course, then save."""
        if self.uploaded_at and self.uploaded_at > self.course.updated_at:
            self._update_parent_updated_at()
        super(Note, self).save(*args, **kwargs)

    def has_markdown(self):
        """True when a related NoteMarkdown row exists for this note."""
        return hasattr(self, "notemarkdown")

    def is_pdf(self):
        """True when the mimetype is one rendered via the PDF pipeline."""
        return self.mimetype in Note.PDF_MIMETYPES
427
428
class NoteMarkdown(models.Model):
    """
    Markdown source and rendered, sanitized HTML for a Note (one-to-one).
    """
    note     = models.OneToOneField(Note, primary_key=True)
    markdown = models.TextField(blank=True, null=True)
    html     = models.TextField(blank=True, null=True)

    @classmethod
    def sanitize(cls, html):
        """Strip any tags/attributes outside the markdown whitelist."""
        return bleach.clean(html,
                bleach_whitelist.markdown_tags,
                bleach_whitelist.markdown_attrs,
                strip=True)

    def save(self, *args, **kwargs):
        """Render markdown to HTML when HTML is absent; always sanitize HTML."""
        if self.markdown and not self.html:
            self.html = markdown.markdown(self.markdown)
        # Bug fix: only sanitize when HTML is present. Previously a row with
        # neither markdown nor html passed None to bleach.clean, which raises.
        if self.html:
            self.html = NoteMarkdown.sanitize(self.html)
        super(NoteMarkdown, self).save(*args, **kwargs)
446
447 auto_add_check_unique_together(Note)
448
449
def update_note_counts(note_instance):
    """
    Refresh the cached thank/note counts on the note's course and school.
    Does nothing when the parent course is already gone (cascade delete).
    """
    try:
        course = note_instance.course
    except Course.DoesNotExist:
        # Cascade delete in progress: there is no course left to update.
        return

    course.update_thank_count()
    course.update_note_count()
    # A course references its school directly or through its department.
    school = course.school or course.department.school
    if school:
        school.update_note_count()
465
@receiver(pre_save, sender=Note, weak=False)
def note_pre_save_receiver(sender, **kwargs):
    """
    Attach the current database state of a Note onto the instance being
    saved (as .old_instance) so note_save_receiver can diff old vs new.
    New notes have no prior row, so the attribute is simply left unset.
    """
    if 'instance' not in kwargs:
        return
    instance = kwargs['instance']
    try:
        instance.old_instance = Note.objects.get(id=instance.id)
    except ObjectDoesNotExist:
        # First save of a brand-new note: nothing to remember.
        pass
478
@receiver(post_save, sender=Note, weak=False)
def note_save_receiver(sender, **kwargs):
    """
    After a Note is saved: refresh denormalized counts, then mirror the
    change into the external search index. Index failures are logged and
    swallowed so a search outage never blocks a save.
    """
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    update_note_counts(note)

    try:
        search_index = SearchIndex()
        if kwargs['created']:
            search_index.add_note(note)
        else:
            # old_instance was stashed on the note by note_pre_save_receiver.
            search_index.update_note(note, note.old_instance)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())
496
497
@receiver(post_delete, sender=Note, weak=False)
def note_delete_receiver(sender, **kwargs):
    """
    After a Note is deleted: refresh course/school counts, drop the note
    from the external search index (best effort), and record a karma event
    for the uploader when one is known.
    """
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    # Update course and school counts of how many notes they have.
    update_note_counts(note)

    # Remove document from search index; log-and-continue on failure.
    try:
        SearchIndex().remove_note(note)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())

    if note.user:
        GenericKarmaEvent.create_event(note.user, note.name,
                                       GenericKarmaEvent.NOTE_DELETED)
517
518
class UserUploadMapping(models.Model):
    """
    Associates a Filepicker upload URL with a user account.

    Written by find_orphan_notes when an anonymous upload cannot be matched
    to exactly one Note at login time; presumably consumed later to credit
    the uploader once the Note exists -- confirm against the upload views.
    """
    user = models.ForeignKey(User)
    fp_file = models.CharField(max_length=255)

    class Meta:
        unique_together = ('user', 'fp_file')
525
526
@receiver(user_logged_in, weak=True)
def find_orphan_notes(sender, **kwargs):
    """
    On login, claim any notes the user uploaded anonymously this session.

    Filepicker URLs of anonymous uploads are stashed in the session under
    ANONYMOUS_UPLOAD_URLS. Each URL matching exactly one Note is credited
    to the user (with an upload karma event); otherwise a UserUploadMapping
    row records the association for later resolution.
    """
    user = kwargs['user']
    session = kwargs['request'].session
    for uploaded_note_url in session.get(ANONYMOUS_UPLOAD_URLS, []):
        try:
            note = Note.objects.get(fp_file=uploaded_note_url)
            note.user = user
            note.save()
            NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # Note missing or ambiguous: remember the mapping instead.
            # objects.create() already persists the row; the previous extra
            # mapping.save() call issued a redundant second write.
            UserUploadMapping.objects.create(fp_file=uploaded_note_url, user=user)
541