45c4caaac162247d3da29767ad52686342958467
[oweals/karmaworld.git] / karmaworld / apps / notes / models.py
1 #!/usr/bin/env python
2 # -*- coding:utf8 -*-
3 # Copyright (C) 2012  FinalsClub Foundation
4
5 """
6     Models for the notes django app.
7     Contains only the minimum for handling files and their representation
8 """
9 import datetime
10 import traceback
11 import logging
12 from allauth.account.signals import user_logged_in
13 from django.contrib.auth.models import User
14 from django.contrib.sites.models import Site
15 from django.utils.safestring import mark_safe
16 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
17 from django.core.files.storage import default_storage
18 from django.db.models import SET_NULL
19 from django.db.models.signals import post_save, post_delete, pre_save
20 from django.dispatch import receiver
21 from karmaworld.apps.users.models import NoteKarmaEvent, GenericKarmaEvent
22 from karmaworld.secret.filepicker import FILEPICKER_API_KEY
23 from karmaworld.utils.filepicker import encode_fp_policy, sign_fp_policy
24 import os
25 import time
26 import urllib
27
28 from django.conf import settings
29 from django.core.files import File
30 from django.core.files.storage import FileSystemStorage
31 from django.db import models
32 from django.utils.text import slugify
33 import django_filepicker
34 from bs4 import BeautifulSoup as BS
35 from taggit.managers import TaggableManager
36
37 from karmaworld.apps.courses.models import Course
38 from karmaworld.apps.licenses.models import License
39 from karmaworld.apps.notes.search import SearchIndex
40 from karmaworld.settings.manual_unique_together import auto_add_check_unique_together
41
42 ANONYMOUS_UPLOAD_URLS = 'anonymous_upload_urls'
43
44 logger = logging.getLogger(__name__)
45 fs = FileSystemStorage(location=settings.MEDIA_ROOT)
46
47 # Dictionary for S3 upload headers
48 s3_upload_headers = {
49     'Content-Type': 'text/html',
50 }
51
52 # This is a bit hacky, but nothing else works. Grabbed this from a proper
53 # file configured via S3 management console.
54 # https://github.com/FinalsClub/karmaworld/issues/273#issuecomment-32572169
55 all_read_xml_acl = '<?xml version="1.0" encoding="UTF-8"?>\n<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>READ_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>710efc05767903a0eae5064bbc541f1c8e68f8f344fa809dc92682146b401d9c</ID><DisplayName>Andrew</DisplayName></Grantee><Permission>WRITE_ACP</Permission></Grant><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="Group"><URI>http://acs.amazonaws.com/groups/global/AllUsers</URI></Grantee><Permission>READ</Permission></Grant></AccessControlList></AccessControlPolicy>'
56
57
class Document(models.Model):
    """
    An Abstract Base Class representing a document intended to be subclassed.

    Provides shared upload metadata (user, ip, uploaded_at), licensing,
    unique slug generation, and the Filepicker upload field used by
    subclasses such as Note.
    """
    course          = models.ForeignKey(Course)
    tags            = TaggableManager(blank=True)
    name            = models.CharField(max_length=255, blank=True, null=True)
    slug            = models.SlugField(max_length=255, unique=True)

    # license if different from default
    license         = models.ForeignKey(License, blank=True, null=True)

    # provide an upstream file link
    upstream_link   = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # metadata relevant to the Upload process
    user            = models.ForeignKey(User, blank=True, null=True, on_delete=SET_NULL)
    ip              = models.GenericIPAddressField(blank=True, null=True,
                        help_text=u"IP address of the uploader")
    # Callable default: evaluated per-row at creation time.
    uploaded_at     = models.DateTimeField(null=True, default=datetime.datetime.utcnow)


    # if True, NEVER show this file
    # WARNING: This may throw an error on migration
    is_hidden       = models.BooleanField(default=False)

    ###
    # Everything Filepicker, now in one small area

    # Allow pick (choose files), store (upload to S3), read (from FP repo),
    # stat (status of FP repo files) for 1 year (current time + 365 * 24 * 3600
    # seconds). Generated one time, at class definition upon import. So the
    # server will need to be rebooted at least one time each year or this will
    # go stale.
    fp_policy_json = '{{"expiry": {0}, "call": ["pick","store","read","stat"]}}'
    fp_policy_json = fp_policy_json.format(int(time.time() + 31536000))
    fp_policy      = encode_fp_policy(fp_policy_json)
    fp_signature   = sign_fp_policy(fp_policy)

    # Hack because mimetypes conflict with extensions, but there is no way to
    # disable mimetypes.
    # https://github.com/Ink/django-filepicker/issues/22
    django_filepicker.forms.FPFieldMixin.default_mimetypes = ''
    # Now let django-filepicker do the heavy lifting. Sort of. Look at all those
    # parameters!
    fp_file = django_filepicker.models.FPFileField(
                # FPFileField settings
                apikey=FILEPICKER_API_KEY,
                services='COMPUTER,DROPBOX,URL,GOOGLE_DRIVE,EVERNOTE,GMAIL,BOX,FACEBOOK,FLICKR,PICASA,IMAGE_SEARCH,WEBCAM,FTP',
                additional_params={
                    'data-fp-multiple': 'true',
                    'data-fp-folders': 'true',
                    'data-fp-button-class':
                      'add-note-btn small-10 columns large-4',
                    'data-fp-button-text':
                      mark_safe("<i class='fa fa-arrow-circle-o-up'></i> add notes"),
                    'data-fp-drag-class':
                      'dragdrop show-for-medium-up large-7 columns',
                    'data-fp-drag-text': 'Drop Some Knowledge',
                    'data-fp-extensions':
                      '.pdf,.doc,.docx,.txt,.html,.rtf,.odt,.png,.jpg,.jpeg,.ppt,.pptx',
                    'data-fp-store-location': 'S3',
                    'data-fp-policy': fp_policy,
                    'data-fp-signature': fp_signature,
                    'onchange': "got_file(event)",
                },
                # FileField settings
                null=True, blank=True,
                upload_to='nil', # field ignored because S3, but required.
                verbose_name='', # prevent a label from showing up
                )
    mimetype = models.CharField(max_length=255, blank=True, null=True)

    class Meta:
        abstract = True
        ordering = ['-uploaded_at']

    def _generate_unique_slug(self):
        """ generate a unique slug based on name and uploaded_at  """
        _slug = slugify(unicode(self.name))
        klass = self.__class__
        # exists() asks the database a cheap yes/no question instead of
        # materializing whole rows just to test for a collision.
        if klass.objects.filter(slug=_slug).exists():
            # Disambiguate with pieces of the upload timestamp; the
            # microsecond component makes a further collision very unlikely.
            _slug = u"{0}-{1}-{2}-{3}".format(
                    _slug, self.uploaded_at.month,
                    self.uploaded_at.day, self.uploaded_at.microsecond)
        self.slug = _slug

    def _get_fpf(self):
        """
        Memoized FilepickerFile getter. Returns FilepickerFile.
        """
        if not hasattr(self, 'cached_fpf'):
            # Fetch additional_params containing signature, etc
            aps = self.fp_file.field.additional_params
            self.cached_fpf = django_filepicker.utils.FilepickerFile(self.fp_file.name, aps)
        return self.cached_fpf

    def get_fp_url(self):
        """
        Returns the Filepicker URL for reading the upstream document,
        or None when no Filepicker file is attached.
        """
        # Fetch FilepickerFile
        if not self.fp_file.name:
            return None

        fpf = self._get_fpf()
        # Return proper URL for reading
        return fpf.get_url()

    def get_file(self):
        """
        Downloads the file from filepicker.io and returns a Django File wrapper
        object.
        """
        # Fetch FilepickerFile
        fpf = self._get_fpf()
        # Return Django File
        return fpf.get_file()

    def save(self, *args, **kwargs):
        # Generate the slug exactly once: the first time a named document
        # is saved without one.
        if self.name and not self.slug:
            self._generate_unique_slug()
        super(Document, self).save(*args, **kwargs)
182
183
class NoteManager(models.Manager):
    """Custom manager that supports natural-key deserialization of Notes."""

    def get_by_natural_key(self, fp_file, upstream_link):
        """Fetch the single Note identified by its Filepicker URL together
        with its upstream URL (the pair forms the natural key)."""
        return self.get(fp_file=fp_file, upstream_link=upstream_link)
191
192
class Note(Document):
    """
    A django model representing an uploaded file and associated metadata.

    Inherits upload metadata and Filepicker integration from Document, and
    adds file typing, converted-HTML handling (stored on S3), search-index
    hooks (via signal receivers below), and social/karma counters.
    """
    objects = NoteManager()

    # FIXME: refactor file choices after FP.io integration
    UNKNOWN_FILE = '???'
    FILE_TYPE_CHOICES = (
        ('doc', 'MS Word compatible file (.doc, .docx, .rtf, .odf)'),
        ('img', 'Scan or picture of notes'),
        ('pdf', 'PDF file'),
        ('ppt', 'Powerpoint'),
        ('txt', 'Text'),
        (UNKNOWN_FILE, 'Unknown file'),
    )

    file_type       = models.CharField(max_length=15,
                            choices=FILE_TYPE_CHOICES,
                            default=UNKNOWN_FILE,
                            blank=True, null=True)

    # Cache the Google drive file link
    gdrive_url      = models.URLField(max_length=1024, blank=True, null=True, unique=True)

    # Upload files to MEDIA_ROOT/notes/YEAR/MONTH/DAY, 2012/10/30/filename
    pdf_file       = models.FileField(
                            storage=fs,
                            upload_to="notes/%Y/%m/%d/",
                            blank=True, null=True)

    # Generated by Google Drive but saved locally
    text            = models.TextField(blank=True, null=True)
    static_html     = models.BooleanField(default=False)

    # html is deprecated. delete once data is all sorted.
    html            = models.TextField(blank=True, null=True)

    # Academic year of course.
    # BUG FIX: the default used to be `datetime.datetime.utcnow().year`,
    # which was evaluated once at import time and went stale after New Year
    # until the process restarted. A callable defers evaluation to each row
    # creation instead.
    # NOTE(review): if the migration tool cannot freeze a lambda, hoist this
    # into a named module-level function.
    year            = models.IntegerField(blank=True, null=True,
                        default=lambda: datetime.datetime.utcnow().year)

    # Number of times this note has been flagged as abusive/spam.
    flags           = models.IntegerField(default=0, null=False)

    # Social media tracking
    tweeted         = models.BooleanField(default=False)
    thanks          = models.PositiveIntegerField(default=0)

    class Meta:
        unique_together = ('fp_file', 'upstream_link')
        ordering = ['-uploaded_at']

    def __unicode__(self):
        return u"Note at {0} (from {1})".format(self.fp_file, self.upstream_link)

    def natural_key(self):
        """
        A Note is uniquely defined by both the Filepicker link and the upstream
        link. The Filepicker link should be unique by itself, but it may be
        null in the database, so the upstream link component should resolve
        those cases.
        """
        # gdrive_url might also fit the bill?
        return (self.fp_file, self.upstream_link)

    def get_relative_s3_path(self):
        """
        Returns this note's HTML path relative to the appropriate S3 bucket.
        """
        # Note.slug will be unique and brought in from RawDocument or created
        # upon save() inside RawDocument.convert_to_note(). It makes for a good
        # filename and its pretty well guaranteed to be there.
        return 'html/{0}.html'.format(self.slug)

    def send_to_s3(self, html, do_save=True):
        """
        Push the given HTML up to S3 for this Note.
        Set do_save to False if the note will be saved outside this call.
        No-op when html is empty or the note is already marked uploaded.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # do nothing if already uploaded.
        # Maybe run checksums if possible to confirm its really done?
        # (but then you gotta wonder was the original correct or is the new
        # one correct)
        if self.static_html:
            return
        # upload the HTML file to static host if it is not already there
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # This is a pretty ugly hackified answer to some s3boto shortcomings
            # and some decent default settings chosen by django-storages.

            # Create the new key (key == filename in S3 bucket)
            newkey = default_storage.bucket.new_key(filepath)
            # Upload data!
            newkey.set_contents_from_string(html, headers=s3_upload_headers)
            if not newkey.exists():
                raise LookupError('Unable to find uploaded S3 document {0}'.format(str(newkey)))

            # set the permissions for everyone to read.
            newkey.set_xml_acl(all_read_xml_acl)

        # If the code reaches here, either:
        # filepath exists on S3 but static_html is not marked.
        # or
        # file was just uploaded successfully to filepath
        # Regardless, set note as uploaded.
        self.static_html = True
        if do_save:
            self.save()

    def update_note_on_s3(self, html):
        """
        Overwrite the existing S3 HTML object for this note with new HTML.
        No-op when html is empty; bails with a warning if the object does
        not already exist on S3.
        """
        # do nothing if HTML is empty.
        if not html or not len(html):
            return
        # if it's not already there then bail out
        filepath = self.get_relative_s3_path()
        if not default_storage.exists(filepath):
            # logger.warning: warn() is a deprecated alias of warning().
            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
            return

        key = default_storage.bucket.get_key(filepath)
        key.set_contents_from_string(html, headers=s3_upload_headers)
        key.set_xml_acl(all_read_xml_acl)

    def get_absolute_url(self):
        """ Resolve note url, use 'note' route and slug if slug
            otherwise use note.id
        """
        if self.slug is not None:
            # return a url ending in slug
            return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.slug)
        else:
            # return a url ending in id
            return u"/{0}/{1}/{2}".format(self.course.school.slug, self.course.slug, self.id)

    def filter_html(self, html):
        """
        Apply all sanitizing filters to HTML.
        Takes in HTML string and outputs HTML string.
        """
        # Fun fact: This could be made into a static method.
        if not html or not len(html):
            # if there was no HTML, return an empty string
            return ''

        soup = BS(html)
        # Iterate through filters, applying all to the soup object.
        for soupfilter in (
            self.sanitize_anchor_html,
            self.set_canonical_link,
        ):
            soup = soupfilter(soup)
        return str(soup)

    def sanitize_anchor_html(self, soup):
        """
        Filter the given BeautifulSoup obj by adding target=_blank to all
        anchor tags.
        Returns BeautifulSoup obj.
        """
        # Fun fact: This could be made into a static method.
        # Find all a tags in the HTML
        a_tags = soup.find_all('a')
        if not a_tags or not len(a_tags):
            # nothing to process.
            return soup

        # build a tag sanitizer
        def set_attribute_target(tag):
            tag['target'] = '_blank'
        # set all anchors to have target="_blank"
        # (Python 2 map() is eager, so this runs the mutation immediately.)
        map(set_attribute_target, a_tags)

        # return filtered soup
        return soup

    @staticmethod
    def canonical_link_predicate(tag):
        # True for <link rel="canonical" ...> tags.
        return tag.name == u'link' and \
            tag.has_attr('rel') and \
            u'canonical' in tag['rel']

    def set_canonical_link(self, soup):
        """
        Filter the given BeautifulSoup obj by adding
        <link rel="canonical" href="note.get_absolute_url" />
        to the document head.
        Returns BeautifulSoup obj.
        """
        domain = Site.objects.all()[0].domain
        note_full_href = 'http://' + domain + self.get_absolute_url()
        canonical_tags = soup.find_all(self.canonical_link_predicate)
        if canonical_tags:
            for tag in canonical_tags:
                tag['href'] = note_full_href
        else:
            new_tag = soup.new_tag('link', rel='canonical', href=note_full_href)
            # NOTE(review): assumes the document has a <head>; head would be
            # None for fragment HTML and append() would raise. Confirm inputs
            # are full documents.
            head = soup.find('head')
            head.append(new_tag)

        # return filtered soup
        return soup

    def _update_parent_updated_at(self):
        """ update the parent Course.updated_at model
            with the latest uploaded_at """
        self.course.updated_at = self.uploaded_at
        self.course.save()

    def save(self, *args, **kwargs):
        # Bubble the newest upload time up to the parent course before
        # persisting this note.
        if self.uploaded_at and self.uploaded_at > self.course.updated_at:
            self._update_parent_updated_at()
        super(Note, self).save(*args, **kwargs)
410
411
412 auto_add_check_unique_together(Note)
413
414
def update_note_counts(note_instance):
    """Refresh the cached note counts on the note's course and school.

    Silently does nothing when the course row no longer exists, i.e. the
    note is going away as part of a cascade delete.
    """
    try:
        course = note_instance.course
    except Course.DoesNotExist:
        # Cascade delete in progress; there is no course left to update.
        return
    course.update_note_count()
    course.school.update_note_count()
426
@receiver(pre_save, sender=Note, weak=False)
def note_pre_save_receiver(sender, **kwargs):
    """Stash the pre-save database state of the given Note on the instance
    itself (as ``old_instance``) so the post_save receiver can diff the new
    state against it when updating the search index."""
    if 'instance' not in kwargs:
        return

    try:
        kwargs['instance'].old_instance = Note.objects.get(id=kwargs['instance'].id)
    except ObjectDoesNotExist:
        # Brand new Note: there is no previous state to stash.
        pass
439
@receiver(post_save, sender=Note, weak=False)
def note_save_receiver(sender, **kwargs):
    """After a Note is saved: on creation, refresh course/school note
    counts; always sync the note to the IndexDen search index (add on
    create, otherwise update using the pre-save snapshot stashed by
    note_pre_save_receiver)."""
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    if kwargs['created']:
        update_note_counts(note)

    try:
        index = SearchIndex()
        if kwargs['created']:
            index.add_note(note)
        else:
            index.update_note(note, note.old_instance)
    except Exception:
        # Search indexing is best-effort; log and never let it break a save.
        logger.error("Error with IndexDen:\n" + traceback.format_exc())
457
458
@receiver(post_delete, sender=Note, weak=False)
def note_delete_receiver(sender, **kwargs):
    """After a Note is deleted: refresh course/school note counts, drop the
    note from the search index, and record a karma event for the uploader
    (if any)."""
    if 'instance' not in kwargs:
        return
    note = kwargs['instance']

    # Update course and school counts of how many notes they have.
    # (Consistency fix: use the already-bound `note` rather than
    # re-indexing kwargs.)
    update_note_counts(note)

    # Remove document from search index; best-effort, log on failure.
    try:
        index = SearchIndex()
        index.remove_note(note)
    except Exception:
        logger.error("Error with IndexDen:\n" + traceback.format_exc())

    if note.user:
        GenericKarmaEvent.create_event(note.user, note.name, GenericKarmaEvent.NOTE_DELETED)
478
479
class UserUploadMapping(models.Model):
    """
    Maps a Filepicker file URL to a user, recorded when an upload made
    anonymously could not be attached to a Note at login time (see
    find_orphan_notes).
    """
    # The user who claimed the upload.
    user = models.ForeignKey(User)
    # Filepicker URL of the uploaded file.
    fp_file = models.CharField(max_length=255)

    class Meta:
        unique_together = ('user', 'fp_file')
486
487
@receiver(user_logged_in, weak=True)
def find_orphan_notes(sender, **kwargs):
    """On login, claim notes that were uploaded anonymously during this
    session (their Filepicker URLs are tracked in the session under
    ANONYMOUS_UPLOAD_URLS). For each URL: if exactly one matching Note
    exists, attach the user and award upload karma; otherwise record a
    UserUploadMapping so the upload can be attached later."""
    user = kwargs['user']
    session = kwargs['request'].session
    uploaded_note_urls = session.get(ANONYMOUS_UPLOAD_URLS, [])
    for uploaded_note_url in uploaded_note_urls:
        try:
            note = Note.objects.get(fp_file=uploaded_note_url)
            note.user = user
            note.save()
            NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # Note not processed yet (or ambiguous): remember the mapping.
            # objects.create() already persists the row; the redundant
            # follow-up save() was removed.
            UserUploadMapping.objects.create(fp_file=uploaded_note_url, user=user)
502