Set canonical links in note contents #309
author Charles Connell <charles@connells.org>
Sat, 8 Feb 2014 21:12:52 +0000 (16:12 -0500)
committer Charles Connell <charles@connells.org>
Sat, 8 Feb 2014 21:17:57 +0000 (16:17 -0500)
karmaworld/apps/notes/management/commands/add_canonical_link.py [new file with mode: 0644]
karmaworld/apps/notes/models.py
reqs/common.txt

diff --git a/karmaworld/apps/notes/management/commands/add_canonical_link.py b/karmaworld/apps/notes/management/commands/add_canonical_link.py
new file mode 100644 (file)
index 0000000..b543573
--- /dev/null
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2014  FinalsClub Foundation
+from bs4 import BeautifulSoup
+from django.core.management import BaseCommand
+from karmaworld.apps.notes.models import Note
+from karmaworld.secret.static_s3 import S3_URL
+import requests
+
+
+class Command(BaseCommand):
+    """Management command that retrofits a canonical link into stored notes."""
+
+    help = """
+           Add a <link rel='canonical' ... /> to every note stored in S3
+           """
+
+    # Seconds to wait on S3 for a single note. Without a timeout, requests
+    # can block forever on a stalled connection and hang the whole command.
+    request_timeout = 30
+
+    def handle(self, *args, **kwargs):
+        """
+        Fetch each note's HTML from S3, set its canonical link and write the
+        result back to S3.
+
+        Notes that cannot be downloaded (network error or non-200 response)
+        are reported and skipped, so one bad note does not abort the run.
+        """
+        for note in Note.objects.all():
+            # NOTE(review): the scheme is prepended here, so S3_URL is
+            # presumably protocol-relative ('//host/...') -- confirm in
+            # karmaworld.secret.static_s3.
+            note_path = 'http:' + S3_URL + note.get_relative_s3_path()
+            try:
+                resp = requests.get(note_path, timeout=self.request_timeout)
+            except requests.RequestException:
+                # Treat transport failures like a bad status: report, move on.
+                print("Could not retrieve " + note_path)
+                continue
+            if resp.status_code != 200:
+                print("Could not retrieve " + note_path)
+                continue
+
+            # Parse, let the model insert/refresh the canonical <link>, then
+            # serialize back to text for the upload.
+            soup = note.set_canonical_link(BeautifulSoup(resp.text))
+            note.update_note_on_s3(unicode(soup))
+            print("Updated note " + unicode(note))
+
index 83f00c3d46635ece523f345a6acc957ddd18a71c..3e58f4886aa3d23d2dc84d76afe33ecadea9d9de 100644 (file)
@@ -11,6 +11,7 @@ import traceback
 import logging
 from allauth.account.signals import user_logged_in
 from django.contrib.auth.models import User
+from django.contrib.sites.models import Site
 from django.utils.safestring import mark_safe
 from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
 from django.core.files.storage import default_storage
@@ -300,6 +301,20 @@ class Note(Document):
         if do_save:
             self.save()
 
+    def update_note_on_s3(self, html):
+        """
+        Overwrite the contents of this note's existing S3 object with html.
+
+        Does nothing when html is empty. Never creates a new S3 object:
+        if nothing is stored at the note's relative S3 path, a warning is
+        logged and the upload is skipped.
+        Returns None.
+        """
+        # do nothing if HTML is empty.
+        if not html:
+            return
+        # if it's not already there then bail out
+        filepath = self.get_relative_s3_path()
+        key = None
+        if default_storage.exists(filepath):
+            key = default_storage.bucket.get_key(filepath)
+        # NOTE(review): presumably a boto bucket, whose get_key() returns
+        # None for a missing key; guard so an object that disappears between
+        # the exists() check and the lookup is reported instead of crashing.
+        if key is None:
+            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
+            return
+
+        key.set_contents_from_string(html, headers=s3_upload_headers)
+        key.set_xml_acl(all_read_xml_acl)
+
     def get_absolute_url(self):
         """ Resolve note url, use 'note' route and slug if slug
             otherwise use note.id
@@ -324,7 +339,8 @@ class Note(Document):
         soup = BS(html)
         # Iterate through filters, applying all to the soup object.
         for soupfilter in (
-          self.sanitize_anchor_html,
+            self.sanitize_anchor_html,
+            self.set_canonical_link,
         ):
             soup = soupfilter(soup)
         return str(soup)
@@ -351,6 +367,33 @@ class Note(Document):
         # return filtered soup
         return soup
 
+    @staticmethod
+    def canonical_link_predicate(tag):
+        """
+        Tell whether the given tag is a <link> element whose rel
+        attribute contains 'canonical'.
+        Returns a boolean.
+        """
+        # anything other than a <link> can never match
+        if tag.name != u'link':
+            return False
+        # a <link> without rel has nothing to inspect
+        if not tag.has_attr('rel'):
+            return False
+        return u'canonical' in tag['rel']
+
+    def set_canonical_link(self, soup):
+        """
+        Filter the given BeautifulSoup obj by adding
+        <link rel="canonical" href="note.get_absolute_url" />
+        to the document head, or by re-pointing every canonical link that
+        is already present. A <head> is created when the document has none.
+        Returns BeautifulSoup obj.
+        """
+        # NOTE(review): this takes whichever Site row the query yields first
+        # and raises IndexError when no Site exists -- confirm that a single
+        # Site is guaranteed, or consider Site.objects.get_current().
+        domain = Site.objects.all()[0].domain
+        note_full_href = 'http://' + domain + self.get_absolute_url()
+        canonical_tags = soup.find_all(self.canonical_link_predicate)
+        if canonical_tags:
+            # Reuse the existing tags rather than adding a duplicate.
+            for tag in canonical_tags:
+                tag['href'] = note_full_href
+            return soup
+
+        head = soup.find('head')
+        if head is None:
+            # find() returns None for fragments and head-less documents;
+            # appending to None would raise AttributeError, so build the
+            # <head> first, as the first child of <html> when there is one.
+            head = soup.new_tag('head')
+            root = soup.find('html')
+            if root is None:
+                root = soup
+            root.insert(0, head)
+        head.append(soup.new_tag('link', rel='canonical', href=note_full_href))
+
+        # return filtered soup
+        return soup
+
     def _update_parent_updated_at(self):
         """ update the parent Course.updated_at model
             with the latest uploaded_at """
index 08ec9c8673304298cdcd37bc5669b35b4328b45a..9a058cf9cada881b67628e85a23ecb8e8d92af1e 100644 (file)
@@ -9,7 +9,7 @@ urllib3==1.5
 google-api-python-client==1.0
 django-grappelli==2.4.8
 git+https://github.com/FinalsClub/django-taggit.git
-git+https://github.com/btbonval/django-filepicker
+git+https://github.com/btbonval/django-filepicker.git
 filemagic==1.6
 requests
 beautifulsoup4