Set canonical links in note contents #309

author Charles Connell <charles@connells.org>

Sat, 8 Feb 2014 21:12:52 +0000 (16:12 -0500)

committer Charles Connell <charles@connells.org>

Sat, 8 Feb 2014 21:17:57 +0000 (16:17 -0500)
author Charles Connell <charles@connells.org>
Sat, 8 Feb 2014 21:12:52 +0000 (16:12 -0500)
committer Charles Connell <charles@connells.org>
Sat, 8 Feb 2014 21:17:57 +0000 (16:17 -0500)
diff --git a/karmaworld/apps/notes/management/commands/add_canonical_link.py b/karmaworld/apps/notes/management/commands/add_canonical_link.py

new file mode 100644 (file)

index 0000000..b543573
--- /dev/null
+++ b/karmaworld/apps/notes/management/commands/add_canonical_link.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# -*- coding:utf8 -*-
+# Copyright (C) 2014  FinalsClub Foundation
+from bs4 import BeautifulSoup
+from django.core.management import BaseCommand
+from karmaworld.apps.notes.models import Note
+from karmaworld.secret.static_s3 import S3_URL
+import requests
+
+
+class Command(BaseCommand):
+    help = """
+           Add a <link rel='canonical' ... /> to every note stored in S3
+           """
+
+    def handle(self, *args, **kwargs):
+        """Fetch each note's HTML from S3, set its canonical link, re-upload."""
+        for note in Note.objects.all():
+            note_path = 'http:' + S3_URL + note.get_relative_s3_path()
+            try:
+                resp = requests.get(note_path)
+            except requests.RequestException:  # one bad fetch must not end the run
+                resp = None
+            if resp is None or resp.status_code != 200:
+                print("Could not retrieve " + note_path)
+                continue
+            soup = note.set_canonical_link(BeautifulSoup(resp.text))
+            note.update_note_on_s3(unicode(soup))
+            print("Updated note " + unicode(note))
+
diff --git a/karmaworld/apps/notes/models.py b/karmaworld/apps/notes/models.py

index 83f00c3d46635ece523f345a6acc957ddd18a71c..3e58f4886aa3d23d2dc84d76afe33ecadea9d9de 100644 (file)
--- a/karmaworld/apps/notes/models.py
+++ b/karmaworld/apps/notes/models.py
@@ -11,6 +11,7 @@ import traceback
  import logging
  from allauth.account.signals import user_logged_in
  from django.contrib.auth.models import User
+from django.contrib.sites.models import Site
  from django.utils.safestring import mark_safe
  from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned
  from django.core.files.storage import default_storage
@@ -300,6 +301,20 @@ class Note(Document):
          if do_save:
              self.save()
  
+    def update_note_on_s3(self, html):
+        """Overwrite this note's existing S3 object with `html`; returns None."""
+        if not html:  # covers both None and the empty string
+            return
+        # Only overwrite in place: a note with no stored object is left alone.
+        filepath = self.get_relative_s3_path()
+        if not default_storage.exists(filepath):
+            logger.warning("Cannot update note on S3, it does not exist already: " + unicode(self))
+            return
+
+        key = default_storage.bucket.get_key(filepath)
+        key.set_contents_from_string(html, headers=s3_upload_headers)
+        key.set_xml_acl(all_read_xml_acl)
+
      def get_absolute_url(self):
          """ Resolve note url, use 'note' route and slug if slug
              otherwise use note.id
@@ -324,7 +339,8 @@ class Note(Document):
          soup = BS(html)
          # Iterate through filters, applying all to the soup object.
          for soupfilter in (
-          self.sanitize_anchor_html,
+            self.sanitize_anchor_html,
+            self.set_canonical_link,
          ):
              soup = soupfilter(soup)
          return str(soup)
@@ -351,6 +367,33 @@ class Note(Document):
          # return filtered soup
          return soup
  
+    @staticmethod
+    def canonical_link_predicate(tag):
+        """Tell whether `tag` is a <link> whose rel attribute has 'canonical'."""
+        is_link = tag.name == u'link'
+        return is_link and tag.has_attr('rel') and u'canonical' in tag['rel']
+
+    def set_canonical_link(self, soup):
+        """
+        Filter the given BeautifulSoup obj so its canonical <link> points at
+        this note's absolute URL: existing canonical links are updated,
+        otherwise one is appended to the head (created if the markup has none).
+        Returns BeautifulSoup obj.
+        """
+        domain = Site.objects.all()[0].domain
+        note_full_href = 'http://' + domain + self.get_absolute_url()
+        canonical_tags = soup.find_all(self.canonical_link_predicate)
+        for tag in canonical_tags:
+            tag['href'] = note_full_href
+        if not canonical_tags:
+            head = soup.find('head')
+            if head is None:
+                # Fragments may lack a <head>; add one instead of crashing.
+                head = soup.new_tag('head')
+                (soup.find('html') or soup).insert(0, head)
+            head.append(soup.new_tag('link', rel='canonical', href=note_full_href))
+        return soup
+
      def _update_parent_updated_at(self):
          """ update the parent Course.updated_at model
              with the latest uploaded_at """
diff --git a/reqs/common.txt b/reqs/common.txt

index 08ec9c8673304298cdcd37bc5669b35b4328b45a..9a058cf9cada881b67628e85a23ecb8e8d92af1e 100644 (file)
--- a/reqs/common.txt
+++ b/reqs/common.txt
@@ -9,7 +9,7 @@ urllib3==1.5
  google-api-python-client==1.0
  django-grappelli==2.4.8
  git+https://github.com/FinalsClub/django-taggit.git
-git+https://github.com/btbonval/django-filepicker
+git+https://github.com/btbonval/django-filepicker.git
  filemagic==1.6
  requests
  beautifulsoup4
author	Charles Connell <charles@connells.org>
	Sat, 8 Feb 2014 21:12:52 +0000 (16:12 -0500)
committer	Charles Connell <charles@connells.org>
	Sat, 8 Feb 2014 21:17:57 +0000 (16:17 -0500)
karmaworld/apps/notes/management/commands/add_canonical_link.py	[new file with mode: 0644]	patch \| blob
karmaworld/apps/notes/models.py		patch \| blob \| history
reqs/common.txt		patch \| blob \| history