fetch USDE csv and import it; closes #195

author Bryan <btbonval@gmail.com>

Thu, 12 Dec 2013 23:12:39 +0000 (18:12 -0500)

committer Bryan <btbonval@gmail.com>

Thu, 12 Dec 2013 23:12:39 +0000 (18:12 -0500)
author Bryan <btbonval@gmail.com>
Thu, 12 Dec 2013 23:12:39 +0000 (18:12 -0500)
committer Bryan <btbonval@gmail.com>
Thu, 12 Dec 2013 23:12:39 +0000 (18:12 -0500)
diff --git a/fabfile.py b/fabfile.py

index 18ad0150f7f36ea48f7efcc95abbbbc74143d0ae..2686d37219b5e2d726d95aa76a8c6bdfeae9cd7c 100644 (file)
--- a/fabfile.py
+++ b/fabfile.py
@@ -3,10 +3,8 @@
      Finals Club (c) 2013"""
  
  import os
-import requests
  import ConfigParser
  
-from bs4 import BeautifulSoup as BS
  from fabric.api import cd, env, lcd, prefix, run, sudo, task, local, settings
  from fabric.contrib import files
  
@@ -20,11 +18,10 @@ env.branch = 'prod'
  env.code_root = env.proj_root
  env.env_root = env.proj_root
  env.supervisor_conf = '{0}/confs/{1}/supervisord.conf'.format(env.code_root, env.branch)
+env.usde_csv = '{0}/confs/accreditation.csv'.format(env.code_root)  # CSV path handed to the USDE fetch/import commands
  
  env.use_ssh_config = True
  
-USDE_LINK = "http://ope.ed.gov/accreditation/GetDownloadFile.aspx"
-
  ######## Define host(s)
  def here():
      """
@@ -269,36 +266,19 @@ def check_secrets():
          raise Exception('\n'.join(errors))
  
  @task
-def fetch_accreditation():
-    """
-    Connects to USDE accreditation and drops a CSV into confs.
-    """
-    r = requests.get(USDE_LINK)
-    # Ensure the page was retrieved with 200
-    if not r.ok:
-        r.raise_for_status()
-
-    # Process the HTML with BeautifulSoup
-    soup = BS(r.text)
-    # Extract all the anchor links.
-    a = soup.find_all('a')
-    # TODO maybe hit up itertools for speed? Probably.
-    # Extract the HREFs from anchors.
-    def get_href(anchor):
-        return anchor.get('href')
-    a = map(get_href, a)
-    # Filter out all but the Accreditation links.
-    def contains_accreditation(link):
-        return 'Accreditation' in link and 'zip' in link
-    a = filter(contains_accreditation, a)      
-    # Find the most recent. (Accreditation_YYYY_MM.zip means alphanumeric sort)
-    link = sorted(a)[-1]
-
-    # Download the linked file to the FS and extract the CSV
-    tempfile = '/tmp/accreditation.zip'
-    csvfile = env.proj_root + '/confs/accreditation.csv'
-    run('wget -B {0} -O {1} {2}'.format(USDE_LINK, tempfile, link))
-    run("7z e -i'!*.csv' -so {0} >> {1}".format(tempfile, csvfile))
+def fetch_usde():
+    """
+    Run the fetch_usde_csv management command, saving the USDE CSV to env.usde_csv.
+    """
+    virtenv_exec('{0}/manage.py fetch_usde_csv {1}'.format(env.code_root, env.usde_csv))
+
+@task
+def import_usde():
+    """
+    Run the import_usde_csv management command on env.usde_csv, then sanitize_usde_schools.
+    """
+    virtenv_exec('{0}/manage.py import_usde_csv {1}'.format(env.code_root, env.usde_csv))
+    virtenv_exec('{0}/manage.py sanitize_usde_schools'.format(env.code_root))
  
  @task
  def first_deploy():
@@ -312,6 +292,8 @@ def first_deploy():
      update_reqs()
      syncdb()
      collect_static()
+    fetch_usde()
+    import_usde()  # runs import_usde_csv followed by sanitize_usde_schools
      start_supervisord()
  
  
diff --git a/karmaworld/apps/courses/management/commands/fetch_usde_csv.py b/karmaworld/apps/courses/management/commands/fetch_usde_csv.py

new file mode 100644 (file)

index 0000000..57f6415
--- /dev/null
+++ b/karmaworld/apps/courses/management/commands/fetch_usde_csv.py
@@ -0,0 +1,61 @@
+import csv
+import requests
+import itertools as it
+
+from bs4 import BeautifulSoup as BS
+from urlparse import urljoin
+from subprocess import call
+
+from django.core.management.base import BaseCommand
+from karmaworld.apps.courses.models import School
+
+class Command(BaseCommand):
+    """Management command that saves the latest USDE accreditation CSV."""
+    args = '<destination>'
+    USDE_LINK = 'http://ope.ed.gov/accreditation/GetDownloadFile.aspx'
+    help = (""" Downloads data from US Department of Education.
+                Supply a destination for the csv file to be written to. """)
+
+    def handle(self, *args, **kwargs):
+        """
+        Save the newest USDE accreditation CSV to the path given in args[0].
+
+        Raises CommandError when no Accreditation zip link is found or when
+        wget / 7z exit non-zero; HTTP errors propagate from requests.
+        """
+        # Local import: the module's import block only pulls in BaseCommand.
+        from django.core.management.base import CommandError
+
+        if len(args) < 1:
+            self.stdout.write('Provide a filename to save csv data into.\n')
+            return
+        filename = args[0]
+
+        r = requests.get(self.USDE_LINK)
+        # Ensure the page was retrieved with 200
+        if not r.ok:
+            r.raise_for_status()
+
+        # Anchors without an href yield None; fall back to '' for the tests.
+        soup = BS(r.text)
+        hrefs = it.imap(lambda anchor: anchor.get('href') or '', soup.find_all('a'))
+        links = sorted(it.ifilter(lambda h: 'Accreditation' in h and 'zip' in h, hrefs))
+        if not links:
+            raise CommandError('No Accreditation zip link found at %s' % self.USDE_LINK)
+
+        # Accreditation_YYYY_MM.zip names sort chronologically, so the last
+        # entry is the newest; urljoin makes a relative href absolute.
+        link = urljoin(self.USDE_LINK, links[-1])
+
+        # Download the archive, stream its CSV member into filename, and
+        # always remove the temporary zip, even when a step fails.
+        zip_path = '/tmp/accreditation.zip'
+        try:
+            if call(['wget', '-O', zip_path, link]) != 0:
+                raise CommandError('wget failed to download %s' % link)
+            with open(filename, 'w') as fd:
+                status = call(['7z', 'e', "-i!*.csv", '-so', zip_path], stdout=fd)
+            if status != 0:
+                raise CommandError('7z failed to extract a csv from %s' % zip_path)
+        finally:
+            call(['rm', zip_path])
author	Bryan <btbonval@gmail.com>
	Thu, 12 Dec 2013 23:12:39 +0000 (18:12 -0500)
committer	Bryan <btbonval@gmail.com>
	Thu, 12 Dec 2013 23:12:39 +0000 (18:12 -0500)
fabfile.py		patch \| blob \| history
karmaworld/apps/courses/management/commands/fetch_usde_csv.py	[new file with mode: 0644]	patch \| blob