Finals Club (c) 2013"""
import os
-import requests
import ConfigParser
-from bs4 import BeautifulSoup as BS
from fabric.api import cd, env, lcd, prefix, run, sudo, task, local, settings
from fabric.contrib import files
env.code_root = env.proj_root
env.env_root = env.proj_root
env.supervisor_conf = '{0}/confs/{1}/supervisord.conf'.format(env.code_root, env.branch)
+env.usde_csv = '{0}/confs/accreditation.csv'.format(env.code_root)
env.use_ssh_config = True
-USDE_LINK = "http://ope.ed.gov/accreditation/GetDownloadFile.aspx"
-
######## Define host(s)
def here():
"""
raise Exception('\n'.join(errors))
@task
-def fetch_accreditation():
-    """
-    Connects to USDE accreditation and drops a CSV into confs.
-    """
-    r = requests.get(USDE_LINK)
-    # Ensure the page was retrieved with 200
-    if not r.ok:
-        r.raise_for_status()
-
-    # Process the HTML with BeautifulSoup
-    soup = BS(r.text)
-    # Extract all the anchor links.
-    a = soup.find_all('a')
-    # TODO maybe hit up itertools for speed? Probably.
-    # Extract the HREFs from anchors.
-    def get_href(anchor):
-        return anchor.get('href')
-    a = map(get_href, a)
-    # Filter out all but the Accreditation links.
-    def contains_accreditation(link):
-        return 'Accreditation' in link and 'zip' in link
-    a = filter(contains_accreditation, a)
-    # Find the most recent. (Accreditation_YYYY_MM.zip means alphanumeric sort)
-    link = sorted(a)[-1]
-
-    # Download the linked file to the FS and extract the CSV
-    tempfile = '/tmp/accreditation.zip'
-    csvfile = env.proj_root + '/confs/accreditation.csv'
-    run('wget -B {0} -O {1} {2}'.format(USDE_LINK, tempfile, link))
-    run("7z e -i'!*.csv' -so {0} >> {1}".format(tempfile, csvfile))
+def fetch_usde():
+    """
+    Download USDE accreditation school CSV.
+    """
+    virtenv_exec('{0}/manage.py fetch_usde_csv {1}'.format(env.code_root, env.usde_csv))
+
+@task
+def import_usde():
+    """
+    Import accreditation school CSV into the database and scrub it.
+    """
+    virtenv_exec('{0}/manage.py import_usde_csv {1}'.format(env.code_root, env.usde_csv))
+    virtenv_exec('{0}/manage.py sanitize_usde_schools'.format(env.code_root))
@task
def first_deploy():
    update_reqs()
    syncdb()
    collect_static()
+    fetch_usde()
+    import_usde()
    start_supervisord()
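
Note: the tasks above lean on a virtenv_exec helper that is not part of this diff. As a minimal sketch, assuming it simply activates the project virtualenv (env.env_root is set near the top of the fabfile) before running the command through fabric:

def virtenv_exec(command):
    # Hypothetical helper (not in this diff): activate the virtualenv,
    # then run the given command on the remote host.
    with prefix('source {0}/bin/activate'.format(env.env_root)):
        run(command)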
--- /dev/null
+import os
+import itertools as it
+from subprocess import call
+from urlparse import urljoin
+
+import requests
+from bs4 import BeautifulSoup as BS
+
+from django.core.management.base import BaseCommand, CommandError
+
+class Command(BaseCommand):
+    args = '<destination>'
+    USDE_LINK = 'http://ope.ed.gov/accreditation/GetDownloadFile.aspx'
+    help = ('Downloads accreditation data from the US Department of '
+            'Education. Supply a destination for the CSV file to be '
+            'written to.')
+
+    def handle(self, *args, **kwargs):
+        if len(args) < 1:
+            raise CommandError('Provide a filename to save CSV data into.')
+
+        filename = args[0]
+
+        r = requests.get(self.USDE_LINK)
+        # Ensure the page was retrieved successfully; raise on an error status.
+        if not r.ok:
+            r.raise_for_status()
+
+        # Parse the HTML with BeautifulSoup, naming the parser explicitly
+        # so bs4 does not have to guess.
+        soup = BS(r.text, 'html.parser')
+        # Extract all the anchor tags.
+        a = soup.find_all('a')
+
+        # Extract the HREF from each anchor.
+        def get_href(anchor):
+            return anchor.get('href')
+
+        # Keep only links to the Accreditation zip archives; anchors with
+        # no href yield None, which the truthiness test screens out.
+        def contains_accreditation(link):
+            return link and 'Accreditation' in link and 'zip' in link
+
+        # Lazily apply both steps with itertools (Python 2 imap/ifilter).
+        a_iter = it.ifilter(contains_accreditation, it.imap(get_href, a))
+
+        # Find the most recent. Accreditation_YYYY_MM.zip sorts
+        # chronologically under an alphanumeric sort, so take the last.
+        links = sorted(a_iter)
+        if not links:
+            raise CommandError('No accreditation zip links found at '
+                               '{0}'.format(self.USDE_LINK))
+
+        # Ensure the link is absolute, not relative.
+        link = urljoin(self.USDE_LINK, links[-1])
+
+        # Download the linked archive and stream the contained CSV into
+        # the requested destination file.
+        tempfile = '/tmp/accreditation.zip'
+        call(['wget', '-O', tempfile, link])
+        with open(filename, 'w') as fd:
+            call(['7z', 'e', '-i!*.csv', '-so', tempfile], stdout=fd)
+        os.remove(tempfile)
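
For reference, the fabric tasks drive this command and its companions through manage.py. A quick way to exercise the same chain outside fabric is Django's call_command, assuming the import_usde_csv and sanitize_usde_schools commands referenced by the tasks are installed; the /tmp path here is only illustrative:

from django.core.management import call_command

call_command('fetch_usde_csv', '/tmp/accreditation.csv')
call_command('import_usde_csv', '/tmp/accreditation.csv')
call_command('sanitize_usde_schools')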