From ce41c833838cde6fe3f9c17f35dd576d16a899e8 Mon Sep 17 00:00:00 2001
From: Bryan
Date: Thu, 12 Dec 2013 18:12:39 -0500
Subject: [PATCH] fetch USDE CSV and import it

closes #195
---
 fabfile.py                                         | 50 +++++----------
 .../management/commands/fetch_usde_csv.py          | 56 ++++++++++++++++
 2 files changed, 72 insertions(+), 34 deletions(-)
 create mode 100644 karmaworld/apps/courses/management/commands/fetch_usde_csv.py

diff --git a/fabfile.py b/fabfile.py
index 18ad015..2686d37 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -3,10 +3,8 @@
 Finals Club (c) 2013"""
 
 import os
-import requests
 import ConfigParser
 
-from bs4 import BeautifulSoup as BS
 from fabric.api import cd, env, lcd, prefix, run, sudo, task, local, settings
 from fabric.contrib import files
 
@@ -20,11 +18,10 @@ env.branch = 'prod'
 env.code_root = env.proj_root
 env.env_root = env.proj_root
 env.supervisor_conf = '{0}/confs/{1}/supervisord.conf'.format(env.code_root, env.branch)
+env.usde_csv = '{0}/confs/accreditation.csv'.format(env.code_root)
 env.use_ssh_config = True
 
-USDE_LINK = "http://ope.ed.gov/accreditation/GetDownloadFile.aspx"
-
 ######## Define host(s)
 def here():
     """
@@ -269,36 +266,19 @@ def check_secrets():
     raise Exception('\n'.join(errors))
 
 @task
-def fetch_accreditation():
-    """
-    Connects to USDE accreditation and drops a CSV into confs.
-    """
-    r = requests.get(USDE_LINK)
-    # Ensure the page was retrieved with 200
-    if not r.ok:
-        r.raise_for_status()
-
-    # Process the HTML with BeautifulSoup
-    soup = BS(r.text)
-    # Extract all the anchor links.
-    a = soup.find_all('a')
-    # TODO maybe hit up itertools for speed? Probably.
-    # Extract the HREFs from anchors.
-    def get_href(anchor):
-        return anchor.get('href')
-    a = map(get_href, a)
-    # Filter out all but the Accreditation links.
-    def contains_accreditation(link):
-        return 'Accreditation' in link and 'zip' in link
-    a = filter(contains_accreditation, a)
-    # Find the most recent. (Accreditation_YYYY_MM.zip means alphanumeric sort)
-    link = sorted(a)[-1]
-
-    # Download the linked file to the FS and extract the CSV
-    tempfile = '/tmp/accreditation.zip'
-    csvfile = env.proj_root + '/confs/accreditation.csv'
-    run('wget -B {0} -O {1} {2}'.format(USDE_LINK, tempfile, link))
-    run("7z e -i'!*.csv' -so {0} >> {1}".format(tempfile, csvfile))
+def fetch_usde():
+    """
+    Download USDE accreditation school CSV.
+    """
+    virtenv_exec('{0}/manage.py fetch_usde_csv {1}'.format(env.code_root, env.usde_csv))
+
+@task
+def import_usde():
+    """
+    Import accreditation school CSV into the database and scrub it.
+    """
+    virtenv_exec('{0}/manage.py import_usde_csv {1}'.format(env.code_root, env.usde_csv))
+    virtenv_exec('{0}/manage.py sanitize_usde_schools'.format(env.code_root))
 
 @task
 def first_deploy():
@@ -312,6 +292,8 @@ def first_deploy():
     update_reqs()
     syncdb()
     collect_static()
+    fetch_usde()
+    import_usde()
     start_supervisord()
 
 
diff --git a/karmaworld/apps/courses/management/commands/fetch_usde_csv.py b/karmaworld/apps/courses/management/commands/fetch_usde_csv.py
new file mode 100644
index 0000000..57f6415
--- /dev/null
+++ b/karmaworld/apps/courses/management/commands/fetch_usde_csv.py
@@ -0,0 +1,56 @@
+import os
+import requests
+import itertools as it
+
+from bs4 import BeautifulSoup as BS
+from urlparse import urljoin
+from subprocess import call
+
+from django.core.management.base import BaseCommand
+
+class Command(BaseCommand):
+    args = '<destination_filename>'
+    USDE_LINK = 'http://ope.ed.gov/accreditation/GetDownloadFile.aspx'
+    help = ("""Downloads data from the US Department of Education.
+           Supply a destination for the CSV file to be written to.""")
+
+    def handle(self, *args, **kwargs):
+        if len(args) < 1:
+            self.stdout.write('Provide a filename to save csv data into.\n')
+            return
+
+        filename = args[0]
+
+        r = requests.get(self.USDE_LINK)
+        # Ensure the page was retrieved with 200
+        if not r.ok:
+            r.raise_for_status()
+
+        # Process the HTML with BeautifulSoup
+        soup = BS(r.text)
+        # Extract all the anchor links.
+        a = soup.find_all('a')
+
+        # Extract the HREF from an anchor.
+        def get_href(anchor):
+            return anchor.get('href')
+
+        # Keep only links to the Accreditation zip files.
+        def contains_accreditation(link):
+            return 'Accreditation' in link and 'zip' in link
+
+        # Lazily map anchors to HREFs, then filter for Accreditation links.
+        a_iter = it.ifilter(contains_accreditation, it.imap(get_href, iter(a)))
+
+        # Find the most recent. (Accreditation_YYYY_MM.zip means alphanumeric sort)
+        link = sorted(a_iter)[-1]
+
+        # Ensure link is absolute, not relative.
+        link = urljoin(self.USDE_LINK, link)
+
+        # Download the linked file to the FS and extract the CSV.
+        tempfile = '/tmp/accreditation.zip'
+        call(['wget', '-O', tempfile, link])
+        with open(filename, 'w') as fd:
+            call(['7z', 'e', '-i!*.csv', '-so', tempfile], stdout=fd)
+        os.remove(tempfile)
-- 
2.25.1
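
Usage sketch (assuming the branch is deployed; the host alias and CSV path
below are illustrative, not part of the patch). The new fab tasks fetch and
import in one pass, or the management commands can be run by hand inside the
virtualenv:

    # Via the new fab tasks (host alias is hypothetical):
    fab -H beta fetch_usde import_usde

    # Or directly with manage.py:
    ./manage.py fetch_usde_csv /tmp/accreditation.csv
    ./manage.py import_usde_csv /tmp/accreditation.csv
    ./manage.py sanitize_usde_schools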