Created
October 11, 2012 21:00
Revisions
-
harperreed revised this gist
Oct 11, 2012 . 2 changed files with 72 additions and 6 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
#enter dates if you want to crawl specific dates
start_date = "08-01-2012"
end_date = "10-01-2012"
#enter the number of days you want to crawl (90 is the max)
num_days = 90
#username and password for chicago-card.com
email = '' # username
password = '' #passwd
#url root
url_root = 'https://www.chicago-card.com/'
#name of the logfile
log_file = "CTA"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#chicago card crawler

A crawler for the chicago card plus website.

##Why

We need more data!

##How

Open up a terminal and copy config.py.example to config.py:

`$ cp config.py.example config.py`

Edit config.py and fill out your info:

    #enter dates if you want to crawl specific dates
    start_date = "08-01-2012"
    end_date = "10-01-2012"
    #enter the number of days you want to crawl (90 is the max)
    num_days = 90
    #username and password for chicago-card.com
    email = '' # username
    password = '' #passwd
    #url root
    url_root = 'https://www.chicago-card.com/'
    #name of the logfile
    log_file = "CTA"

Then run the script:

`$ python chicago_card_crawler.py`

Magic

    INFO Starting crawl of Chicago Card plus
    INFO start date: 10-11-2012
    INFO End date: 08-01-2012
    INFO Crawling 4 days
    INFO Logging in to CTA
    INFO Logged in to CTA
    INFO Parsing cards
    INFO Account id: 176009
    INFO 2 cards found: ['1266731', '1392327']
    INFO Crawling card id #1266731
    INFO Dumping 10 lines
    INFO writing data to cta_1266731_10-11-2012_08-01-2012.csv
    INFO Crawling card id #1392327
    INFO Dumping 11 lines
    INFO writing data to cta_1392327_10-11-2012_08-01-2012.csv

should work great!
-
harperreed created this gist
Oct 11, 2012 .There are no files selected for viewing
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
# Original file line number Diff line number Diff line change
# @@ -0,0 +1,4 @@
# (ignore patterns -- presumably the gist's .gitignore; the file name is not shown)
# config.py
# *.log
# *.csv
# *.pyc
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
# Original file line number Diff line number Diff line change
# @@ -0,0 +1,141 @@
"""Crawl the Chicago Card Plus website and export card transactions to CSV.

The script logs in with the credentials from ``config.py``, finds every card
listed on the account page, and for each card posts the site's export form
for the configured date range, writing the response to
``cta_<card id>_<start>_<end>.csv``.

Settings read from ``config``: ``url_root``, ``email``, ``password``,
``log_file``, ``start_date``, ``end_date`` and ``num_days``.
"""
import logging
import sys
from datetime import datetime, timedelta

# Format of the dates in config.py, in log messages and in CSV file names.
DATE_FORMAT = "%m-%d-%Y"
# The script refuses to request a wider window than this many days.
MAX_DAYS = 90
LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'

CARD_SEPARATOR = '"></a><b class="acct-name">'
CARD_ID_PREFIX = 'TransactionHistoryEx.aspx?F_CTA_CARD='
CARD_ID_SUFFIX = '" class="view90">Export Last 90 Days'
ACCOUNT_ID_PREFIX = '<input name="AccountID" id="AccountID" type="hidden" value="'
VIEW_STATE_PREFIX = '<input type="hidden" name="__VIEWSTATE" value="'
HIDDEN_INPUT_SUFFIX = "\" />\r\n"


def resolve_date_range(start_date=None, end_date=None, num_days=None, now=None):
    """Return the ``(start, end)`` datetimes of the window to crawl.

    A truthy ``num_days`` takes precedence and yields the last ``num_days``
    days ending at ``now``. Otherwise ``start_date`` and ``end_date``
    (``MM-DD-YYYY`` strings) are parsed; an empty ``end_date`` means ``now``.

    ``now`` defaults to ``datetime.now()`` and is sampled once, so the
    resulting window spans exactly ``num_days`` days.

    Raises:
        ValueError: if neither ``num_days`` nor ``start_date`` is given, or
            if a date string does not match ``MM-DD-YYYY``.
    """
    if now is None:
        now = datetime.now()
    if num_days:
        return now - timedelta(days=num_days), now
    if not start_date:
        raise ValueError("set start_date or num_days in config.py")
    start = datetime.strptime(start_date, DATE_FORMAT)
    end = datetime.strptime(end_date, DATE_FORMAT) if end_date else now
    return start, end


def join_url(root, path):
    """Join ``root`` and ``path`` with exactly one slash between them."""
    return root.rstrip('/') + '/' + path.lstrip('/')


def extract_between(text, prefix, suffix):
    """Return the text following the first ``prefix`` up to ``suffix``.

    Returns ``None`` when ``prefix`` does not occur in ``text``.
    """
    pieces = text.split(prefix)
    if len(pieces) < 2:
        return None
    return pieces[1].split(suffix)[0]


def parse_card_ids(page):
    """Return the card ids found in the account page HTML, in page order."""
    card_ids = []
    for chunk in page.split(CARD_SEPARATOR):
        card_id = extract_between(chunk, CARD_ID_PREFIX, CARD_ID_SUFFIX)
        # Chunks without an export link (e.g. the page header) are skipped.
        if card_id is not None:
            card_ids.append(card_id)
    return card_ids


def to_text(raw):
    """Return ``raw`` as ``str``, decoding it when ``read()`` gave bytes."""
    if isinstance(raw, str):
        return raw
    return raw.decode("utf-8", "replace")


def setup_logger(log_name):
    """Return a logger writing to ``<log_name>.log`` and to the console."""
    logging.getLogger().setLevel(logging.INFO)
    logger = logging.getLogger(log_name)
    formatter = logging.Formatter(LOG_FORMAT)

    file_handler = logging.FileHandler(log_name + '.log')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    logger.setLevel(logging.DEBUG)
    return logger


def main():
    """Log in, discover the account's cards and export each one to CSV."""
    # Imported here rather than at module level so the pure helpers above can
    # be imported (and tested) without mechanize or a filled-in config.py.
    import mechanize
    import config
    try:
        from urllib import urlencode  # Python 2
    except ImportError:
        from urllib.parse import urlencode  # Python 3

    logger = setup_logger(config.log_file)
    logger.info('Starting crawl of Chicago Card plus')

    try:
        start_date_time, end_date_time = resolve_date_range(
            config.start_date, config.end_date, config.num_days)
    except ValueError as err:
        logger.error("invalid date configuration: %s", err)
        sys.exit(1)

    # Derive the labels from the final datetimes so the log lines and the
    # file names always agree with the range that is actually requested.
    start_date = start_date_time.strftime(DATE_FORMAT)
    end_date = end_date_time.strftime(DATE_FORMAT)
    delta = end_date_time - start_date_time

    logger.info('start date: %s', start_date)
    logger.info('End date: %s', end_date)
    logger.info('Crawling %s days', delta.days)

    if delta.days > MAX_DAYS:
        logger.error("max 90 days data available. requesting %s days. ", delta.days)
        sys.exit(1)

    br = mechanize.Browser()

    # Login to CTA
    logger.info("Logging in to CTA")
    br.open(config.url_root)
    login_data = urlencode({
        'hdrUSERNAME': config.email,
        'hdrPassword': config.password,
    })
    response = br.open(join_url(config.url_root, "login-process.aspx"), login_data)
    page = to_text(response.read())
    logger.info("Logged in to CTA")

    logger.info("Parsing cards")
    account_id = extract_between(page, ACCOUNT_ID_PREFIX, HIDDEN_INPUT_SUFFIX)
    if account_id is None:
        # NOTE(review): presumably the login was rejected -- confirm against
        # the page the site returns for bad credentials.
        logger.error("No AccountID field in the response; cannot continue")
        sys.exit(1)
    logger.info("Account id: %s", account_id)

    cards = parse_card_ids(page)
    logger.info("%s cards found: %s", len(cards), cards)

    for c_id in cards:
        logger.info('Crawling card id #%s', c_id)
        url = join_url(
            config.url_root,
            "ccplus/TransactionHistoryEx.aspx?F_CTA_CARD=" + c_id)
        export_page = to_text(br.open(url).read())
        view_state = extract_between(
            export_page, VIEW_STATE_PREFIX, HIDDEN_INPUT_SUFFIX)
        if view_state is None:
            logger.error("No __VIEWSTATE field for card id #%s; skipping", c_id)
            continue

        file_name = 'cta_' + c_id + "_" + start_date + '_' + end_date + '.csv'
        export_data = urlencode({
            'AccountID': account_id,
            'F_CTA_CARD': c_id,
            'F_TRAN_DATE_FROM_MONTH': start_date_time.strftime("%m"),
            'F_TRAN_DATE_FROM_DAY': start_date_time.strftime("%d"),
            'F_TRAN_DATE_FROM_YEAR': start_date_time.strftime("%Y"),
            'F_TRAN_DATE_TO_MONTH': end_date_time.strftime("%m"),
            'F_TRAN_DATE_TO_DAY': end_date_time.strftime("%d"),
            'F_TRAN_DATE_TO_YEAR': end_date_time.strftime("%Y"),
            'F_TRAN_DISPLAY': "ALL",
            'Search': 'Export',
            '__VIEWSTATE': view_state,
        })
        csv_dump = br.open(url, export_data).read()

        # Same figure as len(csv_dump.split("\n")), valid for str and bytes.
        logger.info('Dumping %s lines', csv_dump.count(b"\n") + 1)
        logger.info('writing data to %s', file_name)
        # Binary mode stores the response unchanged; the with-block closes
        # the file even if the write fails.
        with open(file_name, 'wb') as out:
            out.write(csv_dump)


if __name__ == "__main__":
    main()

# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
# Original file line number Diff line number Diff line change
# @@ -0,0 +1,12 @@
# url_root = 'https://www.chicago-card.com/'
# email = '' # username
# password = '' #passwd
# log_file = "CTA"
# start_date = "08-01-2012"
# end_date = "10-01-2012"
# num_days = 5
# Empty file.