harperreed · October 11, 2012 21:00 · Oct 11, 2012 · Oct 11, 2012
diff --git a/config.py.example b/config.py.example
@@ -1,12 +1,18 @@
-url_root = 'https://www.chicago-card.com/'
+#enter dates (MM-DD-YYYY) if you want to crawl specific dates
+start_date = "08-01-2012"
+end_date = "10-01-2012"
 
+#enter the number of days you want to crawl (90 is the max); overrides the dates above, set to 0 to use them
+num_days = 90
+
+#username and password for chicago-card.com
 email = '' # username
-password = '' #passwd
+password = '' #passwd   
 
-log_file = "CTA"
+#url root
+url_root = 'https://www.chicago-card.com/'
 
 
-start_date = "08-01-2012"
-end_date = "10-01-2012"
 
-num_days = 5
+#name of the logfile
+log_file = "CTA"
diff --git a/readme.md b/readme.md
@@ -0,0 +1,60 @@
+# Chicago Card crawler
+A crawler for the Chicago Card Plus website.
+
+
+## Why
+We need more data!
+
+## How
+
+open up a terminal
+
+copy config.py.example to config.py: 
+
+`$ cp config.py.example config.py`
+
+
+
+edit config.py and fill out your info
+
+
+	#enter dates (MM-DD-YYYY) if you want to crawl specific dates
+	start_date = "08-01-2012"
+	end_date = "10-01-2012"
+
+	#enter the number of days you want to crawl (90 is the max); overrides the dates above, set to 0 to use them
+	num_days = 90
+
+	#username and password for chicago-card.com
+	email = '' # username
+	password = '' #passwd	
+
+	#url root
+	url_root = 'https://www.chicago-card.com/'
+
+	#name of the logfile
+	log_file = "CTA"
+
+then run the script
+
+`$ python chicago_card_crawler.py`
+
+Magic
+
+	INFO Starting crawl of Chicago Card plus
+	INFO start date: 10-11-2012
+	INFO End date: 08-01-2012
+	INFO Crawling 4 days
+	INFO Logging in to CTA
+	INFO Logged in to CTA
+	INFO Parsing cards
+	INFO Account id: 176009
+	INFO 2 cards found: ['1266731', '1392327']
+	INFO Crawling card id #1266731
+	INFO Dumping 10 lines
+	INFO writing data to cta_1266731_10-11-2012_08-01-2012.csv
+	INFO Crawling card id #1392327
+	INFO Dumping 11 lines
+	INFO writing data to cta_1392327_10-11-2012_08-01-2012.csv 
+
+should work great!
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+config.py
+*.log
+*.csv
+*.pyc
diff --git a/chicago_card_crawler.py b/chicago_card_crawler.py
@@ -0,0 +1,141 @@
+import urllib, mechanize
+from mechanize import ParseResponse, urlopen, urljoin
+import time
+import simplejson
+import os,sys
+import csv
+import time
+from datetime import datetime, timedelta
+
+
+import logging
+
+import config
+
+"""
+Setup logger
+"""
+root = logging.getLogger()
+root.setLevel(logging.INFO)
+
+# One shared format for both the log file and the console handler.
+log_format = '%(asctime)s %(levelname)s %(message)s'
+formatter = logging.Formatter(log_format)
+hdlr = logging.FileHandler(config.log_file+'.log')
+hdlr.setFormatter(formatter)
+ch = logging.StreamHandler()
+ch.setLevel(logging.DEBUG)
+ch.setFormatter(formatter)
+
+logger = logging.getLogger(config.log_file)
+logger.setLevel(logging.DEBUG)
+logger.addHandler(hdlr)
+logger.addHandler(ch)
+
+logger.info('Starting crawl of Chicago Card plus')
+
+
+
+# Crawl window. A non-zero num_days wins and means "the last num_days days,
+# ending now"; otherwise both MM-DD-YYYY dates from config.py are required.
+date_format = "%m-%d-%Y"
+
+if config.num_days:
+  # Read the clock once so the window is exactly num_days long.
+  end_date_time = datetime.now()
+  start_date_time = end_date_time - timedelta(days=config.num_days)
+elif config.start_date and config.end_date:
+  start_date_time = datetime.strptime(config.start_date, date_format)
+  end_date_time = datetime.strptime(config.end_date, date_format)
+else:
+  logger.error("set num_days, or both start_date and end_date, in config.py")
+  sys.exit(1)
+
+# Format each label from its own timestamp so the log lines and the CSV file
+# names describe the window that is actually requested.
+start_date = start_date_time.strftime(date_format)
+end_date = end_date_time.strftime(date_format)
+
+delta = end_date_time - start_date_time
+logger.info('start date: '+start_date)
+logger.info('End date: '+end_date)
+logger.info('Crawling '+str(delta.days)+" days")
+
+if delta.days >90:
+  logger.error("max 90 days data available. requesting "+str(delta.days)+" days. ")
+  sys.exit(1)
+
+# Log in to CTA: open the landing page, then submit the credentials to
+# login-process.aspx. The same Browser object is reused for every later
+# request in this script.
+br = mechanize.Browser()
+
+logger.info("Logging in to CTA")
+br.open(config.url_root)
+
+login_url = config.url_root + "login-process.aspx"
+login_fields = {
+    'hdrUSERNAME': config.email,
+    'hdrPassword': config.password,
+    }
+data = urllib.urlencode(login_fields)
+
+# NOTE(review): the response is not checked, so "Logged in" is reported even
+# if the credentials were rejected. `page` feeds the card parsing below.
+r = br.open(login_url, data)
+page = r.read()
+logger.info("Logged in to CTA")
+
+# Scrape the account id and the card ids out of the post-login HTML.
+logger.info("Parsing cards")
+
+account_marker = '<input name="AccountID" id="AccountID" type="hidden" value="'
+if account_marker not in page:
+  # No hidden AccountID field means nothing to crawl (most likely a failed login).
+  logger.error("AccountID not found in response; check email/password in config.py")
+  sys.exit(1)
+account_id = page.split(account_marker)[1].split("\" />\r\n")[0]
+logger.info("Account id: "+str(account_id))
+
+# The card id is read from the query string of each "Export Last 90 Days" link.
+cards = []
+for chunk in page.split('"></a><b class="acct-name">'):
+  link = chunk.split('TransactionHistoryEx.aspx?F_CTA_CARD=')
+  if len(link) > 1:  # chunks without an export link carry no card id
+    cards.append(link[1].split('" class="view90">Export Last 90 Days')[0])
+logger.info(str(len(cards))+" cards found: "+str(cards))
+
+
+# Export every card's transactions for the crawl window, one CSV file per card.
+for c_id in cards:
+  logger.info('Crawling card id #'+c_id)
+  # url_root ends in "/" (the login URL is built the same way), so no leading slash.
+  url = config.url_root + "ccplus/TransactionHistoryEx.aspx?F_CTA_CARD="+c_id
+
+  # Fetch the export form first: its __VIEWSTATE value is sent back with the export request.
+  export_page = br.open(url).read()
+  view_state= export_page.split('<input type="hidden" name="__VIEWSTATE" value="')[1].split("\" />\r\n")[0]
+  file_name = 'cta_'+c_id+"_"+start_date+'_'+end_date+'.csv'
+
+  params = {
+      'AccountID':account_id,
+      'F_CTA_CARD':c_id,
+      'F_TRAN_DATE_FROM_MONTH':start_date_time.strftime("%m"),
+      'F_TRAN_DATE_FROM_DAY':start_date_time.strftime("%d"),
+      'F_TRAN_DATE_FROM_YEAR':start_date_time.strftime("%Y"),
+      'F_TRAN_DATE_TO_MONTH':end_date_time.strftime("%m"),
+      'F_TRAN_DATE_TO_DAY':end_date_time.strftime("%d"),
+      'F_TRAN_DATE_TO_YEAR':end_date_time.strftime("%Y"),
+      'F_TRAN_DISPLAY':"ALL",
+      'Search':'Export',
+      '__VIEWSTATE':view_state,
+      }
+
+  csv_dump = br.open(url, urllib.urlencode(params)).read()
+  logger.info('Dumping '+str(len(csv_dump.split("\n"))) +" lines")
+
+  logger.info('writing data to '+file_name)
+  # The with-block guarantees the file is flushed and closed after the write.
+  with open(file_name, 'w') as f:
+    f.write(csv_dump)
+
diff --git a/config.py.example b/config.py.example
@@ -0,0 +1,12 @@
+url_root = 'https://www.chicago-card.com/'
+
+email = '' # username
+password = '' #passwd
+
+log_file = "CTA"
+
+
+start_date = "08-01-2012"
+end_date = "10-01-2012"
+
+num_days = 5
diff --git a/readme.md b/readme.md