
@harperreed
Created October 11, 2012 21:00

Revisions

  1. harperreed revised this gist Oct 11, 2012. 2 changed files with 72 additions and 6 deletions.
config.py.example: 18 changes (12 additions & 6 deletions)
```diff
@@ -1,12 +1,18 @@
-url_root = 'https://www.chicago-card.com/'
+#enter dates if you want to crawl specific dates
+start_date = "08-01-2012"
+end_date = "10-01-2012"
 
+#enter the number of days you want to crawl (90 is the max)
+num_days = 90
+
+#username and password for chicago-card.com
 email = '' # username
-password = '' #passwd
+password = '' #passwd
 
-log_file = "CTA"
+#url root
+url_root = 'https://www.chicago-card.com/'
 
 
-start_date = "08-01-2012"
-end_date = "10-01-2012"
-
-num_days = 5
+#name of the logfile
+log_file = "CTA"
```
readme.md: 60 changes (60 additions & 0 deletions)
# Chicago Card crawler
A crawler for the Chicago Card Plus website.


## Why
    We need more data!

## How

Open up a terminal.

Copy `config.py.example` to `config.py`:

    `$ cp config.py.example config.py`



Edit `config.py` and fill in your info:


```python
#enter dates if you want to crawl specific dates
start_date = "08-01-2012"
end_date = "10-01-2012"

#enter the number of days you want to crawl (90 is the max)
num_days = 90

#username and password for chicago-card.com
email = '' # username
password = '' #passwd

#url root
url_root = 'https://www.chicago-card.com/'

#name of the logfile
log_file = "CTA"
```

Then run the script:

    `$ python chicago_card_crawler.py`

    Magic

```
INFO Starting crawl of Chicago Card plus
INFO start date: 10-11-2012
INFO End date: 08-01-2012
INFO Crawling 4 days
INFO Logging in to CTA
INFO Logged in to CTA
INFO Parsing cards
INFO Account id: 176009
INFO 2 cards found: ['1266731', '1392327']
INFO Crawling card id #1266731
INFO Dumping 10 lines
INFO writing data to cta_1266731_10-11-2012_08-01-2012.csv
INFO Crawling card id #1392327
INFO Dumping 11 lines
INFO writing data to cta_1392327_10-11-2012_08-01-2012.csv
```

Should work great!
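Each card's history ends up in its own plain CSV file. A quick sketch for reading an export back in (the filename is the example from the run above; the exact columns are whatever the CTA export returns, so no particular header layout is assumed):

```python
import csv

# Example filename from the sample run above.
file_name = 'cta_1266731_10-11-2012_08-01-2012.csv'

with open(file_name) as f:
    for row in csv.reader(f):
        print(row)
```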
  2. harperreed created this gist Oct 11, 2012.
.gitignore: 4 changes (4 additions & 0 deletions)
```
config.py
*.log
*.csv
*.pyc
```
chicago_card_crawler.py: 141 changes (141 additions & 0 deletions)
```python
import urllib
import sys
import logging
from datetime import datetime, timedelta

import mechanize

import config

    """
    Setup logger
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    logger = logging.getLogger(config.log_file)
    hdlr = logging.FileHandler(config.log_file+'.log')
    log_format = '%(asctime)s %(levelname)s %(message)s'
    formatter = logging.Formatter(log_format)
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(log_format)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logger.setLevel(logging.DEBUG)


    logger.info('Starting crawl of Chicago Card plus')



    end_date = "10-01-2012"

    if config.start_date:
    start_date = config.start_date
    start_date_time = datetime.strptime(start_date, "%m-%d-%Y")


    if config.end_date:
    end_date = config.end_date
    end_date_time = datetime.strptime(end_date, "%m-%d-%Y")

    if config.num_days:

    end_date_time = datetime.now()
    end_date = start_date_time.strftime("%m-%d-%Y")
    start_date_time = datetime.now() - timedelta(days=config.num_days)
    start_date = end_date_time.strftime("%m-%d-%Y")


    delta = end_date_time - start_date_time

    logger.info('start date: '+start_date)
    logger.info('End date: '+end_date)
    logger.info('Crawling '+str(delta.days)+" days")

    if delta.days >90:
    logger.error("max 90 days data available. requesting "+str(delta.days)+" days. ")
    sys.exit(1)

    br = mechanize.Browser()


    """
    Login to CTA
    """
    logger.info("Logging in to CTA")
    r = br.open(config.url_root)
    params = {
    'hdrUSERNAME':config.email,
    'hdrPassword':config.password,

    }
    data = urllib.urlencode(params)
    r = br.open(config.url_root + "login-process.aspx", data)
    page = r.read()
    logger.info("Logged in to CTA")
    """
    end logging
    """

    logger.info("Parsing cards")

    account_id = page.split('<input name="AccountID" id="AccountID" type="hidden" value="')[1].split("\" />\r\n")[0]


    logger.info("Account id: "+str(account_id))

    cards_html = page.split('"></a><b class="acct-name">')


cards = []
for c in cards_html:
    try:
        cards.append(c.split('TransactionHistoryEx.aspx?F_CTA_CARD=')[1].split('" class="view90">Export Last 90 Days')[0])
    except IndexError:
        # not every chunk of the page contains a card link; skip those
        pass

logger.info(str(len(cards)) + " cards found: " + str(cards))


for c_id in cards:
    logger.info('Crawling card id #' + c_id)
    url = config.url_root + "ccplus/TransactionHistoryEx.aspx?F_CTA_CARD=" + c_id
    export_page = br.open(url).read()
    view_state = export_page.split('<input type="hidden" name="__VIEWSTATE" value="')[1].split("\" />\r\n")[0]
    file_name = 'cta_' + c_id + "_" + start_date + '_' + end_date + '.csv'

    params = {
        'AccountID': account_id,
        'F_CTA_CARD': c_id,
        'F_TRAN_DATE_FROM_MONTH': start_date_time.strftime("%m"),
        'F_TRAN_DATE_FROM_DAY': start_date_time.strftime("%d"),
        'F_TRAN_DATE_FROM_YEAR': start_date_time.strftime("%Y"),
        'F_TRAN_DATE_TO_MONTH': end_date_time.strftime("%m"),
        'F_TRAN_DATE_TO_DAY': end_date_time.strftime("%d"),
        'F_TRAN_DATE_TO_YEAR': end_date_time.strftime("%Y"),
        'F_TRAN_DISPLAY': "ALL",
        'Search': 'Export',
        '__VIEWSTATE': view_state,
    }

    data = urllib.urlencode(params)
    r = br.open(url, data)

    csv_dump = r.read()
    logger.info('Dumping ' + str(len(csv_dump.split("\n"))) + " lines")

    logger.info('writing data to ' + file_name)
    f = open(file_name, 'w')
    f.write(csv_dump)
    f.close()
```

config.py.example: 12 changes (12 additions & 0 deletions)
```python
url_root = 'https://www.chicago-card.com/'

email = '' # username
password = '' #passwd

log_file = "CTA"


start_date = "08-01-2012"
end_date = "10-01-2012"

num_days = 5
```
    Empty file added readme.md
    Empty file.