Skip to content

Instantly share code, notes, and snippets.

@actongorton
Created March 29, 2016 04:27
Show Gist options
  • Save actongorton/f6009248d932cc21d728 to your computer and use it in GitHub Desktop.
Save actongorton/f6009248d932cc21d728 to your computer and use it in GitHub Desktop.
Jail Scraper
# coding: utf-8
# import python libraries
from bs4 import BeautifulSoup
import urllib2
from urlparse import urlparse, parse_qs
import re
from mechanize import Browser
import lxml
import csv
# use mechanize to create a fake form submission
browser = Browser()
browser.open('http://app.bernco.gov/custodylist/CustodyListInter.aspx?submitted=true')
browser.select_form(nr=0)
browser['DescList'] = ['ALL']
response = browser.submit()
content = response.read()
# assign the results from mechanize to BeautifulSoup for parsing
inmate_soup = BeautifulSoup(content, 'lxml')
custody_list = inmate_soup.find('table', {'rules': 'all'})
# create an array to hold each row of information about inmates
inmate_list = []
# parse through the table and add inmate information to the inmate_list
for i in custody_list.find_all('tr'):
# only retrieve rows with a hyperlink
if i.find('a', href=True):
# get all of the table data fields within the row
inmate_details = i.find_all('td')
# inmate name
inmate_name = inmate_details[0].get_text()
# inmate link
inmate_link = str(i.find('a', href=True).get('href'))
# inmate unique id
inmate_id = inmate_details[1].get_text()
# inmate booking id
inmate_booking = inmate_details[2].get_text()
# inmate birth year
inmate_yob = inmate_details[3].get_text()
# inmate age
inmate_age = inmate_details[4].get_text()
# inmate gender
inmate_gender = inmate_details[5].get_text()
# inmate race
inmate_race = inmate_details[6].get_text()
# inmate arrival date
inmate_arrival = inmate_details[7].get_text()
# inmate cell number
inmate_cell = inmate_details[8].get_text()
# inmate description field
inmate_desc = inmate_details[9].get_text()
# assign to temporary array
inmate_personal = [
inmate_name,
inmate_link,
inmate_id,
inmate_booking,
inmate_yob,
inmate_age,
inmate_gender,
inmate_race,
inmate_arrival,
inmate_cell,
inmate_desc
]
# clean up each object in array
for x in inmate_personal:
x = unicode(x).strip()
# add to inmate_list
inmate_list.append(inmate_personal)
# write the inmate personal details to a csv file
with open('inmate_details.csv', 'w') as csvfile:
# define the column names
fieldnames = [
'inmate_name',
'inmate_link',
'inmate_id',
'inmate_booking',
'inmate_yob',
'inmate_age',
'inmate_gender',
'inmate_race',
'inmate_arrival',
'inmate_cell',
'inmate_desc'
]
# instantiate csv writing object
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
# write the columns names
writer.writeheader()
# loop through inmate_list array and write the rows to the csv file:
for i in inmate_list:
writer.writerow({
'inmate_name': i[0],
'inmate_link': i[1],
'inmate_id': i[2],
'inmate_booking': i[3],
'inmate_yob': i[4],
'inmate_age': i[5],
'inmate_gender': i[6],
'inmate_race': i[7],
'inmate_arrival': i[8],
'inmate_cell': i[9],
'inmate_desc': i[10]
})
# ###################################################################################################################
# Because its possible for an inmate to have multiple arrest charges and a history or warrants and bails,
# we'll create individual files for each of those situations and then use the unique inmate id to join those
# files later on. These files can be imported into a database as "one to many" relationships (one inmate, many things)
# ###################################################################################################################
# arrest csv writer
arrest_record = open('arrest_details.csv', 'a')
# define the column names
arrest_fieldnames = [
'inmate_id',
'case_number',
'confirm',
'arrest_date',
'arrest_time',
'arrest_location',
'warrant_description',
'warrant_comment'
]
# instantiate csv writing object
arrest_writer = csv.DictWriter(arrest_record, fieldnames=arrest_fieldnames)
# write the columns names
arrest_writer.writeheader()
# write the record
def write_arrest(inmate_id, data):
arrest_writer.writerow({
'inmate_id': inmate_id,
'case_number': data[0],
'confirm': data[1],
'arrest_date': data[2],
'arrest_time': data[3],
'arrest_location': data[4],
'warrant_description': data[5],
'warrant_comment': data[6]
})
# warrant csv writer
warrant_record = open('warrant_details.csv', 'a')
# define the column names
warrant_fieldnames = [
'inmate_id',
'case_number',
'confirm',
'arrest_date',
'arrest_time',
'arrest_location',
'warrant_description',
'warrant_comment'
]
# instantiate csv writing object
warrant_writer = csv.DictWriter(warrant_record, fieldnames=warrant_fieldnames)
# write the columns names
warrant_writer.writeheader()
# write the record
def write_bail(inmate_id, data):
warrant_writer.writerow({
'inmate_id': inmate_id,
'case_number': data[0],
'confirm': data[1],
'arrest_date': data[2],
'arrest_time': data[3],
'arrest_location': data[4],
'warrant_description': data[5],
'warrant_comment': data[6]
})
# bail csv writer
bail_record = open('bail_details.csv', 'a')
# define the column names
bail_fieldnames = [
'inmate_id',
'case_number',
'bond_amount',
'bond_desc'
]
# instantiate csv writing object
bail_writer = csv.DictWriter(bail_record, fieldnames=bail_fieldnames)
# write the columns names
bail_writer.writeheader()
# write the record
def write_bail(inmate_id, data):
bail_writer.writerow({
'inmate_id': inmate_id,
'case_number': data[0],
'bond_amount': data[1],
'bond_desc': data[2]
})
# arrest charges
def arrest_charges(inmate_id, data):
# create a definition function to parse the buried details
def parse_arrests(d):
find_tables = d.find_all('table')
num_tables = int(len(find_tables)) -1
deepest_table = find_tables[num_tables]
records = deepest_table.find_all('tr')
# assign variable names
case_number = records[0].find('span').get_text()
release_type = records[1].find('span').get_text()
arrest_date = records[2].find('span').get_text()
arrest_time = records[3].find('span').get_text()
arrest_location = records[4].find('span').get_text()
statute = records[5].find('span').get_text()
description = records[6].find('span').get_text()
# write each record to csv
print '......', case_number, arrest_date, description
arrest = [
case_number,
release_type,
arrest_date,
arrest_time,
arrest_location,
statute,
description
]
write_warrant(inmate_id, arrest)
# but first we have to dig into these obnoxiously buried tables, one step at a time
first_level = data.find('table')
# go through each warrant record
arrest_record = first_level.find('tr')
while True:
try:
parse_arrests(arrest_record)
except:
pass
if arrest_record.next_sibling:
arrest_record = arrest_record.next_sibling
else:
break
# warrant history
def warrant_history(inmate_id, data):
# create a definition function to parse the buried details
def parse_warrants(d):
find_tables = d.find_all('table')
num_tables = int(len(find_tables)) -1
deepest_table = find_tables[num_tables]
records = deepest_table.find_all('tr')
# assign variable names
case_number = records[0].find('span').get_text()
confirm = records[1].find('span').get_text()
arrest_date = records[2].find('span').get_text()
arrest_time = records[3].find('span').get_text()
arrest_location = records[4].find('span').get_text()
warrant_desc = records[5].find('span').get_text()
warrant_comm = records[6].find('span').get_text()
# write each record to csv
print '......', case_number, confirm, arrest_date
warrant = [
case_number,
confirm,
arrest_date,
arrest_time,
arrest_location,
warrant_desc,
warrant_comm
]
write_warrant(inmate_id, warrant)
# but first we have to dig into these obnoxiously buried tables, one step at a time
first_level = data.find('table')
# go through each warrant record
warrant_record = first_level.find('tr')
while True:
try:
parse_warrants(warrant_record)
except:
pass
if warrant_record.next_sibling:
warrant_record = warrant_record.next_sibling
else:
break
# bail history
def bail_history(inmate_id, data):
# create a definition function to parse the buried details
def parse_bail(d):
find_tables = d.find_all('table')
num_tables = int(len(find_tables)) -1
deepest_table = find_tables[num_tables]
records = deepest_table.find_all('tr')
# assign variable names
case_number = records[0].find('span').get_text()
bond_amount = records[1].find('span').get_text()
bond_desc = records[2].find('span').get_text()
# write each record to csv
print '......', case_number, bond_amount, bond_desc
bail = [case_number, bond_amount, bond_desc]
write_bail(inmate_id, bail)
# but first we have to dig into these obnoxiously buried tables, one step at a time
first_level = data.find('table')
# go through each record
bail_record = first_level.find('tr')
while True:
try:
parse_bail(bail_record)
except:
pass
if bail_record.next_sibling:
bail_record = bail_record.next_sibling
else:
break
# flow control parsers
def parse_inmate(inmate_id, data):
# arrest charges
if data.find(id='GridView2_Panel'):
print '... gathering arrest records'
arrest_charges(inmate_id, data.find(id='GridView2_Panel'))
# warrant history
if data.find(id='GridView3_Panel'):
print '... gathering warrant records'
warrant_history(inmate_id, data.find(id='GridView3_Panel'))
# bail history
if data.find(id='GridView4_Panel'):
print '... gathering bail records'
bail_history(inmate_id, data.find(id='GridView4_Panel'))
# setup python to fetch individual inmate details
url = 'http://app.bernco.gov/custodylist/'
# loop through inmate_list and fetch details:
for i in inmate_list:
# assign variables for inmate id and link to inmate page
inmate_id = i[2]
inmate_link = i[1]
# create the complete url link to the inmate's details
inmate_url = url + inmate_link
# print a status update to let us know what's going on
print 'checking ', inmate_url
# setup beautifulsoup to parse the page
page = urllib2.urlopen(inmate_url)
soup = BeautifulSoup(page, 'lxml')
# send to parse_inmate definition
parse_inmate(inmate_id, soup)
# add a few spaces between the output to make it easier to read
print '\n\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment