Skip to content

Instantly share code, notes, and snippets.

@jrue
Created March 16, 2018 19:06
Show Gist options
  • Save jrue/5fac9923f4d96dfbf4cbd9a0dce102d6 to your computer and use it in GitHub Desktop.
Save jrue/5fac9923f4d96dfbf4cbd9a0dce102d6 to your computer and use it in GitHub Desktop.
Fresno County Jail Blotter Scraper
#!/usr/bin/python
from bs4 import BeautifulSoup
import urllib.request as ur
import re
import json
# Scrape the Fresno County jail blotter page and dump every inmate record,
# together with that inmate's charges, to a JSON file.
BLOTTER_URL = "https://www.fresnosheriff.org/releases/jailblotter.aspx"
OUTPUT_PATH = 'data.json'

# Raw strings keep the regex escapes (\s, \n) from being read as invalid
# string escapes; compiling once hoists the pattern lookup out of the loop.
HEADER_RE = re.compile(r'^\s+?Booking#:([0-9]+)\s+?\n\s+?Name:(.*?)\s+?$')
FIRST_NAME_RE = re.compile(r',\s(.*?)$')
LAST_NAME_RE = re.compile(r'^(.*?),')

# Record keys, in the order their values appear among the class="label"
# elements of an inmate table.
ARREST_FIELDS = (
    'jid',
    'dob',
    'age',
    'sex',
    'race',
    'height',
    'weight',
    'hair',
    'eyes',
    'arrest_date',
    'booking_date',
    'release_date',
    'arresting_agency',
    'agency_case',
    'housing_location',
    'occupation',
)

# Charge keys, in the order of the <td> cells that make up one charge row.
CHARGE_FIELDS = (
    'index',
    'charge',
    'description',
    'lvl',
    'authority',
    'case_number',
    'is_bailable',
    'bail_amount',
)
CELLS_PER_CHARGE = len(CHARGE_FIELDS)

# Parse inside the ``with`` block so the HTTP response is closed as soon as
# the document has been read.
with ur.urlopen(BLOTTER_URL) as r:
    soup = BeautifulSoup(r, "html5lib")
tables = soup.find_all("table")

prisoners = []
# Tables are consumed in pairs because each inmate's charges are in the table
# adjacent to the one holding the inmate details. A trailing unpaired table
# is ignored.
table_pairs = zip(tables[::2], tables[1::2])
for pair_index, (info_table, charge_table) in enumerate(table_pairs):
    # names
    header_text = info_table.find_all("font")[0].get_text()
    header = HEADER_RE.search(header_text)
    if header is None:
        # Without a booking header this is not an inmate table, and the
        # pairing assumption above no longer holds; stop with a clear error.
        raise ValueError(
            "table pair %d: no 'Booking#/Name' header found in %r"
            % (pair_index, header_text)
        )
    full_name = header.group(2)
    first_name = FIRST_NAME_RE.search(full_name)
    last_name = LAST_NAME_RE.search(full_name)
    prisoner = {
        'booking': header.group(1),
        # A name without a ", " separator is stored whole as the last name
        # instead of aborting the entire scrape.
        'first_name': first_name.group(1) if first_name else '',
        'last_name': last_name.group(1) if last_name else full_name,
    }

    # arrest data
    # Every table after the first is read starting two "label" elements in.
    # NOTE(review): the reason is not visible here -- presumably a quirk of
    # the page markup; confirm against the live page.
    label_offset = 0 if pair_index == 0 else 2
    labels = info_table.find_all(class_="label")
    for position, field in enumerate(ARREST_FIELDS, start=label_offset):
        prisoner[field] = labels[position].get_text()

    # Charges
    cells = charge_table.find_all("td")
    # The first CELLS_PER_CHARGE cells are skipped (the original indexing
    # starts at cell 8); only complete rows after that are read.
    row_starts = range(
        CELLS_PER_CHARGE,
        len(cells) - CELLS_PER_CHARGE + 1,
        CELLS_PER_CHARGE,
    )
    prisoner['charges'] = [
        {
            field: cell.get_text().strip()
            for field, cell in zip(
                CHARGE_FIELDS, cells[start:start + CELLS_PER_CHARGE]
            )
        }
        for start in row_starts
    ]
    prisoners.append(prisoner)

#save json data TODO: capture where we left off, and append new arrests
with open(OUTPUT_PATH, 'w') as outfile:
    json.dump(prisoners, outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment