Created
March 16, 2018 19:06
-
-
Save jrue/5fac9923f4d96dfbf4cbd9a0dce102d6 to your computer and use it in GitHub Desktop.
Fresno County Jail Blotter Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import json
import re
import urllib.request as ur

from bs4 import BeautifulSoup
# Page that lists the current bookings and the file the scraped records go to.
BLOTTER_URL = "https://www.fresnosheriff.org/releases/jailblotter.aspx"
OUTPUT_PATH = "data.json"

# Raw strings so that "\s" reaches the regex engine instead of being treated
# as an (invalid) Python string escape.
HEADER_RE = re.compile(r'^\s+?Booking#:([0-9]+)\s+?\n\s+?Name:(.*?)\s+?$')
FIRST_NAME_RE = re.compile(r',\s(.*?)$')
LAST_NAME_RE = re.compile(r'^(.*?),')

# Output keys for the "label" elements of a booking table, in page order.
DETAIL_FIELDS = (
    'jid',
    'dob',
    'age',
    'sex',
    'race',
    'height',
    'weight',
    'hair',
    'eyes',
    'arrest_date',
    'booking_date',
    'release_date',
    'arresting_agency',
    'agency_case',
    'housing_location',
    'occupation',
)

# Output keys for the eight cells that make up one row of a charges table.
CHARGE_FIELDS = (
    'index',
    'charge',
    'description',
    'lvl',
    'authority',
    'case_number',
    'is_bailable',
    'bail_amount',
)


def parse_header(text: str) -> dict:
    """Extract the booking number and the split name from a header string.

    Args:
        text: Text of the first <font> element of a booking table, expected
            to look like "Booking#:<digits>" on one line and "Name:<name>"
            on the next, each surrounded by whitespace.

    Returns:
        A dict with the keys 'booking', 'first_name' and 'last_name', in
        that order.  The name is split at its first comma; a name without a
        comma is returned whole as 'last_name' with an empty 'first_name'.

    Raises:
        ValueError: If the text does not have the expected layout.
    """
    header = HEADER_RE.search(text)
    if header is None:
        raise ValueError("unrecognised booking header: %r" % (text,))
    full_name = header.group(2)
    first = FIRST_NAME_RE.search(full_name)
    last = LAST_NAME_RE.search(full_name)
    return {
        'booking': header.group(1),
        'first_name': first.group(1) if first else '',
        'last_name': last.group(1) if last else full_name,
    }


def parse_details(label_texts: list, offset: int) -> dict:
    """Map consecutive label texts onto the DETAIL_FIELDS keys.

    Args:
        label_texts: Text of every class="label" element of a booking table.
        offset: Index of the element that holds the first field ('jid').

    Returns:
        A dict keyed by DETAIL_FIELDS, in that order, with unmodified text.

    Raises:
        IndexError: If fewer than offset + 16 label texts are supplied.
    """
    return {
        field: label_texts[offset + position]
        for position, field in enumerate(DETAIL_FIELDS)
    }


def parse_charges(cell_texts: list) -> list:
    """Turn the flat cell texts of a charges table into one dict per charge.

    Cells are consumed in groups of eight.  The first group is skipped
    (presumably the column-heading row) and a trailing incomplete group is
    ignored.

    Args:
        cell_texts: Text of every <td> element of a charges table.

    Returns:
        A list of dicts keyed by CHARGE_FIELDS, values stripped of
        surrounding whitespace.  Empty when there are fewer than two
        complete groups.
    """
    width = len(CHARGE_FIELDS)
    charges = []
    for row in range(1, len(cell_texts) // width):
        start = row * width
        cells = cell_texts[start:start + width]
        charges.append(
            {field: cell.strip() for field, cell in zip(CHARGE_FIELDS, cells)}
        )
    return charges


def parse_blotter(soup) -> list:
    """Build the list of prisoner records from the parsed blotter page.

    Args:
        soup: Parsed document offering BeautifulSoup's find_all()/get_text().

    Returns:
        One dict per prisoner: the header fields, the DETAIL_FIELDS values
        and a 'charges' list, in that key order.
    """
    tables = soup.find_all("table")
    prisoners = []
    # Step by two: each prisoner's charges sit in the table right after the
    # table holding the booking details.
    for i in range(0, len(tables) - 1, 2):
        info_table = tables[i]
        charge_table = tables[i + 1]

        prisoner = parse_header(info_table.find_all("font")[0].get_text())

        # NOTE(review): the first record's values start at label 0 and every
        # later record's at label 2 - presumably later tables carry two extra
        # leading "label" elements; confirm against the live page.
        offset = 0 if i == 0 else 2
        label_texts = [
            tag.get_text() for tag in info_table.find_all(class_="label")
        ]
        prisoner.update(parse_details(label_texts, offset))

        cell_texts = [cell.get_text() for cell in charge_table.find_all("td")]
        prisoner['charges'] = parse_charges(cell_texts)

        prisoners.append(prisoner)
    return prisoners


def main() -> None:
    """Download the blotter page and write every record to OUTPUT_PATH."""
    # The response is read in full while the soup is built, so it can be
    # closed as soon as parsing is done.
    with ur.urlopen(BLOTTER_URL) as response:
        soup = BeautifulSoup(response, "html5lib")

    prisoners = parse_blotter(soup)

    #save json data TODO: capture where we left off, and append new arrests
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as outfile:
        json.dump(prisoners, outfile)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment