Created
March 4, 2020 01:12
-
-
Save ryanpitts/c2cb78900a8ab51c8bb68c961f10e74a to your computer and use it in GitHub Desktop.
example scraper code for NICAR 2020
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys

import bs4
import requests
# Page listing Maryland WARN (layoff) notices in a single HTML table.
URL = 'http://www.dllr.state.md.us/employment/warn.shtml'
# CSV file the scraped rows are written to (overwritten on every run).
OUTPUT_PATH = 'warn-data.csv'
# Column names for the CSV, in the same order as the table's <td> cells.
HEADERS = ['warn_date', 'naics_code', 'biz', 'address', 'wia_code', 'total_employees', 'effective_date', 'type_code']
# Columns whose text may wrap over several lines in the HTML; their internal
# whitespace is collapsed so each value stays on one CSV line.
COLLAPSED_FIELDS = ('biz', 'address')
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def normalize_whitespace(text):
    """Return *text* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed as well.
    """
    return ' '.join(text.split())


def row_to_record(cell_texts):
    """Turn the raw text of one table row's cells into a CSV record.

    Args:
        cell_texts: list of strings, one per <td> cell, in table order.

    Returns:
        A list with one cleaned value per entry in HEADERS, or None when the
        row has fewer cells than HEADERS (e.g. a header or spacer row) and
        therefore cannot be mapped to the expected columns. Cells beyond
        len(HEADERS) are ignored.
    """
    if len(cell_texts) < len(HEADERS):
        return None
    # zip() stops at the shorter sequence, so extra trailing cells are dropped.
    return [
        normalize_whitespace(text) if name in COLLAPSED_FIELDS else text.strip()
        for name, text in zip(HEADERS, cell_texts)
    ]


def main():
    """Download the WARN page and write its first table to OUTPUT_PATH.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
        SystemExit: if the page contains no <table> element.
    """
    warn_page = requests.get(URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page as data.
    warn_page.raise_for_status()

    soup = bs4.BeautifulSoup(warn_page.text, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise SystemExit('No <table> found at {}'.format(URL))
    rows = table.find_all('tr')

    # newline='' lets the csv module control line endings; an explicit
    # encoding keeps non-ASCII names from failing under a non-UTF-8 locale.
    with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(HEADERS)

        # The first <tr> is skipped, as in the original script (assumed to be
        # the table's header row); numbering starts at 2 to match the table.
        for row_number, row in enumerate(rows[1:], start=2):
            cell_texts = [cell.text for cell in row.find_all('td')]
            record = row_to_record(cell_texts)
            if record is None:
                # Report and skip malformed rows rather than crash mid-file.
                print(
                    'Skipping table row {}: expected {} cells, found {}'.format(
                        row_number, len(HEADERS), len(cell_texts)),
                    file=sys.stderr,
                )
                continue
            writer.writerow(record)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://raw.githubusercontent.com/ireapps/teaching-guide-python-scraping/master/mlb.html
http://www.dllr.state.md.us/employment/warn.shtml