Created
October 20, 2020 22:34
-
-
Save eisenjulian/81dd3a1d7e2e0ea9c594082f5b737506 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pdfminer.high_level | |
import datetime | |
import requests | |
import sys | |
import os | |
import re | |
import unidecode | |
import collections | |
def split(delimiters, string, maxsplit=0):
    """Split *string* on any of several literal delimiters.

    Each delimiter is regex-escaped and the results are joined into a single
    alternation, so at any position the earliest-listed delimiter that
    matches wins; list longer delimiters before their prefixes.

    Args:
        delimiters: iterable of literal delimiter strings.
        string: the text to split.
        maxsplit: maximum number of splits to perform; 0 means no limit.

    Returns:
        The list of pieces, as produced by ``re.split``. When no non-empty
        delimiter is supplied the result is ``[string]``.
    """
    # An empty delimiter would become an empty alternative that matches at
    # every position and shreds the text into single characters.
    escaped = [re.escape(delimiter) for delimiter in delimiters if delimiter]
    if not escaped:
        return [string]
    # maxsplit is passed by keyword: the positional form is deprecated.
    return re.split('|'.join(escaped), string, maxsplit=maxsplit)
# Display names of the jurisdictions reported on, in output order.
# get_cases/get_deaths look each one up by its lower-cased, accent-stripped
# form, so the spelling here must transliterate to the text found in the PDF.
provinces = [
    "Buenos Aires", "CABA", "Catamarca", "Chaco",
    "Chubut", "Córdoba", "Corrientes", "Entre Ríos",
    "Formosa", "Jujuy", "La Pampa", "La Rioja",
    "Mendoza", "Misiones", "Neuquén", "Río Negro",
    "Salta", "San Juan", "San Luis", "Santa Cruz",
    "Santa Fe", "Santiago del Estero", "Tierra del Fuego", "Tucumán",
]
def get_cases(rows):
    """Tally the per-province counts found in ``<name> <number> | <number>`` rows.

    Each row has its periods and asterisks removed, is transliterated to
    ASCII, lower-cased and split on whitespace. A row is counted only when it
    contains a standalone ``|`` token with a decimal number directly on each
    side. The number to the left of the bar is added to the province named by
    the tokens before it; any row containing the token "ciudad" or "caba" is
    filed under "caba".

    Args:
        rows: iterable of text fragments.

    Returns:
        dict mapping every entry of ``provinces`` (original spelling) to its
        accumulated count, 0 when no row matched that province.
    """
    cases = collections.Counter()
    for row in rows:
        tokens = unidecode.unidecode(
            row.replace('.', '').replace('*', '')).lower().split()
        if '|' not in tokens:
            continue
        index = tokens.index('|')
        # The bar needs a neighbour on both sides. A leading bar would make
        # tokens[index - 1] wrap around to the last token, and a trailing bar
        # would make tokens[index + 1] raise IndexError.
        if index == 0 or index + 1 >= len(tokens):
            continue
        left, right = tokens[index - 1], tokens[index + 1]
        # isdecimal() guarantees that int() below accepts the token.
        if not left.isdecimal() or not right.isdecimal():
            continue
        if "ciudad" in tokens or "caba" in tokens:
            province = "caba"
        else:
            province = ' '.join(tokens[:index - 1])
        cases[province] += int(left)
    # Counter returns 0 for provinces that never appeared.
    return {province: cases[unidecode.unidecode(province.lower())] for province in provinces}
def get_deaths(rows):
    """Tally the per-province counts found in ``<number> residente(s) ...`` rows.

    Each row is lower-cased and split on whitespace. A row is counted only
    when it has more than five tokens, its first token is a decimal number
    and its second token is "residente" or "residentes". The number is added
    to the province formed by the tokens from position 6 onward,
    transliterated to ASCII; any row containing the token "ciudad" or "caba"
    is filed under "caba".

    NOTE(review): the fixed offset 6 presumably matches a sentence shaped like
    "<n> residentes en la provincia de <name>" -- confirm against a report.

    Args:
        rows: iterable of text fragments.

    Returns:
        dict mapping every entry of ``provinces`` (original spelling) to its
        accumulated count, 0 when no row matched that province.
    """
    deaths = collections.Counter()
    for row in rows:
        tokens = row.lower().split()
        if len(tokens) <= 5:
            continue
        count, label = tokens[0], tokens[1]
        if label not in ("residentes", "residente"):
            continue
        # isdecimal(), not isnumeric(): the latter also accepts characters
        # such as fractions and superscripts that int() rejects with
        # ValueError.
        if not count.isdecimal():
            continue
        if "ciudad" in tokens or "caba" in tokens:
            province = "caba"
        else:
            province = unidecode.unidecode(' '.join(tokens[6:]))
        deaths[province] += int(count)
    # Counter returns 0 for provinces that never appeared.
    return {province: deaths[unidecode.unidecode(province.lower())] for province in provinces}
def run():
    """Fetch one report PDF (unless already on disk) and print its totals.

    The report date comes from the single optional command-line argument, or
    defaults to today formatted as ``%d-%m-%y``. The PDF is cached in the
    current directory as ``report-<date>.pdf`` and reused when that file
    already exists. Its text is extracted, cut into fragments and passed to
    ``get_deaths`` and ``get_cases``; the per-province dicts and their sums
    are printed.

    Returns:
        None. Returns early, after printing a message, on bad usage or when
        the report cannot be downloaded.
    """
    if len(sys.argv) > 2:
        print('Usage: python crawl_report 19-10-20')
        return
    if len(sys.argv) == 2:
        formated_date = sys.argv[1]
    else:
        formated_date = datetime.datetime.now().strftime('%d-%m-%y')

    filename = f'report-{formated_date}.pdf'
    if os.path.exists(filename):
        print('File already exists')
    else:
        url = f'https://www.argentina.gob.ar/sites/default/files/{formated_date}-reporte-vespertino-covid-19.pdf'
        print('Fetching url', url)
        try:
            # Without a timeout an unresponsive server would hang the script.
            result = requests.get(url, timeout=30)
        except requests.RequestException as error:
            print('Could not fetch report:', error)
            return
        if not result.ok:
            print('Could not fetch report')
            return
        # Written only after a successful response, so a failed download
        # never leaves a file that a later run would mistake for a cache hit.
        with open(filename, 'wb') as outfile:
            outfile.write(result.content)

    # Turn line breaks into ". " so they act as fragment boundaries below.
    # NOTE(review): the first replace() target is reproduced as it appears
    # here (one space); the page flags hidden Unicode in this file, so confirm
    # the intended character (e.g. a double space) against the original.
    report = pdfminer.high_level.extract_text(filename).replace(' ', '. ').replace('\n', '. ')
    # Longer delimiters come before their prefixes so the alternation built
    # by split() prefers them.
    delimiters = "; y ", ", y ", "; ", ", ", ". ", "- ", " - ", " *"
    rows = split(delimiters, report)

    deaths = get_deaths(rows)
    cases = get_cases(rows)
    print("total deaths", sum(deaths.values()))
    print(deaths)
    print("total cases", sum(cases.values()))
    print(cases)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment