Skip to content

Instantly share code, notes, and snippets.

@mdzhang
Created November 25, 2024 12:15
Show Gist options
  • Save mdzhang/7f931b5caf26d843e3127138fb1a47cf to your computer and use it in GitHub Desktop.
Save mdzhang/7f931b5caf26d843e3127138fb1a47cf to your computer and use it in GitHub Desktop.
laurel.py
"""
Convert https://laurelhillphl.com/app/uploads/2024/07/Laurel-Hill-Trees-as-of-7.2.2024.pdf to CSV.
"""
import re
import tabula
import pandas as pd
import logging
# Message-only log lines at INFO level; the script logs through the "laurel" logger.
FORMAT = "%(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT, datefmt="[%X]")
logger = logging.getLogger("laurel")

# Extract the tables from every page of the PDF; the result is iterated below
# as a sequence of DataFrames.
dfs = tabula.read_pdf("laurel-hill-trees.pdf", pages="all")

real_dfs = []
columns = ['Tag Number', 'Section', 'Species']
for idx, df in enumerate(dfs):
    # A frame whose column labels are all among the expected ones is used as-is.
    if not set(df.columns).difference(columns):
        real_dfs.append(df)
        continue

    # Otherwise the labels are treated as a data row that was read as the
    # header (presumably a page without a printed header row -- confirm against
    # the PDF): put them back as a row and install the expected labels.
    try:
        df2 = df.copy()
        # '-1' is only a temporary row label; reset_index() below discards it.
        df2.loc['-1'] = df.columns
        df2.columns = columns
        df2['Tag Number'] = df2['Tag Number'].astype(int)
        # The recovered row was appended last; sorting by tag number places it
        # among the other rows of the frame.
        df2 = df2.sort_values(by='Tag Number').reset_index(drop=True)
    except ValueError as err:
        # Raised e.g. when the frame does not have exactly three columns or a
        # tag is not an integer. The frame is skipped, so its rows are absent
        # from the output; the log line is the only trace of that.
        logger.error('Err %s on idx %s', err, idx)
        continue
    real_dfs.append(df2)

combined_df = pd.concat(real_dfs, ignore_index=True)
def extract_names(row):
    """Split a row's ``Species`` text into two name fields.

    The text is expected to have the form ``"<first> (<second>)"``: whatever
    precedes the trailing parenthesised group is returned as
    ``scientific_name`` and the content of the parentheses as ``common_name``.
    Leading and trailing whitespace around the whole value is ignored.

    Args:
        row: Mapping-like object (such as a DataFrame row) with a
            ``'Species'`` entry.

    Returns:
        A dict with the keys ``'scientific_name'`` and ``'common_name'``.
        Both values are ``None`` when the entry is not a string or does not
        have the expected form; an error is logged in that case.
    """
    species = row['Species']

    match = None
    # A missing cell arrives as NaN (a float), which ``re`` cannot search;
    # treat any non-string value as "no match" instead of raising TypeError.
    if isinstance(species, str):
        match = re.match(r'^(.*?)\s*\((.*?)\)$', species.strip())

    if match is None:
        logger.error('No match found for %s', species)
        return {
            'scientific_name': None,
            'common_name': None,
        }

    scientific_name, common_name = match.groups()
    return {
        'scientific_name': scientific_name,
        'common_name': common_name,
    }
# Run extract_names on every row; result_type='expand' turns each returned
# dict into one column per key.
final_df = combined_df.apply(extract_names, axis=1, result_type='expand')

# Put the derived name columns next to the original ones and write the CSV
# without the index column.
out_df = pd.concat([combined_df, final_df], axis=1)
out_df.to_csv("laurel-hill-trees.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment