Created
November 25, 2024 12:15
-
-
Save mdzhang/7f931b5caf26d843e3127138fb1a47cf to your computer and use it in GitHub Desktop.
laurel.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Convert https://laurelhillphl.com/app/uploads/2024/07/Laurel-Hill-Trees-as-of-7.2.2024.pdf to CSV.
""" | |
import re | |
import tabula | |
import pandas as pd | |
import logging | |
# Emit only the message text for each log record.
FORMAT = "%(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT, datefmt="[%X]")
logger = logging.getLogger("laurel")

# One DataFrame per table that tabula extracts from the PDF.
dfs = tabula.read_pdf("laurel-hill-trees.pdf", pages="all")

# Tables normalised to the expected three-column layout.
real_dfs = []
columns = ['Tag Number', 'Section', 'Species']

for idx, df in enumerate(dfs):
    # No column names outside the expected set: the header was parsed
    # correctly, so the table is used as-is.
    if not set(df.columns).difference(columns):
        real_dfs.append(df)
        continue

    # Otherwise the "header" is treated as a data row that was consumed as
    # column names (presumably a page without a repeated header row - confirm
    # against the PDF). Put it back as a row and install the real header.
    try:
        df2 = df.copy()
        # '-1' is only a temporary row label; reset_index() below discards it.
        df2.loc['-1'] = df.columns
        df2.columns = columns
        # The re-inserted row holds strings, so coerce tags back to integers.
        df2['Tag Number'] = df2['Tag Number'].astype(int)
        df2 = df2.sort_values(by='Tag Number').reset_index(drop=True)
    except ValueError as err:
        # Raised e.g. when the table does not have exactly three columns or a
        # tag is not an integer. Log with traceback and skip this table rather
        # than dropping into an interactive debugger, which would hang any
        # non-interactive run.
        logger.exception('Err %s on idx %s', err, idx)
    else:
        real_dfs.append(df2)

# Stack all usable tables into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(real_dfs, ignore_index=True)
def extract_names(row):
    """Split a row's ``Species`` text into its two name parts.

    The text is expected to look like ``"<name> (<other name>)"``. Whatever
    precedes the parenthesis is returned as ``scientific_name`` and whatever
    is inside the parentheses as ``common_name``.

    Args:
        row: Mapping-like row (for example a pandas Series) with a
            ``'Species'`` entry.

    Returns:
        dict: ``{'scientific_name': ..., 'common_name': ...}``. Both values
        are ``None`` when the entry is not a string or does not match the
        expected shape; an error is logged in that case.
    """
    pattern = r'^(.*?)\s*\((.*?)\)$'
    species = row['Species']

    # Empty cells arrive as NaN (a float), which the regex engine rejects with
    # a TypeError, so only strings are matched. Surrounding whitespace is
    # stripped because the pattern is anchored at both ends.
    match = re.match(pattern, species.strip()) if isinstance(species, str) else None
    if match:
        scientific_name, common_name = match.groups()
        return {
            'scientific_name': scientific_name,
            # Key deliberately has no trailing space, so the resulting CSV
            # column header is exactly "common_name".
            'common_name': common_name,
        }

    # getLogger returns the same logger object for the same name, so this is
    # the "laurel" logger configured at module level.
    logging.getLogger("laurel").error('No match found for %s', species)
    return {
        'scientific_name': None,
        'common_name': None,
    }
# Run extract_names on every row; each returned dict is expanded into one
# column per key.
final_df = combined_df.apply(extract_names, axis=1, result_type='expand')

# Put the derived name columns next to the original columns, then write the
# result without the index column.
out_df = pd.concat((combined_df, final_df), axis=1)
out_df.to_csv("laurel-hill-trees.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment