Last active
November 2, 2023 15:38
-
-
Save lokal-profil/0bb32eb05d7a401757e7734fde55c4f7 to your computer and use it in GitHub Desktop.
Merges the two files in wcvp.zip from Kew Gardens on plant_name_id, then splits the result by family.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from collections import defaultdict | |
from tqdm import tqdm | |
# Input files; both are parsed below as pipe-delimited CSV.
distribution_file = "wcvp_distribution.csv"
names_file = "wcvp_names.csv"

# Output path template; "{}" is filled with the family value of each group.
merge_file = "output_family/merge_{}.csv"

# Column names of the distribution file; set once that file has been opened.
fieldnames_distribution = None

# Maps plant_name_id -> list of distribution rows carrying that id.
plant_id = defaultdict(list)

demo = False  # only output matches for plant_name_id = 1 or 2
def make_writer(family, fieldnames_out):
    """Open the output file for *family* and return a writer with the header written.

    Args:
        family: Value substituted into the ``merge_file`` path template.
        fieldnames_out: Column names, in output order, handed to ``csv.DictWriter``.

    Returns:
        A ``(writer, out_file)`` tuple. The caller owns ``out_file`` and must
        close it once all rows have been written.

    Raises:
        OSError: If the output directory or file cannot be created.
    """
    import os  # local import keeps this block self-contained

    path = merge_file.format(family)

    # Create the target directory on demand so a fresh checkout does not fail
    # with FileNotFoundError on the first family.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    out_file = open(path, 'w', newline='')
    try:
        writer = csv.DictWriter(out_file, fieldnames=fieldnames_out, delimiter='|')
        writer.writeheader()
    except Exception:
        # Do not leak the handle if the writer cannot be set up.
        out_file.close()
        raise
    return writer, out_file
# Load the distribution file and index its rows by plant_name_id.
# newline="" lets the csv module do its own newline handling, as the csv
# documentation requires for files passed to a reader.
with open(distribution_file, "r", newline="") as infile_distribution:
    distribution_reader = csv.DictReader(infile_distribution, delimiter='|')
    # Remember the column names; they are appended to the output header later.
    fieldnames_distribution = distribution_reader.fieldnames
    for row in tqdm(distribution_reader, desc="Reading distribution"):
        name_id = row.get('plant_name_id')
        # In demo mode only ids '1' and '2' are kept.
        if not demo or name_id in ('1', '2'):
            plant_id[name_id].append(row)
# Maps family -> list of rows from the names file belonging to that family.
families = defaultdict(list)
fieldnames_out = None
# newline="" lets the csv module do its own newline handling, as the csv
# documentation requires for files passed to a reader.
with open(names_file, "r", newline="") as infile_names:
    names_reader = csv.DictReader(infile_names, delimiter='|')
    # Output header: all name columns first, then any distribution columns
    # not already present. `or ()` guards against an empty input file, whose
    # reader reports fieldnames as None.
    fieldnames_out = list(names_reader.fieldnames or ())
    for column in fieldnames_distribution or ():
        if column not in fieldnames_out:
            fieldnames_out.append(column)
    for row in tqdm(names_reader, desc="Reading names"):
        families[row.get('family')].append(row)
# Write one merged file per family, joining each name row with every
# distribution row that shares its plant_name_id.
for family, rows in tqdm(families.items(), desc="Processing families"):
    writer = None  # only initiate for families with hits
    out_file = None
    try:
        for row in rows:
            # .get() does not insert into the defaultdict; ids without any
            # distribution rows yield None and are skipped.
            matches = plant_id.get(row.get('plant_name_id'))
            if not matches:
                continue
            if writer is None:
                writer, out_file = make_writer(family, fieldnames_out)
            for dist in matches:
                # On shared column names the distribution value wins.
                writer.writerow(row | dist)
    finally:
        # Close the file even if writing a row fails part-way through.
        if out_file:
            out_file.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment