Last active
October 1, 2023 15:59
-
-
Save MarinaGolivets/a126ab6111c7f815c4ed9ae24fde8730 to your computer and use it in GitHub Desktop.
An R function for standardising plant taxon names against the GBIF taxonomic backbone
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# as input, provide a vector of verbatim taxon names (preferably with authorship)
# and a vector of existing local identifiers for those names
# load R packages
library(dplyr)
library(tidyr)
#' Standardise plant taxon names against the GBIF taxonomic backbone
#'
#' Looks up every name with `rgbif::name_backbone_verbose()` (in parallel),
#' keeps the best match where GBIF found one below genus/higher-rank level,
#' falls back to GBIF's alternative matches (vascular plants only) for names
#' without a usable best match, and then keeps, per `taxon_id`, the
#' highest-confidence row, preferring accepted names over synonyms and
#' doubtful names.
#'
#' @param taxon_name Vector of verbatim taxon names (preferably with
#'   authorship).
#' @param taxon_id Vector of local identifiers, one per element of
#'   `taxon_name`.
#' @param include_genus If `FALSE` (default), rows whose `rank` is `"GENUS"`
#'   are dropped.
#' @return A data frame with `taxon_name` and `taxon_id` first, followed by
#'   the columns returned by GBIF for the retained matches. Ties on
#'   `confidence` are all kept, so a `taxon_id` can occur on several rows.
#'   Names without any retained match are absent from the result. For empty
#'   input, a zero-row data frame with only the two leading columns.
match_to_gbif.fn <- function(taxon_name, taxon_id, include_genus = FALSE) {
  if (length(taxon_name) != length(taxon_id)) {
    stop(
      "`taxon_name` and `taxon_id` must have the same length.",
      call. = FALSE
    )
  }
  if (length(taxon_name) == 0) {
    # nothing to look up: skip cluster start-up and the GBIF calls entirely
    return(data.frame(
      taxon_name = character(0),
      taxon_id = taxon_id[0],
      stringsAsFactors = FALSE
    ))
  }

  # perform initial matching in parallel
  no_cores <- parallel::detectCores()
  if (is.na(no_cores)) no_cores <- 1L # detectCores() may not know the answer
  no_cores <- max(1L, min(no_cores, length(taxon_name)))
  cl <- parallel::makeCluster(no_cores)
  # NOTE(review): GBIF's backbone kingdom is called "Plantae"; confirm that
  # "plants" is interpreted as intended by the name-matching service.
  all_matches <- tryCatch(
    pbapply::pblapply(
      taxon_name,
      rgbif::name_backbone_verbose,
      kingdom = "plants", strict = TRUE, cl = cl
    ),
    # always release the workers, even when a lookup fails
    finally = parallel::stopCluster(cl)
  )

  # tag each per-name table with the caller's name/id and stack them
  stack_with_ids <- function(tables) {
    mapply(
      cbind, tables,
      taxon_name = taxon_name, taxon_id = taxon_id,
      stringsAsFactors = FALSE, SIMPLIFY = FALSE
    ) %>%
      data.table::rbindlist(fill = TRUE) %>%
      distinct()
  }

  # retrieve alternative matches
  alternative_matches <- lapply(
    all_matches,
    function(x) {
      alt <- x$alternatives
      if (is.null(alt) || nrow(alt) == 0) {
        # one-row placeholder so cbind() can attach name/id; removed below
        alt <- data.frame(usageKey = NA)
      }
      alt
    }
  ) %>%
    stack_with_ids() %>%
    filter(!is.na(usageKey))

  # retrieve best matches
  best_matches <- lapply(all_matches, function(x) x$data) %>%
    stack_with_ids()

  unusable <- c("NONE", "HIGHERRANK")
  matched <- best_matches %>%
    filter(!(matchType %in% unusable))
  nonmatched <- best_matches %>%
    filter(matchType %in% unusable)

  # When no name had any alternative, the stacked table holds only the
  # placeholder columns, so the taxonomic filters below cannot be applied.
  if (all(c("phylum", "confidence") %in% names(alternative_matches))) {
    matched_alternative <- alternative_matches %>%
      filter(phylum == "Tracheophyta") %>% # use only vascular plants
      filter(confidence >= 0) %>%
      filter(taxon_id %in% nonmatched$taxon_id)
    taxon_list <- bind_rows(matched, matched_alternative)
  } else {
    taxon_list <- matched
  }

  if (nrow(taxon_list) == 0) {
    # no usable match at all; match-specific columns such as `rank` or
    # `status` may be missing here, so stop before filtering on them
    return(
      as_tibble(taxon_list) %>%
        relocate(taxon_name, taxon_id)
    )
  }

  if (!include_genus) {
    taxon_list <- taxon_list %>%
      filter(rank != "GENUS")
  }

  # flag identifiers that have at least one accepted match
  taxon_list <- taxon_list %>%
    group_by(taxon_id) %>%
    mutate(has_accepted = any(status == "ACCEPTED", na.rm = TRUE)) %>%
    ungroup()

  # get names that were matched as accepted
  accepted <- taxon_list %>%
    filter(status == "ACCEPTED") %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    ungroup()

  # get synonym matches for names without an accepted match
  synonyms <- taxon_list %>%
    filter(!has_accepted, status == "SYNONYM") %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    ungroup()

  # get doubtful matches for names without an accepted match
  doubtful <- taxon_list %>%
    filter(!has_accepted, status == "DOUBTFUL") %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    ungroup()

  # combine all names; a name with both synonym and doubtful matches keeps
  # whichever has the higher confidence
  bind_rows(accepted, synonyms, doubtful) %>%
    group_by(taxon_id) %>%
    filter(confidence == max(confidence)) %>%
    filter(status != "NONE") %>% # exclude non-matched names
    ungroup() %>%
    dplyr::select(-has_accepted) %>%
    relocate(taxon_name, taxon_id)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment