Skip to content

Instantly share code, notes, and snippets.

@ashenkin
Created November 29, 2015 10:14

Revisions

  1. ashenkin created this gist Nov 29, 2015.
    40 changes: 40 additions & 0 deletions query_higher_taxa_classes.r
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,40 @@
    query_higher_taxa_classes <- function(species_list, known = "genus", order = c("dataframe", "unique_sp")) {
    # Pass in a character vector of species, genera, families, or whatever (the search is flexible)
    # Returns a dataframe with the columns: query, db, family, order, subdivision
    # The dataframe returned is guaranteed to be in the same order as the species list passed in if order is "dataframe"
    order = match.arg(order)

    library(taxize)
    library(plyr)
    species_list = sub("^([^ ]*).*$","\\1",species_list) # just take the top level name before the space
    # remove short names that clog the taxon query - replace later
    which_short = which(nchar(species_list) < 3)
    short_names = species_list[which_short]
    species_list[which_short] = "TEMPTAXON"
    # agglomerate to be more efficient
    species_list_uniq = unique(species_list)
    if (known == "genus") {
    highertaxa <- tax_name(species_list_uniq,get=c("family", "order", "subdivision"), db = "both")
    } else if (known == "family") {
    highertaxa <- tax_name(species_list_uniq,get=c("order", "subdivision"), db = "both")
    }

    highertaxa_ncbi = subset(highertaxa, db == "ncbi")
    highertaxa_itis = subset(highertaxa, db == "itis")

    merdat <- merge(highertaxa_ncbi, highertaxa_ncbi, by="query") # seems self-documenting
    merdat$family.y[ is.na(merdat$family.y) ] <- merdat$family.x[ is.na(merdat$family.y) ]
    merdat$order.y[ is.na(merdat$order.y) ] <- merdat$order.x[ is.na(merdat$order.y) ]
    merdat$subdivision.y[ is.na(merdat$subdivision.y) ] <- merdat$subdivision.x[ is.na(merdat$subdivision.y) ]
    merdat = merdat[,c(1,grep("\\.x",names(merdat)))]
    names(merdat) = sub("\\.x","",names(merdat))
    highertaxa = merdat

    species_list[which_short] = short_names # replace short names removed earlier
    ret_df = join(data.frame(query = species_list), highertaxa, by = "query", type = "left") # NA's will be filled in for short names that don't have a match in highertaxa now
    if (order == "dataframe") {
    return(ret_df)
    } else if (order == "unique_sp") {
    return(ret_df[-duplicated(ret_df$query),])
    }
    }