ashenkin · November 29, 2015 10:14 · Nov 29, 2015
diff --git a/query_higher_taxa_classes.r b/query_higher_taxa_classes.r
@@ -0,0 +1,40 @@
+query_higher_taxa_classes <- function(species_list, known = "genus", order = c("dataframe", "unique_sp")) {
+    # Pass in a character vector of species, genera, families, or whatever (the search is flexible)
+    # Returns a dataframe with the columns: query, db, family, order, subdivision
+    # The dataframe returned is guaranteed to be in the same order as the species list passed in if order is "dataframe"
+    order = match.arg(order)
+
+    library(taxize)
+    library(plyr)
+    species_list = sub("^([^ ]*).*$","\\1",species_list) # just take the top level name before the space
+    # remove short names that clog the taxon query - replace later
+        which_short = which(nchar(species_list) < 3)
+        short_names = species_list[which_short]
+        species_list[which_short] = "TEMPTAXON"
+    # agglomerate to be more efficient
+        species_list_uniq = unique(species_list)
+    if (known == "genus") {
+        highertaxa <- tax_name(species_list_uniq,get=c("family", "order", "subdivision"), db = "both")
+    } else if (known == "family") {
+        highertaxa <- tax_name(species_list_uniq,get=c("order", "subdivision"), db = "both")
+    }
+
+    highertaxa_ncbi = subset(highertaxa, db == "ncbi")
+    highertaxa_itis = subset(highertaxa, db == "itis")
+
+    merdat <- merge(highertaxa_ncbi, highertaxa_ncbi, by="query")  # seems self-documenting
+    merdat$family.y[ is.na(merdat$family.y) ] <- merdat$family.x[ is.na(merdat$family.y) ]
+    merdat$order.y[ is.na(merdat$order.y) ] <- merdat$order.x[ is.na(merdat$order.y) ]
+    merdat$subdivision.y[ is.na(merdat$subdivision.y) ] <- merdat$subdivision.x[ is.na(merdat$subdivision.y) ]
+    merdat = merdat[,c(1,grep("\\.x",names(merdat)))]
+    names(merdat) = sub("\\.x","",names(merdat))
+    highertaxa = merdat
+
+    species_list[which_short] = short_names # replace short names removed earlier
+    ret_df = join(data.frame(query = species_list), highertaxa, by = "query", type = "left") # NA's will be filled in for short names that don't have a match in highertaxa now
+    if (order == "dataframe") {
+        return(ret_df)
+    } else if (order == "unique_sp") {
+        return(ret_df[-duplicated(ret_df$query),])
+    }
+}