Created
May 29, 2014 17:26
-
-
Save smdabdoub/93c235cd375bc1f8b3fc to your computer and use it in GitHub Desktop.
Method for extracting a simple Genus_species string identifier from a BIOM table with taxonomy metadata. If no species is assigned, the identifier is Genus_spp.; if no genus is assigned, it is Unclassified_ followed by the lowest available taxonomic identifier. This example script also takes a BIOM file from the command line, runs the method on every row in the table, and prints the results.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import sys | |
def _is_unassigned(level):
    """Return True if *level* is empty or only a bare rank prefix such as 'g__'."""
    return not level or (len(level) == 3 and level.endswith('__'))


def otu_name(biom_row):
    """
    Determine a simple Genus_species identifier for an OTU, if possible.

    The taxonomy is read from ``biom_row['metadata']['taxonomy']`` and is
    expected to be a list of rank-prefixed strings ordered from highest to
    lowest rank (e.g. ``'k__Bacteria'``, ..., ``'g__Roseburia'``,
    ``'s__faecis'``).  The rank is taken from the first character of a level
    and the name from the text after its last underscore.

    The name is built from the lowest assigned level, i.e. the level just
    before the first empty level (``''`` or a bare prefix like ``'s__'``), or
    the last level when none is empty:

    * species level (with a preceding level)  -> ``Genus_species``
    * genus level                             -> ``Genus_spp.``
    * any other level                         -> ``Unclassified_<name>``
    * nothing usable (empty list, no name)    -> ``Unclassified``

    A taxonomy list that simply stops at a rank is treated the same as one
    padded with empty lower levels.

    :param biom_row: a row dict of a BIOM table carrying taxonomy metadata.
    :returns: the identifier string described above.
    :raises KeyError: if the row has no ``'metadata'``/``'taxonomy'`` entry.
    """
    tax = [lvl.strip() for lvl in biom_row['metadata']['taxonomy']]
    if not tax:
        return 'Unclassified'

    # Index of the first unassigned level.  Index 0 is never a cut point, so
    # there is always a level at ``cut - 1`` to name the OTU after.
    cut = next((i for i, lvl in enumerate(tax) if i and _is_unassigned(lvl)),
               len(tax))
    lowest = tax[cut - 1]

    name = lowest.split('_')[-1]
    if not name:
        # The only candidate level is itself empty (e.g. ['k__', 'p__']).
        return 'Unclassified'

    rank = lowest[0]
    # ``cut >= 2`` guarantees a level above the species; without this guard a
    # lone species level would index tax[-1] and be paired with itself.
    if rank == 's' and cut >= 2:
        genus = tax[cut - 2].split('_')[-1]
        return genus + '_' + name
    if rank == 'g':
        return name + '_spp.'
    return 'Unclassified_' + name
# Command-line entry point: read a JSON-format BIOM table whose path is given
# as the first argument and print one OTU identifier per table row.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: %s <table.biom>' % sys.argv[0])

    # Plain text mode: the 'U' flag is rejected by open() on Python 3.11+.
    # json.load parses the whole file, so BIOM JSON that spans several lines
    # is read correctly instead of only its first line.
    with open(sys.argv[1]) as biomF:
        biom = json.load(biomF)

    for row in biom['rows']:
        # Single-argument print() behaves the same on Python 2 and 3.
        print(otu_name(row))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment