PhylogeneticTree with FastOMA
Updated code from the F1000 paper on phylogenetic tree building, adapted from the GitHub repository for use with FastOMA.
#!/usr/bin/env python | |
import logging | |
import sys | |
import argparse | |
from Bio import AlignIO | |
from Bio.Align import MultipleSeqAlignment | |
from Bio.Seq import Seq | |
from Bio.SeqRecord import SeqRecord | |
from collections import defaultdict | |
# slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py | |
logger = logging.getLogger("concat") | |
def load_alignments(alignmentfiles, format):
    """Parse every file in *alignmentfiles* with Bio.AlignIO.

    :param alignmentfiles: iterable of paths to alignment files.
    :param format: any alignment format understood by Bio.AlignIO.
    :returns: list of MultipleSeqAlignment objects, in input order.
    :raises ValueError: re-raised (after logging) if a file cannot be parsed.
    """
    alignments = []
    for path in alignmentfiles:
        try:
            for aln in AlignIO.parse(path, format=format):
                logger.debug("loaded alignment of length {} from {}".format(len(aln), path))
                alignments.append(aln)
        except ValueError as exc:
            # Log which file is broken, then propagate so the caller aborts.
            logger.error("Cannot parse input file {}: {}".format(path, exc))
            raise
    logger.info("Successfully loaded {} alignments from {} input files"
                .format(len(alignments), len(alignmentfiles)))
    return alignments
def concatenate(alignments):
    """Concatenate several multiple sequence alignments into one super-alignment.

    Sequences are matched across alignments by their record id.  A taxon that
    is missing from a given alignment is padded with a run of 'X' characters
    of that alignment's length, so every concatenated sequence has the same
    total length.

    :param alignments: iterable of Bio.Align.MultipleSeqAlignment objects.
    :returns: a single MultipleSeqAlignment covering the union of all taxa.
    """
    # Full set of labels (sequence ids) across all alignments.
    all_labels = set(seq.id for aln in alignments for seq in aln)
    logger.debug("extracted {} different labels in all alignments: {}"
                 .format(len(all_labels), all_labels))
    # defaultdict(list): asking for a missing key gives back an empty list,
    # so we can append per-alignment fragments without pre-registering labels.
    concat_buf = defaultdict(list)
    for aln in alignments:
        length = aln.get_alignment_length()
        # Labels absent from this alignment get an all-'X' placeholder of the
        # right length so columns stay aligned across taxa.
        these_labels = set(rec.id for rec in aln)
        missing = all_labels - these_labels
        logger.debug("alignment of length {} with {} sequences, {} missing ({})"
                     .format(length, len(these_labels), len(missing), missing))
        for label in missing:
            concat_buf[label].append('X' * length)
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))
    # Stitch the fragments with join (linear time) and rebuild the Biopython
    # structures Seq, SeqRecord and MultipleSeqAlignment.
    msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(seq_arr)), id=label)
                               for (label, seq_arr) in concat_buf.items())
    # Bug fix: use get_alignment_length() instead of len(msa[0]) so that an
    # empty input list no longer raises IndexError while logging.
    logger.info("concatenated MSA of {} taxa and total length {} created"
                .format(len(msa), msa.get_alignment_length()))
    return msa
if __name__ == "__main__":
    # Command-line entry point: parse options, configure logging, then
    # load, concatenate and write the alignments.
    parser = argparse.ArgumentParser(description="Concatenate alignments",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--format-in', default='fasta',
                        help="input format of the alignments. Any format that is understood"
                             "by Biopython's AlignIO module is possible.")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
                        help="Path to the output file where the concatenated multiple "
                             "sequence alignment will be written")
    parser.add_argument('-u', '--format-output', default='phylip-relaxed',
                        help="output format of the concatenated multiple sequence alignment")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    parser.add_argument('alignment', nargs='+', type=str,
                        help="Path to the alignment files. Use shell expansion to pass many files "
                             "in a simple way, e.g. \"/path/to/folder/*.fa\".")
    conf = parser.parse_args()

    # --debug wins over --verbose; default is warnings only.
    if conf.debug:
        level = logging.DEBUG
    elif conf.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger.debug("Concatenate alignments: arguments: {}".format(conf))

    alignments = load_alignments(conf.alignment, conf.format_in.lower())
    msa = concatenate(alignments)
    AlignIO.write(msa, conf.output, conf.format_output.lower())
#!/usr/bin/env python | |
# slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py | |
import math | |
import multiprocessing | |
import argparse | |
import os | |
import Bio.SeqIO | |
import re | |
import logging | |
logger = logging.getLogger('filter-group') | |
# Regex fallback for headers of the form "... [Species name] ...".
# Bug fix: raw string avoids the invalid '\[' escape sequence warning that
# the original non-raw pattern triggers on modern Python.
RE_SPECIES = re.compile(r"\[(?P<species>[^]]*)\].*")
def get_species_from_header(header):
    """Return the species tag from a '||'-delimited fasta header.

    The header is expected to look like ``id||SPECIES||...``; the second
    field is returned.  A header without a '||' separator raises IndexError
    (same behaviour as before).
    """
    fields = header.split("||")
    return fields[1]
def convert_fasta_file(infile, outfile, min_nr_species):
    """Rewrite a fasta file with species names as ids, keeping one sequence per species.

    Only the first sequence seen for each species is kept; later duplicates
    are silently dropped.  The file is written to *outfile* only if at least
    *min_nr_species* distinct species remain.

    :returns: number of sequences written, or 0 if the file was skipped.
    """
    species_seen = set()
    kept = []
    for rec in Bio.SeqIO.parse(infile, 'fasta'):
        rec.id = get_species_from_header(rec.description)
        if rec.id in species_seen:
            continue  # keep only the first sequence per species
        species_seen.add(rec.id)
        kept.append(rec)
    if len(kept) < min_nr_species:
        logger.debug("skipping \"{}\" with {} sequences".format(infile, len(kept)))
        return 0
    cnts = Bio.SeqIO.write(kept, outfile, 'fasta')
    logger.debug("\"{}\" contains {} sequences. IDs have been converted and file rewritten to {}"
                 .format(infile, cnts, outfile))
    return cnts
def get_matching_files(input_dir, pattern):
    """Yield os.DirEntry objects for entries of *input_dir* whose name ends with *pattern*."""
    with os.scandir(input_dir) as entries:
        for entry in entries:
            if entry.name.endswith(pattern):
                yield entry
def convert_files(input_dir, output_dir, min_nr_species, pattern='.fa', nr_processes=None):
    """Filter/convert all matching fasta files of *input_dir* in parallel.

    Each file ending in *pattern* is processed with convert_fasta_file and,
    if it covers at least *min_nr_species* species, written to *output_dir*
    (created if needed; existing files may be overwritten).

    :param nr_processes: worker count for multiprocessing.Pool
        (None means os.cpu_count()).
    """
    os.makedirs(output_dir, exist_ok=True)
    args_iter = ((f.path, os.path.join(output_dir, f.name), min_nr_species)
                 for f in get_matching_files(input_dir, pattern))
    with multiprocessing.Pool(processes=nr_processes) as mp:
        logger.info("converting files in {} with {} processes in parallel"
                    .format(input_dir, nr_processes if nr_processes else os.cpu_count()))
        result = mp.starmap(convert_fasta_file, args_iter)
    converted = [x for x in result if x > 0]
    # Bug fix: the original computed mean and sample standard deviation
    # unconditionally, raising ZeroDivisionError when zero or exactly one
    # file passed the filter.  Guard both cases explicitly.
    if converted:
        mean = sum(converted) / len(converted)
        if len(converted) > 1:
            sd = math.sqrt((sum(x ** 2 for x in converted)
                            - (sum(converted) ** 2) / len(converted))
                           / (len(converted) - 1))
        else:
            sd = 0.0  # a single sample has no spread
        logger.info("{} (of {}) files filtered and converted. Converted files "
                    "contain {:.2f} species on average (sd {:.4f})"
                    .format(len(converted), len(result), mean, sd))
    else:
        logger.info("0 (of {}) files passed the filter; nothing converted"
                    .format(len(result)))
class InvalidHeaderFormat(Exception):
    """Fasta header lacks the expected species tag (currently unused; see get_species_from_header)."""
class SeveralSequencePerSpeciesException(Exception):
    """Multiple sequences found for one species (currently never raised; duplicates are skipped)."""
if __name__ == "__main__":
    # Command-line entry point: parse options, configure logging, then
    # filter/convert the marker-gene fasta files in parallel.
    parser = argparse.ArgumentParser("Filter marker genes based on how complete they cover the "
                                     "species set")
    parser.add_argument('-n', '--min-nr-species', type=int, required=True,
                        help="Specify the minimum number of species that should be "
                             "covered by each marker gene.")
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a directory containing the input files to be filtered')
    parser.add_argument('-o', '--output', required=True,
                        help="Path to the output directory where the filtered fasta files will "
                             "be stored. This directory does not need to exist. If it does, existing "
                             "files will not be removed, but might be overwritten.")
    parser.add_argument('-e', '--ext', type=str, default=".fa",
                        help="Extension of files to be considered for conversion. (default %(default)s)")
    parser.add_argument('-#', '--threads', type=int,
                        help="Nr of threads to use to filter input files in parallel. "
                             "(defaults to the number of available cores)")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    conf = parser.parse_args()

    level = logging.WARNING
    if conf.verbose:
        level = logging.INFO
    if conf.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # Bug fix: the debug message said "Concatenate alignments" — a copy-paste
    # leftover from the concatenation script; this is the filter script.
    logger.debug("Filter marker genes: arguments: {}".format(conf))

    convert_files(conf.input, conf.output, conf.min_nr_species,
                  pattern=conf.ext, nr_processes=conf.threads)