Skip to content

Instantly share code, notes, and snippets.

@sinamajidian
Last active February 15, 2025 20:11
Show Gist options
  • Save sinamajidian/39f71fe3733f55ead4147a8f315aa1bc to your computer and use it in GitHub Desktop.
Save sinamajidian/39f71fe3733f55ead4147a8f315aa1bc to your computer and use it in GitHub Desktop.
PhylogeneticTree using FastOMA

PhylogeneticTree with FastOMA

Updating the code from the F1000 paper's PhylogeneticTree repository on GitHub for use with FastOMA

#!/usr/bin/env python
import logging
import sys
import argparse
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict
# slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py
logger = logging.getLogger("concat")
def load_alignments(alignmentfiles, format):
    """Parse every alignment contained in *alignmentfiles* and return them as a list.

    format -- any alignment format understood by Bio.AlignIO (e.g. 'fasta').
    Re-raises the ValueError from AlignIO (after logging it) when a file
    cannot be parsed.
    """
    loaded = []
    for path in alignmentfiles:
        try:
            for aln in AlignIO.parse(path, format=format):
                logger.debug("loaded alignment of length {} from {}".format(len(aln), path))
                loaded.append(aln)
        except ValueError as err:
            logger.error("Cannot parse input file {}: {}".format(path, err))
            raise
    logger.info("Successfully loaded {} alignments from {} input files"
                .format(len(loaded), len(alignmentfiles)))
    return loaded
def concatenate(alignments):
    """Concatenate several MSAs into one long MSA over the union of all taxa.

    Taxa missing from an individual alignment are padded with 'X'
    characters spanning that alignment's length, so every row of the
    returned MultipleSeqAlignment has the same total length.
    """
    # Union of every sequence id appearing in any input alignment.
    all_labels = set(seq.id for aln in alignments for seq in aln)
    logger.debug("extracted {} different labels in all alignments: {}"
                 .format(len(all_labels), all_labels))

    # Per-label list of string fragments, joined once at the end
    # (join is the efficient way to build long strings).
    concat_buf = defaultdict(list)
    for aln in alignments:
        length = aln.get_alignment_length()
        these_labels = set(rec.id for rec in aln)
        missing = all_labels - these_labels
        logger.debug("alignment of length {} with {} sequences, {} missing ({})"
                     .format(length, len(these_labels), len(missing), missing))
        # Pad taxa absent from this alignment with unknown residues ...
        for label in missing:
            concat_buf[label].append('X' * length)
        # ... and append the real sequences for the taxa that are present.
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))

    msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(fragments)), id=label)
                               for (label, fragments) in concat_buf.items())
    logger.info("concatenated MSA of {} taxa and total length {} created"
                .format(len(msa), len(msa[0])))
    return msa
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Concatenate alignments",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--format-in', default='fasta',
                        help="input format of the alignments. Any format that is understood"
                             "by Biopython's AlignIO module is possible.")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
                        help="Path to the output file where the concatenated multiple "
                             "sequence alignment will be written")
    parser.add_argument('-u', '--format-output', default='phylip-relaxed',
                        help="output format of the concatenated multiple sequence alignment")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    parser.add_argument('alignment', nargs='+', type=str,
                        help="Path to the alignment files. Use shell expansion to pass many files "
                             "in a simple way, e.g. \"/path/to/folder/*.fa\".")
    conf = parser.parse_args()

    # Logging defaults to warnings only; -v raises it to INFO, -d to DEBUG.
    if conf.debug:
        level = logging.DEBUG
    elif conf.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger.debug("Concatenate alignments: arguments: {}".format(conf))

    alignments = load_alignments(conf.alignment, conf.format_in.lower())
    AlignIO.write(concatenate(alignments), conf.output, conf.format_output.lower())
#!/usr/bin/env python
# slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py
import math
import multiprocessing
import argparse
import os
import Bio.SeqIO
import re
import logging
logger = logging.getLogger('filter-group')
# Matches a species name in square brackets, e.g. "[Homo sapiens] description".
# Raw string: the original non-raw "\[" is an invalid escape sequence and
# triggers a SyntaxWarning on modern Python.
RE_SPECIES = re.compile(r"\[(?P<species>[^]]*)\].*")
def get_species_from_header(header):
    """Return the species identifier embedded in a FastOMA fasta header.

    The header is expected to carry the species as the second
    '||'-separated field (e.g. 'prot_id||SPECIES||...').  Raises
    IndexError when the header contains no '||' separator.
    (Removed the dead, commented-out RE_SPECIES fallback that was here.)
    """
    return header.split("||")[1]
def convert_fasta_file(infile, outfile, min_nr_species):
    """Rewrite *infile* with species names as ids, keeping one record per species.

    Each record id is replaced by the species extracted from its header;
    only the first record seen for a species is kept.  The result is
    written to *outfile* only when at least *min_nr_species* species
    remain; otherwise the file is skipped.  Returns the number of
    sequences written (0 when skipped).
    """
    seen = set()
    records = []
    for record in Bio.SeqIO.parse(infile, 'fasta'):
        record.id = get_species_from_header(record.description)
        if record.id in seen:
            continue
        seen.add(record.id)
        records.append(record)

    if len(records) < min_nr_species:
        logger.debug("skipping \"{}\" with {} sequences".format(infile, len(records)))
        return 0

    cnts = Bio.SeqIO.write(records, outfile, 'fasta')
    logger.debug("\"{}\" contains {} sequences. IDs have been converted and file rewritten to {}"
                 .format(infile, cnts, outfile))
    return cnts
def get_matching_files(input_dir, pattern):
    """Yield os.DirEntry objects in *input_dir* whose names end with *pattern*."""
    with os.scandir(input_dir) as entries:
        for entry in entries:
            if entry.name.endswith(pattern):
                yield entry
def convert_files(input_dir, output_dir, min_nr_species, pattern='.fa', nr_processes=None):
    """Filter and convert all fasta files of *input_dir* in parallel.

    Every file in *input_dir* whose name ends with *pattern* is passed
    through convert_fasta_file; results go to *output_dir* (created if
    needed).  nr_processes defaults to the number of available cores
    (multiprocessing.Pool's default).

    Fix: the summary statistics previously raised ZeroDivisionError when
    no file passed the filter (mean over 0 files) or exactly one did
    (sample sd divides by len-1).  Both cases are now handled.
    """
    os.makedirs(output_dir, exist_ok=True)
    args_iter = ((f.path, os.path.join(output_dir, f.name), min_nr_species)
                 for f in get_matching_files(input_dir, pattern))
    with multiprocessing.Pool(processes=nr_processes) as mp:
        logger.info("converting files in {} with {} processes in parallel"
                    .format(input_dir, nr_processes if nr_processes else os.cpu_count()))
        result = mp.starmap(convert_fasta_file, args_iter)

    converted = [x for x in result if x > 0]
    if not converted:
        logger.info("0 (of {}) files passed the filter; nothing was converted"
                    .format(len(result)))
        return

    mean = sum(converted) / len(converted)
    if len(converted) > 1:
        # Sample standard deviation (n-1 denominator), computed from sums.
        sd = math.sqrt((sum(x**2 for x in converted)
                        - (sum(converted)**2) / len(converted)) / (len(converted) - 1))
    else:
        sd = 0.0
    logger.info("{} (of {}) files filtered and converted. Converted files "
                "contain {:.2f} species on average (sd {:.4f})"
                .format(len(converted), len(result), mean, sd))
class InvalidHeaderFormat(Exception):
    """Raised when a fasta header cannot be parsed for a species name."""
    pass
class SeveralSequencePerSpeciesException(Exception):
    """Raised when more than one sequence is found for a single species."""
    pass
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Filter marker genes based on how complete they cover the "
                                     "species set")
    parser.add_argument('-n', '--min-nr-species', type=int, required=True,
                        help="Specify the minimum number of species that should be "
                             "covered by each marker gene.")
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a directory containing the input files to be filtered')
    parser.add_argument('-o', '--output', required=True,
                        help="Path to the output directory where the filtered fasta files will "
                             "be stored. This directory does not need to exist. If it does, existing "
                             "files will not be removed, but might be overwritten.")
    parser.add_argument('-e', '--ext', type=str, default=".fa",
                        help="Extension of files to be considered for conversion. (default %(default)s)")
    parser.add_argument('-#', '--threads', type=int,
                        help="Nr of threads to use to filter input files in parallel. "
                             "(defaults to the number of available cores)")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    conf = parser.parse_args()

    # Logging defaults to warnings only; -v raises it to INFO, -d to DEBUG.
    level = logging.WARNING
    if conf.verbose:
        level = logging.INFO
    if conf.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # Fix: this message previously said "Concatenate alignments" — a copy-paste
    # from the sibling concatenation script; this script filters marker genes.
    logger.debug("Filter marker genes: arguments: {}".format(conf))

    convert_files(conf.input, conf.output, conf.min_nr_species,
                  pattern=conf.ext, nr_processes=conf.threads)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment