Skip to content

Instantly share code, notes, and snippets.

@sinamajidian
Last active February 15, 2025 20:11
Show Gist options
  • Save sinamajidian/39f71fe3733f55ead4147a8f315aa1bc to your computer and use it in GitHub Desktop.
Save sinamajidian/39f71fe3733f55ead4147a8f315aa1bc to your computer and use it in GitHub Desktop.
PhylogeneticTree using FastOMA

PhylogeneticTree with FastOMA

Updating the code from the F1000 paper's PhylogeneticTree repository on GitHub for use with FastOMA

#!/usr/bin/env python
import logging
import sys
import argparse
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import defaultdict
# slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py
logger = logging.getLogger("concat")
def load_alignments(alignmentfiles, format):
    """Parse every alignment contained in *alignmentfiles* and return them as a list.

    format -- any alignment format understood by Bio.AlignIO (e.g. 'fasta').
    Re-raises the ValueError from AlignIO (after logging it) when a file
    cannot be parsed.
    """
    loaded = []
    for path in alignmentfiles:
        try:
            for aln in AlignIO.parse(path, format=format):
                logger.debug("loaded alignment of length {} from {}".format(len(aln), path))
                loaded.append(aln)
        except ValueError as err:
            logger.error("Cannot parse input file {}: {}".format(path, err))
            raise
    logger.info("Successfully loaded {} alignments from {} input files"
                .format(len(loaded), len(alignmentfiles)))
    return loaded
def concatenate(alignments):
    """Concatenate several MSAs into one long MSA over the union of all taxa.

    Taxa missing from an individual alignment are padded with 'X'
    characters spanning that alignment's length, so every row of the
    returned MultipleSeqAlignment has the same total length.
    """
    # Union of every sequence id appearing in any input alignment.
    all_labels = set(seq.id for aln in alignments for seq in aln)
    logger.debug("extracted {} different labels in all alignments: {}"
                 .format(len(all_labels), all_labels))

    # Per-label list of string fragments, joined once at the end
    # (join is the efficient way to build long strings).
    concat_buf = defaultdict(list)
    for aln in alignments:
        length = aln.get_alignment_length()
        these_labels = set(rec.id for rec in aln)
        missing = all_labels - these_labels
        logger.debug("alignment of length {} with {} sequences, {} missing ({})"
                     .format(length, len(these_labels), len(missing), missing))
        # Pad taxa absent from this alignment with unknown residues ...
        for label in missing:
            concat_buf[label].append('X' * length)
        # ... and append the real sequences for the taxa that are present.
        for rec in aln:
            concat_buf[rec.id].append(str(rec.seq))

    msa = MultipleSeqAlignment(SeqRecord(Seq(''.join(fragments)), id=label)
                               for (label, fragments) in concat_buf.items())
    logger.info("concatenated MSA of {} taxa and total length {} created"
                .format(len(msa), len(msa[0])))
    return msa
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Concatenate alignments",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-f', '--format-in', default='fasta',
                        help="input format of the alignments. Any format that is understood"
                             "by Biopython's AlignIO module is possible.")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'), default=sys.stdout,
                        help="Path to the output file where the concatenated multiple "
                             "sequence alignment will be written")
    parser.add_argument('-u', '--format-output', default='phylip-relaxed',
                        help="output format of the concatenated multiple sequence alignment")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    parser.add_argument('alignment', nargs='+', type=str,
                        help="Path to the alignment files. Use shell expansion to pass many files "
                             "in a simple way, e.g. \"/path/to/folder/*.fa\".")
    conf = parser.parse_args()

    # Logging defaults to warnings only; -v raises it to INFO, -d to DEBUG.
    if conf.debug:
        level = logging.DEBUG
    elif conf.verbose:
        level = logging.INFO
    else:
        level = logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger.debug("Concatenate alignments: arguments: {}".format(conf))

    alignments = load_alignments(conf.alignment, conf.format_in.lower())
    AlignIO.write(concatenate(alignments), conf.output, conf.format_output.lower())
#!/usr/bin/env python
# slight changes of the code from https://github.com/DessimozLab/f1000_PhylogeneticTree/blob/master/concat_alignments.py
import math
import multiprocessing
import argparse
import os
import Bio.SeqIO
import re
import logging
logger = logging.getLogger('filter-group')
# Matches a species name in square brackets, e.g. "[Homo sapiens] description".
# Raw string: the original non-raw "\[" is an invalid escape sequence and
# triggers a SyntaxWarning on modern Python.
RE_SPECIES = re.compile(r"\[(?P<species>[^]]*)\].*")
def get_species_from_header(header):
    """Return the species identifier embedded in a FastOMA fasta header.

    The header is expected to carry the species as the second
    '||'-separated field (e.g. 'prot_id||SPECIES||...').  Raises
    IndexError when the header contains no '||' separator.
    (Removed the dead, commented-out RE_SPECIES fallback that was here.)
    """
    return header.split("||")[1]
def convert_fasta_file(infile, outfile, min_nr_species):
    """Rewrite *infile* with species names as ids, keeping one record per species.

    Each record id is replaced by the species extracted from its header;
    only the first record seen for a species is kept.  The result is
    written to *outfile* only when at least *min_nr_species* species
    remain; otherwise the file is skipped.  Returns the number of
    sequences written (0 when skipped).
    """
    seen = set()
    records = []
    for record in Bio.SeqIO.parse(infile, 'fasta'):
        record.id = get_species_from_header(record.description)
        if record.id in seen:
            continue
        seen.add(record.id)
        records.append(record)

    if len(records) < min_nr_species:
        logger.debug("skipping \"{}\" with {} sequences".format(infile, len(records)))
        return 0

    cnts = Bio.SeqIO.write(records, outfile, 'fasta')
    logger.debug("\"{}\" contains {} sequences. IDs have been converted and file rewritten to {}"
                 .format(infile, cnts, outfile))
    return cnts
def get_matching_files(input_dir, pattern):
    """Yield os.DirEntry objects in *input_dir* whose names end with *pattern*."""
    with os.scandir(input_dir) as entries:
        for entry in entries:
            if entry.name.endswith(pattern):
                yield entry
def convert_files(input_dir, output_dir, min_nr_species, pattern='.fa', nr_processes=None):
    """Filter and convert all fasta files of *input_dir* in parallel.

    Every file in *input_dir* whose name ends with *pattern* is passed
    through convert_fasta_file; results go to *output_dir* (created if
    needed).  nr_processes defaults to the number of available cores
    (multiprocessing.Pool's default).

    Fix: the summary statistics previously raised ZeroDivisionError when
    no file passed the filter (mean over 0 files) or exactly one did
    (sample sd divides by len-1).  Both cases are now handled.
    """
    os.makedirs(output_dir, exist_ok=True)
    args_iter = ((f.path, os.path.join(output_dir, f.name), min_nr_species)
                 for f in get_matching_files(input_dir, pattern))
    with multiprocessing.Pool(processes=nr_processes) as mp:
        logger.info("converting files in {} with {} processes in parallel"
                    .format(input_dir, nr_processes if nr_processes else os.cpu_count()))
        result = mp.starmap(convert_fasta_file, args_iter)

    converted = [x for x in result if x > 0]
    if not converted:
        logger.info("0 (of {}) files passed the filter; nothing was converted"
                    .format(len(result)))
        return

    mean = sum(converted) / len(converted)
    if len(converted) > 1:
        # Sample standard deviation (n-1 denominator), computed from sums.
        sd = math.sqrt((sum(x**2 for x in converted)
                        - (sum(converted)**2) / len(converted)) / (len(converted) - 1))
    else:
        sd = 0.0
    logger.info("{} (of {}) files filtered and converted. Converted files "
                "contain {:.2f} species on average (sd {:.4f})"
                .format(len(converted), len(result), mean, sd))
class InvalidHeaderFormat(Exception):
    """Raised when a fasta header cannot be parsed for a species name."""
    pass
class SeveralSequencePerSpeciesException(Exception):
    """Raised when more than one sequence is found for a single species."""
    pass
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Filter marker genes based on how complete they cover the "
                                     "species set")
    parser.add_argument('-n', '--min-nr-species', type=int, required=True,
                        help="Specify the minimum number of species that should be "
                             "covered by each marker gene.")
    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a directory containing the input files to be filtered')
    parser.add_argument('-o', '--output', required=True,
                        help="Path to the output directory where the filtered fasta files will "
                             "be stored. This directory does not need to exist. If it does, existing "
                             "files will not be removed, but might be overwritten.")
    parser.add_argument('-e', '--ext', type=str, default=".fa",
                        help="Extension of files to be considered for conversion. (default %(default)s)")
    parser.add_argument('-#', '--threads', type=int,
                        help="Nr of threads to use to filter input files in parallel. "
                             "(defaults to the number of available cores)")
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help="Produce some output and status reports")
    parser.add_argument('-d', '--debug', action="store_true", default=False,
                        help="Be more verbose for debugging purposes")
    conf = parser.parse_args()

    # Logging defaults to warnings only; -v raises it to INFO, -d to DEBUG.
    level = logging.WARNING
    if conf.verbose:
        level = logging.INFO
    if conf.debug:
        level = logging.DEBUG
    logging.basicConfig(level=level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # Fix: this message previously said "Concatenate alignments" — a copy-paste
    # from the sibling concatenation script; this script filters marker genes.
    logger.debug("Filter marker genes: arguments: {}".format(conf))

    convert_files(conf.input, conf.output, conf.min_nr_species,
                  pattern=conf.ext, nr_processes=conf.threads)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment