rknx · June 24, 2022 22:25
diff --git a/roary2ffn.sh b/roary2ffn.sh
 #!/bin/bash

 #########################################
 # Anuj Sharma                           #
 # [email protected]                      #
 # 2022-05-18                            #
 #########################################

 # Reading the input parameters
 OPTIND=1
 out_dir="."
 unset -v ffn_dir
 unset -v geneid_csv
 while getopts "d:c:o:hv" opt; do
    case "$opt" in
        d)  ffn_dir=$OPTARG;; # Directory containing ffns.
        c)  geneid_csv=$OPTARG;; # Path of presence/absence CSV.
        o)  out_dir=$OPTARG;;
        h)  echo "Usage: $0  -d ffn_directory  -c presence_absence_csv  [-o output file]" >&2 && exit 0;;
        v)  echo "$(basename $0): v0.2" >&2 && exit 0;;
        *)  echo "Extra arguments not used!" >&2
    esac
 done

 shift $((OPTIND-1))

 # Exit if necessary input are not given
 [[ -z "$ffn_dir" && -z "$geneid_csv" ]] && echo "Some input arguments missing." >&2 && exit 1
 [[ -z "$ffn_dir" ]] && echo "Input ffn directory path missing." >&2 && exit 1
 [[ -z "$geneid_csv" ]] && echo "Input geneID CSV path missing." >&2 && exit 1

 # Make output directory.
 mkdir -p $out_dir

 # Read presence/absence CSV.
 # Convert , to ~ is a workaround to prevent error from non-delimiter ,
 cat $geneid_csv | sed 's/","/"~"/g' | {
    # Ignore header
    read -r

    # Read row elements into array and iterate over it.
    while IFS='~' read -ra gene
    do
        # Extract name for gene/gene group.
        genename=$(echo "${gene[0]}" | tr -d '"')

        # Parse outfut fasta name from gene name. Remove if existing.
        output="$out_dir/$genename.fasta"; rm -rf $output

        # Parse the gene loci for all genomes
        # Remove carriage return if generated in windows
        genelist=( $( echo "${gene[@]:14}" | tr -d '"' | tr -d '\r' ) )

        # Iterate over individual loci
        for i in "${genelist[@]}"
        do
            # Parse ffn file name from loci name
            input="$ffn_dir/$(echo $i | awk -F ".fasta" '{print $1}').fasta_p.ffn"

            # Extract right sequence from ffn based on seqID
            awk -vRS=">" -vORS="" -vFS="\n" -vOFS="\n" -vID="$i" '(index($1, ID) != 0) {print ">"$0}' $input >> $output
        done
    done
 }
	#!/bin/bash

	#########################################
	# Anuj Sharma #
	# [email protected] #
	# 2022-05-18 #
	#########################################

	# Reading the input parameters
	OPTIND=1
	out_dir="."
	unset -v ffn_dir
	unset -v geneid_csv
	while getopts "d:c:o:hv" opt; do
	case "$opt" in
	d) ffn_dir=$OPTARG;; # Directory containing ffns.
	c) geneid_csv=$OPTARG;; # Path of presence/absence CSV.
	o) out_dir=$OPTARG;;
	h) echo "Usage: $0 -d ffn_directory -c presence_absence_csv [-o output file]" >&2 && exit 0;;
	v) echo "$(basename $0): v0.2" >&2 && exit 0;;
	*) echo "Extra arguments not used!" >&2
	esac
	done

	shift $((OPTIND-1))

	# Exit if necessary input are not given
	[[ -z "$ffn_dir" && -z "$geneid_csv" ]] && echo "Some input arguments missing." >&2 && exit 1
	[[ -z "$ffn_dir" ]] && echo "Input ffn directory path missing." >&2 && exit 1
	[[ -z "$geneid_csv" ]] && echo "Input geneID CSV path missing." >&2 && exit 1

	# Make output directory.
	mkdir -p $out_dir

	# Read presence/absence CSV.
	# Convert , to ~ is a workaround to prevent error from non-delimiter ,
	cat $geneid_csv \| sed 's/","/"~"/g' \| {
	# Ignore header
	read -r

	# Read row elements into array and iterate over it.
	while IFS='~' read -ra gene
	do
	# Extract name for gene/gene group.
	genename=$(echo "${gene[0]}" \| tr -d '"')

	# Parse outfut fasta name from gene name. Remove if existing.
	output="$out_dir/$genename.fasta"; rm -rf $output

	# Parse the gene loci for all genomes
	# Remove carriage return if generated in windows
	genelist=( $( echo "${gene[@]:14}" \| tr -d '"' \| tr -d '\r' ) )

	# Iterate over individual loci
	for i in "${genelist[@]}"
	do
	# Parse ffn file name from loci name
	input="$ffn_dir/$(echo $i \| awk -F ".fasta" '{print $1}').fasta_p.ffn"

	# Extract right sequence from ffn based on seqID
	awk -vRS=">" -vORS="" -vFS="\n" -vOFS="\n" -vID="$i" '(index($1, ID) != 0) {print ">"$0}' $input >> $output
	done
	done
	}