Last active
June 24, 2022 22:25
-
-
Save rknx/4ffd9a8a4a80d1a8a3d6b56897c9b7c5 to your computer and use it in GitHub Desktop.
Extracting nucleotide sequence from ffn files following Roary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
######################################### | |
# Anuj Sharma # | |
# [email protected] # | |
# 2022-05-18 # | |
######################################### | |
# Reading the input parameters | |
OPTIND=1 | |
out_dir="." | |
unset -v ffn_dir | |
unset -v geneid_csv | |
while getopts "d:c:o:hv" opt; do | |
case "$opt" in | |
d) ffn_dir=$OPTARG;; # Directory containing ffns. | |
c) geneid_csv=$OPTARG;; # Path of presence/absence CSV. | |
o) out_dir=$OPTARG;; | |
h) echo "Usage: $0 -d ffn_directory -c presence_absence_csv [-o output file]" >&2 && exit 0;; | |
v) echo "$(basename $0): v0.2" >&2 && exit 0;; | |
*) echo "Extra arguments not used!" >&2 | |
esac | |
done | |
shift $((OPTIND-1)) | |
# Exit if necessary input are not given | |
[[ -z "$ffn_dir" && -z "$geneid_csv" ]] && echo "Some input arguments missing." >&2 && exit 1 | |
[[ -z "$ffn_dir" ]] && echo "Input ffn directory path missing." >&2 && exit 1 | |
[[ -z "$geneid_csv" ]] && echo "Input geneID CSV path missing." >&2 && exit 1 | |
# Make output directory. | |
mkdir -p $out_dir | |
# Read presence/absence CSV. | |
# Convert , to ~ is a workaround to prevent error from non-delimiter , | |
cat $geneid_csv | sed 's/","/"~"/g' | { | |
# Ignore header | |
read -r | |
# Read row elements into array and iterate over it. | |
while IFS='~' read -ra gene | |
do | |
# Extract name for gene/gene group. | |
genename=$(echo "${gene[0]}" | tr -d '"') | |
# Parse outfut fasta name from gene name. Remove if existing. | |
output="$out_dir/$genename.fasta"; rm -rf $output | |
# Parse the gene loci for all genomes | |
# Remove carriage return if generated in windows | |
genelist=( $( echo "${gene[@]:14}" | tr -d '"' | tr -d '\r' ) ) | |
# Iterate over individual loci | |
for i in "${genelist[@]}" | |
do | |
# Parse ffn file name from loci name | |
input="$ffn_dir/$(echo $i | awk -F ".fasta" '{print $1}').fasta_p.ffn" | |
# Extract right sequence from ffn based on seqID | |
awk -vRS=">" -vORS="" -vFS="\n" -vOFS="\n" -vID="$i" '(index($1, ID) != 0) {print ">"$0}' $input >> $output | |
done | |
done | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment