Last active
September 1, 2016 17:08
-
-
Save sestaton/751e4b352d6b8471d4708a4e09190589 to your computer and use it in GitHub Desktop.
Run tephra on Arabidopsis thaliana
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# Run tephra on Arabidopsis thaliana.

# set this to the location of muscle and vmatch
# (by default the directory the script is started from is searched first)
export PATH="$PWD:$PATH"

# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# File name of this script without its directory part, shown in the usage
# message. Parameter expansion keeps paths containing spaces intact.
script=${0##*/}
# Print the command-line synopsis and a description of each argument.
# Globals:   script (read) - program name; falls back to the basename of $0
#            when unset so the function also works under `set -u`.
# Outputs:   the usage text on stdout.
function usage() {
    # Optional arguments are shown in [brackets] to match their descriptions.
    cat <<EOF
USAGE: ${script:-${0##*/}} <repeat_db> [threads] [repeat_hmm]
repeat_db : A (nucleotide) FASTA file database of repeats, such as RepBase.
threads : The number of parallel processes to use for computations (Default: 1).
repeat_hmm : An HMM file (HMMERv3 format) of repeat domains for classification (optional).
EOF
}
# Report that the command line could not be parsed.
# Outputs:   a one-line error message on stderr, so the diagnostic does not
#            mix with regular output that a caller may capture or pipe.
function print_error() {
    cat >&2 <<ERR
ERROR: Command line not parsed correctly. Check input.
ERR
}
# Download the five TAIR10 chromosome FASTA files into the current directory,
# concatenate them (chromosome 1 to 5) into one genome file, and delete the
# per-chromosome downloads.
# Outputs:   the name of the combined file (TAIR10_chr1-5.fas) on stdout;
#            error messages on stderr.
# Returns:   0 on success, 1 if a download, the concatenation or the cleanup
#            fails.
function get_genome() {
    local url_base=ftp://ftp.arabidopsis.org/home/tair/Sequences/whole_chromosomes
    local genome=TAIR10_chr1-5.fas
    local chr_file i
    local -a chr_files=()

    for i in {1..5}; do
        chr_file=TAIR10_chr${i}.fas
        # The caller captures this function with $(...), where bash turns
        # errexit off, so each step has to be checked explicitly; otherwise a
        # failed download would silently yield a truncated genome.
        if ! curl -o "$chr_file" -sL "$url_base/$chr_file"; then
            echo "ERROR: failed to download $chr_file" >&2
            return 1
        fi
        chr_files+=("$chr_file")
    done

    cat "${chr_files[@]}" > "$genome" || return 1
    rm -- "${chr_files[@]}" || return 1
    echo "$genome"
}
# Download the tephra LTR search configuration for Arabidopsis into the
# current directory.
# Outputs:   the name of the downloaded file (tephra_ltr_config_arab.yml) on
#            stdout; error messages on stderr.
# Returns:   0 on success, 1 if the download fails.
function get_findltrs_config() {
    local config=tephra_ltr_config_arab.yml
    local url=https://gist.githubusercontent.com/sestaton/29506e2b5048440de74a146848b5b869/raw/4762ac547f52d5f5da2ff486f20da45bb0716493/tephra_ltr_config_arab.yml

    # -f makes curl fail on an HTTP error instead of saving the error page as
    # the config. The status is checked explicitly because the caller captures
    # this function with $(...), where bash turns errexit off.
    if ! curl -o "$config" -fsL "$url"; then
        echo "ERROR: failed to download $config" >&2
        return 1
    fi
    echo "$config"
}
# At least the repeat database is required; the other arguments are optional.
if [ $# -lt 1 ]; then
    print_error
    usage
    exit 1
fi

## main program
repdb=$1
# threads is documented with a default of 1; reading $2 directly would abort
# under `set -u` when only the repeat database is given.
threads=${2:-1}
# NOTE(review): hmmdb, hmms and trnas were used below but never assigned, so
# the script aborted under `set -u` unless they came from the environment.
# The HMM file is now taken from the optional third argument (falling back to
# the environment), trnas from the environment only, and each option is passed
# to tephra only when a value is available. Confirm this matches the intended
# inputs of findltrs/findtrims/findtirs.
hmmdb=${3:-${hmmdb:-}}
hmms=${hmms:-$hmmdb}
trnas=${trnas:-}

hmmdb_opt=()
if [ -n "$hmmdb" ]; then hmmdb_opt=(-d "$hmmdb"); fi
hmms_opt=()
if [ -n "$hmms" ]; then hmms_opt=(-d "$hmms"); fi
trnas_opt=()
if [ -n "$trnas" ]; then trnas_opt=(-t "$trnas"); fi

genome=$(get_genome)
config=$(get_findltrs_config)

# The downloads are the input to every later step, so stop early when either
# one is missing or empty.
if [ ! -s "$genome" ] || [ ! -s "$config" ]; then
    echo "ERROR: could not obtain the genome or the LTR search configuration." >&2
    exit 1
fi

base=${genome%.*}
ltrdir=${base}_classified_ltrs

## LTRs
# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array without
# tripping `set -u` on older bash versions.
time tephra findltrs \
    -g "$genome" \
    ${hmmdb_opt[@]+"${hmmdb_opt[@]}"} \
    ${trnas_opt[@]+"${trnas_opt[@]}"} \
    -o "${base}_tephra_ltrs.gff3" \
    -c "$config" \
    --clean

time tephra classifyltrs \
    -g "$genome" \
    -d "$repdb" \
    -t "$threads" \
    -f "${base}_tephra_ltrs.gff3" \
    -o "$ltrdir"

time tephra maskref \
    -g "$genome" \
    -d "${ltrdir}/${base}_combined_LTR_families.fasta" \
    -o "${base}_masked.fas"

## solo-LTRs
time tephra sololtr \
    -i "${ltrdir}/${base}_tephra_ltrs_copia" \
    -g "${base}_masked.fas" \
    -o "${base}_masked_copia_sololtrs.gff3" \
    -r "${base}_masked_copia_sololtr_rep.tsv" \
    -s "${base}_masked_copia_sololtr_seqs.fas" \
    -t "$threads"

# The gypsy run writes its own GFF3; it previously reused the copia file name
# and overwrote the copia result.
time tephra sololtr \
    -i "${ltrdir}/${base}_tephra_ltrs_gypsy" \
    -g "${base}_masked.fas" \
    -o "${base}_masked_gypsy_sololtrs.gff3" \
    -r "${base}_masked_gypsy_sololtr_rep.tsv" \
    -s "${base}_masked_gypsy_sololtr_seqs.fas" \
    -t "$threads"

## ltrage
time tephra ltrage \
    -g "$genome" \
    -t "$threads" \
    -o "${base}_ltrages_all.tsv" \
    -f "${ltrdir}/${base}_tephra_ltrs_families.gff3" \
    --all \
    --clean

time tephra ltrage \
    -g "$genome" \
    -t "$threads" \
    -i "${ltrdir}/${base}_tephra_ltrs_copia" \
    -o "${base}_copia_ltrages_exemp.tsv" \
    --clean

time tephra ltrage \
    -g "$genome" \
    -t "$threads" \
    -i "${ltrdir}/${base}_tephra_ltrs_unclassified" \
    -o "${base}_unclassified_ltrages.tsv" \
    -f "${ltrdir}/${base}_tephra_ltrs_families.gff3" \
    --clean

## illrecomb
time tephra illrecomb \
    -i "${ltrdir}/${base}_tephra_ltrs_copia" \
    -o "${base}_masked_copia_illrecomb.fas" \
    -r "${base}_masked_copia_illrecomb_rep.tsv" \
    -s "${base}_masked_copia_illrecomb_stats.tsv" \
    -t "$threads"

time tephra illrecomb \
    -i "${ltrdir}/${base}_tephra_ltrs_gypsy" \
    -o "${base}_masked_gypsy_illrecomb.fas" \
    -r "${base}_masked_gypsy_illrecomb_rep.tsv" \
    -s "${base}_masked_gypsy_illrecomb_stats.tsv" \
    -t "$threads"

## TRIMs
time tephra findtrims \
    -g "${base}_masked.fas" \
    ${hmms_opt[@]+"${hmms_opt[@]}"} \
    ${trnas_opt[@]+"${trnas_opt[@]}"}

time tephra maskref \
    -g "${base}_masked.fas" \
    -d "${base}_masked_trim_ltrdigest85_combined_filtered.fasta" \
    -o "${base}_masked2.fas"

## Helitrons
time tephra findhelitrons \
    -g "${base}_masked2.fas" \
    -o "${base}_masked2_helitrons.gff3"

time tephra maskref \
    -g "${base}_masked2.fas" \
    -d "${base}_masked2_tephra_hscan_helitrons.hel.fa" \
    -o "${base}_masked3.fas"

## TIR elements
# Search the genome masked in the previous step (masked3) so the file names
# agree with the ${base}_masked3_tirs.fasta database used by maskref below.
# NOTE(review): assumes tephra derives the TIR output names from the -g/-o
# names, as the other steps in this script do - confirm.
time tephra findtirs \
    -g "${base}_masked3.fas" \
    ${hmms_opt[@]+"${hmms_opt[@]}"} \
    -o "${base}_masked3_tirs.gff3"

time tephra classifytirs \
    -g "${base}_masked3.fas" \
    -f "${base}_masked3_tirs_filtered.gff3"

time tephra maskref \
    -d "${base}_masked3_tirs.fasta" \
    -g "${base}_masked3.fas" \
    -o "${base}_masked4.fas"

## non-LTRs
time tephra findnonltrs \
    -g "${base}_masked4.fas"

# Mask the same genome the non-LTR search ran on; the previous input name
# (${base}_masked_masked_masked.fas) is not produced by any step above.
time tephra maskref \
    -g "${base}_masked4.fas" \
    -d nonLTRs_out/nonLTRs_out_tephra_nonltr.fasta
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment