Last active
January 15, 2019 16:28
-
-
Save pontikos/f5ce942cecfd9b587f5bf08cd0dbf99c to your computer and use it in GitHub Desktop.
nicer cadd script than the one provided
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#CADDpath="`dirname \"$0\"`" | |
#CADDpath="`( cd \"$CADDpath/..\" && pwd )`" | |
set +x | |
CADDpath=/share/apps/genomics/CADD_v1.3/ | |
if [ -z "$VEPpath" ] ; then source ${CADDpath}/bin/config.sh; fi | |
echo $VEPpath | |
NCORES=`expr $(lscpu | grep '^CPU(s):' | cut -f2 -d: | tr -d ' ') - 2` | |
input=$1 | |
output=$2 | |
temp=${output%.vcf.gz} | |
echo prepare input | |
zcat $input | ${CADDpath}/bin/src/VCF2vepVCF.py | sort -k1,1 -k2,2n -k3,3 -k4,4 | uniq > ${temp}_clean.tmp | |
echo score pre-scored SNVs | |
cat ${temp}_clean.tmp | ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/whole_genome_SNVs.tsv.gz --found_out=${temp}_scored_snvs.tmp > ${temp}_indels.tmp | |
echo score pre-scored indels | |
cat ${temp}_indels.tmp | ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/InDels.tsv.gz --found_out=${temp}_scored_indels.tmp > ${temp}_novel.tmp | |
echo annotate novel indels with VEP | |
cat ${temp}_novel.tmp | perl ${VEPpath} --quiet --cache --offline --dir ${CADDpath}/annotations/vep/ --buffer 1000 --no_stats --species homo_sapiens --db_version=75 --format vcf --regulatory --sift b --polyphen b --per_gene --ccds --domains --numbers --canonical --total_length --force_overwrite --fork=${NCORES} --output_file ${temp}_VEP.tmp | |
echo score VEP-annotated novel indels | |
cat ${temp}_VEP.tmp | awk 'BEGIN{ FS="\t"; OFS="\t"; }{ if ($1 ~ /^#/) { if ($1 ~ /^#Up/) { sub("#","",$1); print "#Chrom","Start","End",$0 } else { print } } else { split($2,a,":"); split(a[2],b,"-"); if (length(b) == 2) { print a[1],b[1],b[2],$0 } else { print a[1],b[1],b[1],$0 } }}' | ${CADDpath}/bin/src/annotateVEP.py --all --PIPE | ${CADDpath}/bin/src/score_linModel.py --noMap --noSegDup -m ${CADDpath}/bin/src/model/model_coefficients.txt | ${CADDpath}/bin/src/max_line_hierarchy.py | ${CADDpath}/bin/src/appendPHREDscore.py -t ${CADDpath}/bin/src/model/conversion_table_ext.tsv | awk 'BEGIN{ OFS="\t"; FS="\t" }{ if ($0 !~ /^#/) { print $1,$2,$3,$5,$115,$116 } }' | uniq > ${temp}_scored_novel.tmp | |
echo combining scored output of SNVs, indels and novel indels | |
cat <(echo "## CADD v1.3 (c) University of Washington and Hudson-Alpha Institute for Biotechnology 2013-2015. All rights reserved.") <(echo -e "#CHROM\tPOS\tREF\tALT\tRawScore\tPHRED") <(cat ${temp}_scored_*.tmp | sort -k1,1 -k2,2n -k3,3 -k4,4) | /share/apps/genomics/htslib-1.1/bin/bgzip -c > ${output} | |
tabix -p vcf ${output} | |
rm ${temp}_*.tmp |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment