Last active
July 8, 2019 20:45
-
-
Save tayabsoomro/b1b8438df8dbd912b663ae0fcdc0cb23 to your computer and use it in GitHub Desktop.
Generates a CSV file containing useful data about alignment from SAM file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash | |
SAM=/isilon/saskatoon-rdc/users/soomrot/genome_comparison/minimap2/run4_pbpt3.sorted.bam | |
TOTAL_GAP_BP=0 | |
prev_end_ref=-404 | |
prev_end_query=-404 | |
prev_old_contig_name="" | |
PATH_TO_NEW_CONTIG_FILE=/isilon/saskatoon-rdc/users/soomrot/data/OX1905abc/OX1905-Assembly/run4/all_contigs.fasta | |
PATH_TO_FASTA=/home/AAFC-AAC/soomrot/software | |
PATH_TO_PBPT3=/isilon/saskatoon-rdc/users/soomrot/genome_comparison/nucmer_run/pbpt3.fasta | |
echo "Old Contig",\ | |
"Old Contig Length",\ | |
"New Contig",\ | |
"New Contig Length",\ | |
"Start in Ref",\ | |
"End in Ref",\ | |
"Length in Ref",\ | |
"Deletions from Ref",\ | |
"Insertions in Ref",\ | |
"Start in Query",\ | |
"End in Query",\ | |
"Length in Query",\ | |
"Gap length in Ref",\ | |
"Gap Sequence" | |
for contig in `cat all_contig_headers.txt`; do | |
IFS=$'\n' | |
lines=`samtools view $SAM | awk -v contig=$contig '{if($3==contig && $2!=4 && \ | |
$10 != "*")print $0}'` | |
for line in $lines; do | |
flag=$(echo $line | awk '{print $2}'); | |
cigar=$(echo $line | awk '{print $6}'); | |
old_contig=$(echo $line | awk '{print $1}'); | |
new_contig=$(echo $line | awk '{print $3}'); | |
start_ref=$(echo $line | awk '{print $4}'); | |
len_ref=$(echo $cigar | grep -oE "[0-9]{1,}[MDN]" | xargs | \ | |
sed "s/[MDN]/+/g;s/\s//g;s/.$//g;" | bc); | |
end_ref=$(echo "$start_ref+$len_ref" | bc); | |
insertions=$(echo $cigar | if [ $(grep -coE "[0-9]{1,}[I]") \ | |
-eq 0 ]; then echo "0"; else echo $cigar | \ | |
grep -oE "[0-9]{1,}I" | xargs | \ | |
sed "s/I/+/g;s/\s//g;s/.$//g;" | bc; fi); | |
deletions=$(echo $cigar | if [ $(grep -coE "[0-9]{1,}D") \ | |
-eq 0 ]; then echo "0"; else echo $cigar | grep -oE \ | |
"[0-9]{1,}D" | xargs | sed "s/D/+/g;s/\s//g;s/.$//g;" \ | |
| bc; fi); | |
tmp=$(echo $cigar | grep -oE "^[0-9]{1,}H" | sed "s/H//g;"); | |
start_query=$(echo $cigar | if [ $(grep -coE "^[0-9]{1,}H") \ | |
-eq 0 ]; then echo "1"; else echo $tmp; fi ); | |
len_query=$(echo $cigar | grep -oE "[0-9]{1,}[MI]" | xargs | \ | |
sed "s/[MI]/+/g;s/\s//g;s/.$//g;" | bc); | |
end_query=$(echo "$start_query+$len_query" | bc); | |
old_contig_length=$($PATH_TO_FASTA/fasta.py length \ | |
$PATH_TO_PBPT3 $old_contig) | |
new_contig_length=$($PATH_TO_FASTA/fasta.py length \ | |
$PATH_TO_NEW_CONTIG_FILE $new_contig) | |
res=$($PATH_TO_FASTA/getflags.py $flag 16); | |
if [ "$prev_end_ref" -eq -404 ] && [ "$prev_end_query" -eq -404 ]; then | |
gap="" | |
gap_start="" | |
gap_end="" | |
gap_start_query="" | |
gap_end_query="" | |
else | |
diff=$(echo "$start_ref-$prev_end_ref" | bc) | |
gap=${diff#-} | |
TOTAL_GAP_BP=$( bc <<< "$TOTAL_GAP_BP+$gap") | |
gap_start=$prev_end_ref | |
gap_end=$start_ref | |
gap_old_contig_name=$old_contig | |
gap_start_query=$prev_end_query | |
gap_end_query=$start_query | |
if [ "$prev_old_contig_name" = "$old_contig" ]; then | |
seq=1 | |
old_contig_name="$old_contig" | |
else seq=0; fi | |
fi | |
export prev_end_ref=$end_ref | |
export prev_end_query=$end_query | |
export prev_old_contig_name=$old_contig | |
if ! [ -z "$gap_start" ] && [ "$gap" -ne 0 ]; then | |
if ! [ -z $seq ]; then | |
abs_gap_start=$(echo "$gap_start_query-10"|bc) | |
gap_start_end=$(echo "$gap_start_query+5"|bc) | |
gap_end_start=$(echo "$gap_end_query-5"|bc) | |
abs_gap_end=$(echo "$gap_end_query+10"|bc) | |
if [ $gap_end_start -lt 1 ]; then | |
gap_end_start=1 | |
fi | |
if [ $abs_gap_start -lt 1 ]; then | |
abs_gap_start=1 | |
fi | |
if [ "$res" = "True" ]; then | |
SEQ_FIRST=$($PATH_TO_FASTA/fasta.py \ | |
record $PATH_TO_PBPT3 $old_contig_name \ | |
--range=$abs_gap_start-$gap_start_end\ | |
--revcomp | tail -n+2) | |
SEQ_SECOND=$($PATH_TO_FASTA/fasta.py \ | |
record $PATH_TO_PBPT3 $old_contig_name \ | |
--range=$gap_end_start-$abs_gap_end \ | |
--revcomp | tail -n+2) | |
else | |
SEQ_FIRST=$($PATH_TO_FASTA/fasta.py \ | |
record $PATH_TO_PBPT3 $old_contig_name \ | |
--range=$abs_gap_start-$gap_start_end\ | |
| tail -n+2) | |
SEQ_SECOND=$($PATH_TO_FASTA/fasta.py \ | |
record $PATH_TO_PBPT3 $old_contig_name \ | |
--range=$gap_end_start-$abs_gap_end \ | |
| tail -n+2) | |
fi | |
SEQ=$(echo "$SEQ_FIRST...$SEQ_SECOND") | |
fi | |
echo ,,$new_contig,$new_contig_length,$gap_start,$gap_end,$gap,,,,,,$gap,$SEQ | |
fi | |
echo $old_contig,\ | |
$old_contig_length,\ | |
$new_contig,\ | |
$new_contig_length,\ | |
$start_ref,\ | |
$end_ref,\ | |
$len_ref,\ | |
$insertions,\ | |
$deletions,\ | |
$start_query,\ | |
$end_query,\ | |
$len_query \ | |
| sed "s/\s//g" | sort -t, -n -k3,3; | |
done | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment