Last active
October 14, 2022 05:34
-
-
Save rknx/4930a3c703c1da1a09be7f7ad5fa323e to your computer and use it in GitHub Desktop.
Intersect genes (gtf/gff) with alien_hunter output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
######################################### | |
# Anuj Sharma # | |
# [email protected] # | |
# 2022-06-24 # | |
######################################### | |
# Reading the input parameters | |
OPTIND=1 | |
OUT_FILE="" | |
unset -v GFF | |
unset -v SCO | |
POS="3,4,4,5" | |
while getopts "o:p:g:s:hv" opt; do | |
case "$opt" in | |
o) OUT_FILE=$OPTARG;; | |
p) POS=$OPTARG;; | |
g) GFF=$OPTARG;; | |
s) SCO=$OPTARG;; | |
h) echo "Usage: $0 -g GFF file -s SCO file [-o output file] [-p column positions]" >&2 && exit 0;; | |
v) echo "$(basename $0): v0.4" >&2 && exit 0;; | |
*) echo "Extra arguments not used!" >&2 | |
esac | |
done | |
shift $((OPTIND-1)) | |
# Output to STDOUT or file based on argument -o | |
[ ! -z $OUT_FILE ] && exec >$OUT_FILE | |
# Exit if necessary input are not given | |
[[ -z "$GFF" && -z "$SCO" ]] && echo "Input GFF and SCO files missing." >&2 && exit 1 | |
[[ -z "$GFF" ]] && echo "Input GFF file missing." >&2 && exit 1 | |
[[ -z "$GFF" ]] && echo "Input SCO files missing." >&2 && exit 1 | |
# Parse SCO file into awk readable table and store as temp file | |
tmpfile=$(mktemp /tmp/alien_intersect.XXXXXX) | |
exec 3>"$tmpfile" | |
grep -v "score" $SCO | sed 's/\.\./\t/g' >&3 | |
# Grab the genes within the SCO regions | |
# | |
awk -vpos=$POS ' | |
# Parse columns numbers for manual input | |
BEGIN{split(pos,a,",")} | |
# Get all start and end regions from SCO file | |
NR==FNR{start[NR] = $a[1]; end[NR] = $a[2]; next;} | |
# Loop over each region (gtf) and find genes within SCO regions | |
!/^##/{for (i=1; i<=length(start); i++) { | |
#check if gene falls within the region and print if it does | |
if ($a[3] >= start[i] && $a[4] <= end[i]) {print $0;} | |
}} | |
# Break when annotation ends and sequence begins | |
/^##FASTA/ {exit;} | |
' "$tmpfile" $GFF | uniq | |
# Remove temp file | |
rm "$tmpfile" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment