Created
April 27, 2020 20:19
-
-
Save dinovski/1a58805a55c5c51ce85b50b3de7f1be3 to your computer and use it in GitHub Desktop.
Add padding up and/or downstream of TSS coordinates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
BEDTOOLS=/local/build/BEDTools_2.21.0/bin | |
FEATURECOUNTS=/local/build/subread/subread-1.5.1-Linux-x86_64/bin/featureCounts | |
R=/local/build/R/R-3.4.0/bin/R | |
IDIR=/in/path | |
DIR=/out/path | |
GTF=gencode.vM13.annotation.gtf | |
CHR_LEN=mm10.chrom.sizes | |
grep -v "chrUn" $CHR_LEN | grep -v "random" > ${DIR}/mm10.chrom.sizes | |
## extract TSS coordinates and convert to BED and SAF: GeneID,Chr,Start,End,Strand | |
## add slop downstream TSS: for transcripts <= slop; use len transcript | |
## NOTE: use transcript ID (some genes have alt transcripts where len is > & < slop | |
mkdir -p ${DIR} | |
LEN=3000 | |
GENOME=mm10 | |
############### | |
## TSS based ## | |
############### | |
## Extract coord for transcripts < $LEN kb; use actual coords | |
grep -w transcript ${GTF} | awk -v len=$LEN '($5-$4 <= len)' |\ | |
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.tss.short | |
## Extract TSS coordinates for fwd strand genes > $LEN kb | |
grep -w transcript ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${ODIR}/tmp.tss.long+ | |
## Extract TSS coordinates for reverse strand genes > kb | |
grep -w transcript ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$4}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${ODIR}/tmp.tss.long- | |
## Add padding to TSS for transcripts > $LEN | |
cat ${DIR}/tmp.tss.long+ ${DIR}/tmp.tss.long- | sort -k1,1 -k 2n,2 |\ | |
${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} -s -l 0 -r $LEN > ${DIR}/tmp.tss.long | |
cat ${DIR}/tmp.tss.long ${DIR}/tmp.tss.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\ | |
grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_tss_plus_${LEN}.bed | |
rm -rf ${DIR}/tmp.tss* | |
################ | |
## gene-based ## | |
################ | |
## keep gene_id and gene_name | |
awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/gencode.vM13.gene.bed | |
GENE_BED=${DIR}/gencode.vM13.gene.bed | |
## gene bed > saf | |
awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' ${GENE_BED} > ${DIR}/gencode.vM13.gene.saf | |
## strand-specific gene start coords for plotting TSS | |
awk 'BEGIN{FS=OFS="\t"}($6=="+"){print $1,$2-1,$2,$4,$5,$6}' ${GENE_BED} > ${DIR}/tmp.bed | |
awk 'BEGIN{FS=OFS="\t"}($6=="-"){print $1,$3,$3+1,$4,$5,$6}' ${GENE_BED} >> ${DIR}/tmp.bed | |
sort -k1,1 -k2n,2 ${DIR}/tmp.bed > ${DIR}/mm10_gene_tss.bed | |
rm -rf ${DIR}/tmp.bed | |
## gene_id only | |
## TSS | |
cat ${DIR}/mm10_gene_tss.bed | tr '_' '\t' | cut -f 5 --complement > ${DIR}/mm10_gene_tss.gene_id.bed | |
##------------- | |
## gene body coordinates | |
awk '($3=="gene")' ${GTF} | cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6,$5,$4}' > ${DIR}/gencode.vM13.gene_id.bed | |
##------------- | |
## TSS +/- kb | |
GENE_TSS_BED=${DIR}/mm10_gene_tss.bed | |
${BEDTOOLS}/slopBed -i ${GENE_TSS_BED} -g ${CHR_LEN} -s -l $LEN -r $LEN | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $4,$1,$2,$3,$5,$6}' > ${DIR}/mm10_genes_$(LEN).saf | |
##------------- | |
## TSS +kb | |
## Extract coord for transcripts < $LEN; use actual coords | |
awk '($3=="gene")' ${GTF} | awk '($5-$4 <= 3000)' |\ | |
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk -v OFS='\t' '{print $1,$2,$3,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.short | |
## Extract TSS coordinates for fwd strand genes > $LEN | |
awk '($3=="gene")' ${GTF} | awk '$7=="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk '{OFS="\t"} {print $1,$2-1,$2,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long+ | |
## Extract TSS coordinates for reverse strand genes > 3kb | |
awk '($3=="gene")' ${GTF} | awk '$7!="+"' | awk -v len=$LEN '($5-$4 > len)' |\ | |
cut -f 1,4,5,7,8,9 | awk -F";" '{sub(/gene_id/,""); print $1,$3}' | sed 's/gene_name//' |\ | |
sed 's/"//g' | awk '{OFS="\t"} {print $1,$3,$3+1,$6"_"$7,$5,$4}' > ${DIR}/tmp.gene.long- | |
## add slop to TSS for genes > slop | |
SLOP_OPTS='-s -l 0 -r 3000' | |
cat ${DIR}/tmp.gene.long+ ${DIR}/tmp.gene.long- | sort -k1,1 -k 2n,2 | ${BEDTOOLS}/slopBed -i - -g ${CHR_LEN} ${SLOP_OPTS} > ${DIR}/tmp.gene.long | |
cat ${DIR}/tmp.gene.long ${DIR}/tmp.gene.short | sort -k1,1 -k2n,2 | awk '{OFS="\t"} {print $1,$2,$3,$4,$5,$6}' |\ | |
grep -v "chrX\|chrY\|chrM" > ${DIR}/${GENOME}_genes_plus_${LEN}.bed | |
rm -rf ${DIR}/tmp.gene* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment