Skip to content

Instantly share code, notes, and snippets.

@DannyArends
Last active May 14, 2024 14:01
Show Gist options
  • Save DannyArends/c70f21208438cd1305162f25435922f7 to your computer and use it in GitHub Desktop.
Save DannyArends/c70f21208438cd1305162f25435922f7 to your computer and use it in GitHub Desktop.
#
# Call expression from aligned SRA reads
# copyright (c) 2022 - Danny Arends
#
library("GenomicAlignments")
library("GenomicFeatures")
library("Rsamtools")
library("preprocessCore")
library("vioplot")
library("RColorBrewer")
library("biomaRt")
# Go into the output folder
setwd("c:/Shared/")
# Create DB and exons per Gene
db <- makeTxDbFromGFF("genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf",
format = "gtf", organism = "Saccharomyces",
dataSource = "https://ftp.ensembl.org/pub/release-108/")
# Get the exons per gene, and compute bp lengths of all genes
exons <- exonsBy(db, by = "gene")
gene.lengths <- lapply(exons, function(x){ sum(width(reduce(x))) })
setwd("c:/Shared/output")
# Samples and simple names
samples <- c("SRR13978640", "SRR13978641", "SRR13978642", "SRR13978643", "SRR13978644", "SRR13978645")
names(samples) <- c("SPRC_1", "SPRC_2", "SPRC_3", "CTRL_1", "CTRL_2", "CTRL_3")
# Check if BAM files for all samples exist
files <- c()
for(s in samples){
fp <- paste0(s, ".aln/", s, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
if(file.exists(fp)) files <- c(files, fp)
}
# Load in the BAM files
bams <- BamFileList(files, yieldSize = 100000, asMates=TRUE)
# Overlap BAM reads and genes
overlap <- summarizeOverlaps(exons, bams, mode="Union", singleEnd=FALSE, ignore.strand=TRUE, fragments=TRUE)
# Extract the raw-reads per gene
readcount <- assay(overlap)
colnames(readcount) <- gsub("Aligned.sortedByCoord.RD.RG.RC.out.bam", "", colnames(readcount))
write.table(readcount, "readcount.raw.txt", sep = "\t", quote = FALSE)
# Calculate the RPKM values per gene
# RPKM = (10^9 * C)/(N * L)
# C = Number of reads mapped to a gene
# N = Total mapped reads in the sample
# L = gene length in base-pairs for a gene
# Get the total number of reads per samples
N <- apply(readcount, 2, sum)
# Loop through all genes, compute RPKM
n <- 1
RPKM <- t(apply(readcount, 1, function(C){
L <- as.numeric(gene.lengths[n])
RPKM <- (10^9 * C) / (N * L)
n <<- n + 1
return(round(RPKM, d = 1))
}))
op <- par(mar = c(8,4,2,2))
#Violin distribution plot
vioplot(RPKM, col = c(rep("orange", 3), rep("lightblue", 3)), ylab = "reads", las = 2)
legend("topleft", c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM, "RPKM.txt", sep = "\t", quote = FALSE)
# Quantile normalization of RPKM values
RPKM.norm <- round(normalize.quantiles(as.matrix(RPKM)), d = 1)
colnames(RPKM.norm) <- colnames(RPKM)
rownames(RPKM.norm) <- rownames(RPKM)
#Violin distribution plot
vioplot(RPKM.norm, col = c(rep("orange", 3), rep("lightblue",3)), ylab = "normalized", las = 2)
legend("topleft", c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM.norm, "RPKM.norm.txt", sep = "\t", quote = FALSE)
# LOG2 Qnorm RPKM (so we can treat it as microarray data)
RPKM.l2 <- round(log2(RPKM.norm), d = 1)
RPKM.l2[RPKM.l2 < 0] <- 0
#Violin distribution plot
vioplot(RPKM.l2, col = c(rep("orange", 3), rep("lightblue",3)), ylab = "log2(normalize)", las = 2)
legend("topleft", c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM.l2, "RPKM.norm.log2.txt", sep = "\t", quote = FALSE)
# P-values and Log2 fold change
pvals <- apply(RPKM.l2, 1, function(x){
tryCatch(t.test(x[1:3], x[4:6])$p.value, error = function(x){return(NA);})
})
fc <- apply(RPKM.l2, 1, function(x){
tryCatch(log2(mean(x[1:3]) / mean(x[4:6])), error = function(x){return(NA);})
})
# Assign colors based on P-values
colz <- rep("black", length(pvals))
colz[which(pvals < 5e-2)] <- "red"
colz[which(pvals < 1e-2)] <- "gold"
colz[which(pvals < 1e-3)] <- "blue"
# Volcano plot (x = fc, y = -log10(P-values))
plot(fc, -log10(pvals), col=colz, pch=18, main ="Vulcano plot", xlab="Fold Change")
legend("topleft", pch=18, c("<0.05", "<0.01", "<0.001"), col = c("red", "gold", "blue"))
# Down & Up regulated genes
down <- RPKM.l2[which(pvals < 5e-2 & fc < -0.3),]
dclust <- down[hclust(dist(down))$order,]
up <- RPKM.l2[which(pvals < 5e-2 & fc > 0.3),]
uclust <- up[hclust(dist(up))$order,]
# Gene IDs of up/Down regulated genes
geneIDs <- c(rownames(dclust), rownames(uclust))
# Custom heatmap using the spectral colors
op <- par(mar = c(8, 6, 2,1))
image(x = 1:ncol(RPKM.l2),
y = 1:(nrow(down)+nrow(up)),
z = t(rbind(dclust, uclust)),
xaxt='n',yaxt='n',xlab="", ylab="",
col = brewer.pal(11, "Spectral"))
axis(2, at = 1:length(geneIDs), labels = geneIDs, las = 2, cex.axis=0.7)
axis(1, at = 1:3, labels = colnames(RPKM.l2)[1:3], las = 2, col.axis = "orange")
axis(1, at = 4:6, labels = colnames(RPKM.l2)[4:6], las = 2, col.axis = "blue")
# Compute mean expression and standard deviations
means <- t(apply(RPKM.l2, 1, function(x){
tryCatch(round(c(mean(x[1:3]),mean(x[4:6])),1), error = function(x){return(NA);})
}))
sds <- t(apply(RPKM.l2, 1, function(x){
tryCatch(round(c(sd(x[1:3]),sd(x[4:6])),1), error = function(x){return(NA);})
}))
colnames(means) <- c("SPRC", "CTRL")
colnames(sds) <- c("SPRC", "CTRL")
# Create an overview table
overview <- cbind("CTRL" = means[, "CTRL"],
"CTRL(SD)" = sds[, "CTRL"],
"SPRC" = means[, "SPRC"],
"SPRC(SD)" = sds[, "SPRC"],
FC = round(fc,1),
P = round(pvals,6))
overview[1:10,]
# Use biomaRt to retrieve gene names, location, and description
library(biomaRt)
bio.mart <- useMart("ensembl", "scerevisiae_gene_ensembl")
mattr <- c("ensembl_gene_id", "external_gene_name",
"chromosome_name", "start_position", "end_position",
"description")
res.bm <- getBM(attributes = mattr,
filters = c("ensembl_gene_id"),
values = geneIDs, mart = bio.mart)
rownames(res.bm) <- res.bm[, "ensembl_gene_id"]
# Merge biomaRt results with the overview
p1 <- res.bm[geneIDs, c("external_gene_name", "chromosome_name", "start_position", "end_position")]
overview <- cbind(p1, overview[geneIDs,], res.bm[geneIDs, "description"])
colnames(overview)[1:4] <- c("GeneName", "Chr", "Start", "End")
colnames(overview)[11] <- c("Description")
overview[1:10,1:10]
# Write out the table
write.table(overview, "overview.ann.txt", sep = "\t", quote = FALSE)
#
# Align SRA reads to the Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
# Read the sample from the commandline
cmdlineargs <- commandArgs(trailingOnly = TRUE)
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
if(!is.na(outputfile) && file.exists(outputfile)){
cat("Output for step exists, skipping this step\n");
return("")
}
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
if(res[1] >= 1){
cat("Error external process did not finish\n\n");
if(quitOnError) q("no")
}
}
input.dir <- "/home/danny/data/raw"
input.base <- cmdlineargs[1] #"SRR13978643" # Now from the command line
output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
genome.path <- "/home/danny/genome/STAR"
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"
# Create an output folder
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }
# STEP 0 - SRA Download and Compress
setwd(input.dir)
execute(paste0("fasterq-dump ", input.base), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))
# STEP 1 - READ Trimming
trim.files <- c(
paste0(input.dir, "/", input.base,"_1.fastq.gz"),
paste0(input.dir, "/", input.base,"_2.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
)
trim.path <- "/home/danny/software/Trimmomatic"
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)
execute(trim.cmd, trim.files[3])
# STEP 1.1 - UNZIP for STAR
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])
# STEP 2 - Alignment using STAR
star.outbase <- paste0(output.dir, "/", input.base)
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")
star.exec <- "STAR --runMode alignReads"
star.opts <- paste0("--genomeDir=", genome.path, " --outSAMtype BAM SortedByCoordinate")
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
star.out <- paste0("--outFileNamePrefix ", star.outbase)
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)
execute(star.cmd, star.bam)
# STEP 2.1 - Create a samtools index
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
# STEP 2.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", star.bam))
execute(paste0("samtools coverage ", star.bam))
#STEP 3 - Remove duplicate reads using picard tools
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
metrics.out <- paste0(star.outbase, "_metrics.txt")
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
p.in <- paste0("-I ", star.bam)
p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
p.opts <- paste0("--REMOVE_DUPLICATES true")
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)
execute(p.cmd, p.bam)
# STEP 3.1 - Create a samtools index
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
# STEP 3.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", p.bam))
execute(paste0("samtools coverage ", p.bam))
# STEP 4 - Add read group (1) and sample run, library, and name
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
execute(p.cmd)
# STEP 4.1 - Create a samtools index
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))
# STEP 5 - GATK prep
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)
# STEP 5.1 - GATK BaseRecalibrator
gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
execute(gatk.cmd, gatk.cov1)
# STEP 5.2 - GATK ApplyBQSR
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
execute(gatk.cmd, recal.bam)
# STEP 5.3 - GATK BaseRecalibrator
gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
execute(gatk.cmd, gatk.cov2)
# STEP 5.4 - GATK AnalyzeCovariates
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
execute(gatk.cmd)
# STEP 6 - Index the recalibrated bam files
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))
# STEP 6.1 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", recal.bam))
execute(paste0("samtools coverage ", recal.bam))
q("no")
if(!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(
c("GenomicAlignments", "GenomicFeatures",
"Rsamtools", "biomaRt","preprocessCore")
)
install.packages("vioplot")
install.packages("RColorBrewer")
@DannyArends
Copy link
Author

DannyArends commented Oct 23, 2023 via email

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment