Skip to content

Instantly share code, notes, and snippets.

@DannyArends
Last active May 14, 2024 14:01
Show Gist options
  • Select an option

  • Save DannyArends/c70f21208438cd1305162f25435922f7 to your computer and use it in GitHub Desktop.

Select an option

Save DannyArends/c70f21208438cd1305162f25435922f7 to your computer and use it in GitHub Desktop.
#
# Call expression from aligned SRA reads
# copyright (c) 2022 - Danny Arends
#
library("GenomicAlignments")
library("GenomicFeatures")
library("Rsamtools")
library("preprocessCore")
library("vioplot")
library("RColorBrewer")
library("biomaRt")
# Go into the output folder
setwd("c:/Shared/")
# Create DB and exons per Gene
db <- makeTxDbFromGFF("genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf",
format = "gtf", organism = "Saccharomyces",
dataSource = "https://ftp.ensembl.org/pub/release-108/")
# Get the exons per gene, and compute bp lengths of all genes
exons <- exonsBy(db, by = "gene")
gene.lengths <- lapply(exons, function(x){ sum(width(reduce(x))) })
setwd("c:/Shared/output")
# Samples and simple names
samples <- c("SRR13978640", "SRR13978641", "SRR13978642", "SRR13978643", "SRR13978644", "SRR13978645")
names(samples) <- c("SPRC_1", "SPRC_2", "SPRC_3", "CTRL_1", "CTRL_2", "CTRL_3")
# Check if BAM files for all samples exist
files <- c()
for(s in samples){
fp <- paste0(s, ".aln/", s, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
if(file.exists(fp)) files <- c(files, fp)
}
# Load in the BAM files
bams <- BamFileList(files, yieldSize = 100000, asMates=TRUE)
# Overlap BAM reads and genes
overlap <- summarizeOverlaps(exons, bams, mode="Union", singleEnd=FALSE, ignore.strand=TRUE, fragments=TRUE)
# Extract the raw-reads per gene
readcount <- assay(overlap)
colnames(readcount) <- gsub("Aligned.sortedByCoord.RD.RG.RC.out.bam", "", colnames(readcount))
write.table(readcount, "readcount.raw.txt", sep = "\t", quote = FALSE)
# Calculate the RPKM values per gene
# RPKM = (10^9 * C)/(N * L)
# C = Number of reads mapped to a gene
# N = Total mapped reads in the sample
# L = gene length in base-pairs for a gene
# Get the total number of reads per samples
N <- apply(readcount, 2, sum)
# Loop through all genes, compute RPKM
n <- 1
RPKM <- t(apply(readcount, 1, function(C){
L <- as.numeric(gene.lengths[n])
RPKM <- (10^9 * C) / (N * L)
n <<- n + 1
return(round(RPKM, d = 1))
}))
op <- par(mar = c(8,4,2,2))
#Violin distribution plot
vioplot(RPKM, col = c(rep("orange", 3), rep("lightblue", 3)), ylab = "reads", las = 2)
legend("topleft", c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM, "RPKM.txt", sep = "\t", quote = FALSE)
# Quantile normalization of RPKM values
RPKM.norm <- round(normalize.quantiles(as.matrix(RPKM)), d = 1)
colnames(RPKM.norm) <- colnames(RPKM)
rownames(RPKM.norm) <- rownames(RPKM)
#Violin distribution plot
vioplot(RPKM.norm, col = c(rep("orange", 3), rep("lightblue",3)), ylab = "normalized", las = 2)
legend("topleft", c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM.norm, "RPKM.norm.txt", sep = "\t", quote = FALSE)
# LOG2 Qnorm RPKM (so we can treat it as microarray data)
RPKM.l2 <- round(log2(RPKM.norm), d = 1)
RPKM.l2[RPKM.l2 < 0] <- 0
#Violin distribution plot
vioplot(RPKM.l2, col = c(rep("orange", 3), rep("lightblue",3)), ylab = "log2(normalize)", las = 2)
legend("topleft", c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM.l2, "RPKM.norm.log2.txt", sep = "\t", quote = FALSE)
# P-values and Log2 fold change
pvals <- apply(RPKM.l2, 1, function(x){
tryCatch(t.test(x[1:3], x[4:6])$p.value, error = function(x){return(NA);})
})
fc <- apply(RPKM.l2, 1, function(x){
tryCatch(log2(mean(x[1:3]) / mean(x[4:6])), error = function(x){return(NA);})
})
# Assign colors based on P-values
colz <- rep("black", length(pvals))
colz[which(pvals < 5e-2)] <- "red"
colz[which(pvals < 1e-2)] <- "gold"
colz[which(pvals < 1e-3)] <- "blue"
# Volcano plot (x = fc, y = -log10(P-values))
plot(fc, -log10(pvals), col=colz, pch=18, main ="Vulcano plot", xlab="Fold Change")
legend("topleft", pch=18, c("<0.05", "<0.01", "<0.001"), col = c("red", "gold", "blue"))
# Down & Up regulated genes
down <- RPKM.l2[which(pvals < 5e-2 & fc < -0.3),]
dclust <- down[hclust(dist(down))$order,]
up <- RPKM.l2[which(pvals < 5e-2 & fc > 0.3),]
uclust <- up[hclust(dist(up))$order,]
# Gene IDs of up/Down regulated genes
geneIDs <- c(rownames(dclust), rownames(uclust))
# Custom heatmap using the spectral colors
op <- par(mar = c(8, 6, 2,1))
image(x = 1:ncol(RPKM.l2),
y = 1:(nrow(down)+nrow(up)),
z = t(rbind(dclust, uclust)),
xaxt='n',yaxt='n',xlab="", ylab="",
col = brewer.pal(11, "Spectral"))
axis(2, at = 1:length(geneIDs), labels = geneIDs, las = 2, cex.axis=0.7)
axis(1, at = 1:3, labels = colnames(RPKM.l2)[1:3], las = 2, col.axis = "orange")
axis(1, at = 4:6, labels = colnames(RPKM.l2)[4:6], las = 2, col.axis = "blue")
# Compute mean expression and standard deviations
means <- t(apply(RPKM.l2, 1, function(x){
tryCatch(round(c(mean(x[1:3]),mean(x[4:6])),1), error = function(x){return(NA);})
}))
sds <- t(apply(RPKM.l2, 1, function(x){
tryCatch(round(c(sd(x[1:3]),sd(x[4:6])),1), error = function(x){return(NA);})
}))
colnames(means) <- c("SPRC", "CTRL")
colnames(sds) <- c("SPRC", "CTRL")
# Create an overview table
overview <- cbind("CTRL" = means[, "CTRL"],
"CTRL(SD)" = sds[, "CTRL"],
"SPRC" = means[, "SPRC"],
"SPRC(SD)" = sds[, "SPRC"],
FC = round(fc,1),
P = round(pvals,6))
overview[1:10,]
# Use biomaRt to retrieve gene names, location, and description
library(biomaRt)
bio.mart <- useMart("ensembl", "scerevisiae_gene_ensembl")
mattr <- c("ensembl_gene_id", "external_gene_name",
"chromosome_name", "start_position", "end_position",
"description")
res.bm <- getBM(attributes = mattr,
filters = c("ensembl_gene_id"),
values = geneIDs, mart = bio.mart)
rownames(res.bm) <- res.bm[, "ensembl_gene_id"]
# Merge biomaRt results with the overview
p1 <- res.bm[geneIDs, c("external_gene_name", "chromosome_name", "start_position", "end_position")]
overview <- cbind(p1, overview[geneIDs,], res.bm[geneIDs, "description"])
colnames(overview)[1:4] <- c("GeneName", "Chr", "Start", "End")
colnames(overview)[11] <- c("Description")
overview[1:10,1:10]
# Write out the table
write.table(overview, "overview.ann.txt", sep = "\t", quote = FALSE)
#
# Align SRA reads to the Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
# Read the sample from the commandline
cmdlineargs <- commandArgs(trailingOnly = TRUE)
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
if(!is.na(outputfile) && file.exists(outputfile)){
cat("Output for step exists, skipping this step\n");
return("")
}
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
if(res[1] >= 1){
cat("Error external process did not finish\n\n");
if(quitOnError) q("no")
}
}
input.dir <- "/home/danny/data/raw"
input.base <- cmdlineargs[1] #"SRR13978643" # Now from the command line
output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
genome.path <- "/home/danny/genome/STAR"
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"
# Create an output folder
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }
# STEP 0 - SRA Download and Compress
setwd(input.dir)
execute(paste0("fasterq-dump ", input.base), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))
# STEP 1 - READ Trimming
trim.files <- c(
paste0(input.dir, "/", input.base,"_1.fastq.gz"),
paste0(input.dir, "/", input.base,"_2.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
)
trim.path <- "/home/danny/software/Trimmomatic"
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)
execute(trim.cmd, trim.files[3])
# STEP 1.1 - UNZIP for STAR
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])
# STEP 2 - Alignment using STAR
star.outbase <- paste0(output.dir, "/", input.base)
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")
star.exec <- "STAR --runMode alignReads"
star.opts <- paste0("--genomeDir=", genome.path, " --outSAMtype BAM SortedByCoordinate")
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
star.out <- paste0("--outFileNamePrefix ", star.outbase)
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)
execute(star.cmd, star.bam)
# STEP 2.1 - Create a samtools index
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
# STEP 2.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", star.bam))
execute(paste0("samtools coverage ", star.bam))
#STEP 3 - Remove duplicate reads using picard tools
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
metrics.out <- paste0(star.outbase, "_metrics.txt")
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
p.in <- paste0("-I ", star.bam)
p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
p.opts <- paste0("--REMOVE_DUPLICATES true")
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)
execute(p.cmd, p.bam)
# STEP 3.1 - Create a samtools index
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
# STEP 3.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", p.bam))
execute(paste0("samtools coverage ", p.bam))
# STEP 4 - Add read group (1) and sample run, library, and name
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
execute(p.cmd)
# STEP 4.1 - Create a samtools index
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))
# STEP 5 - GATK prep
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)
# STEP 5.1 - GATK BaseRecalibrator
gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
execute(gatk.cmd, gatk.cov1)
# STEP 5.2 - GATK ApplyBQSR
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
execute(gatk.cmd, recal.bam)
# STEP 5.3 - GATK BaseRecalibrator
gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
execute(gatk.cmd, gatk.cov2)
# STEP 5.4 - GATK AnalyzeCovariates
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
execute(gatk.cmd)
# STEP 6 - Index the recalibrated bam files
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))
# STEP 6.1 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", recal.bam))
execute(paste0("samtools coverage ", recal.bam))
q("no")
if(!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(
c("GenomicAlignments", "GenomicFeatures",
"Rsamtools", "biomaRt","preprocessCore")
)
install.packages("vioplot")
install.packages("RColorBrewer")
@DannyArends
Copy link
Copy Markdown
Author

DannyArends commented Oct 23, 2023 via email

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment