Last active
May 14, 2024 14:01
-
-
Save DannyArends/c70f21208438cd1305162f25435922f7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Call expression from aligned SRA reads
# copyright (c) 2022 - Danny Arends
#
library("GenomicAlignments") | |
library("GenomicFeatures") | |
library("Rsamtools") | |
library("preprocessCore") | |
library("vioplot") | |
library("RColorBrewer") | |
library("biomaRt") | |
# Work from the shared project folder; the annotation path below is relative to it
setwd("c:/Shared/")

# Build the transcript database from the Ensembl (release 108) GTF annotation
db <- makeTxDbFromGFF(
  "genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf",
  format = "gtf",
  organism = "Saccharomyces",
  dataSource = "https://ftp.ensembl.org/pub/release-108/"
)

# Group the exons by gene. The length of a gene is the summed width of its
# exons after reduce() has merged them, so shared bases are counted once
exons <- exonsBy(db, by = "gene")
gene.lengths <- lapply(exons, function(gene.exons){
  merged <- reduce(gene.exons)
  sum(width(merged))
})

# From here on relative paths (BAM input, result tables) resolve against the output folder
setwd("c:/Shared/output")
# Samples (SRA run accessions) and simple names
samples <- c("SRR13978640", "SRR13978641", "SRR13978642", "SRR13978643", "SRR13978644", "SRR13978645")
names(samples) <- c("SPRC_1", "SPRC_2", "SPRC_3", "CTRL_1", "CTRL_2", "CTRL_3")

# Check if BAM files for all samples exist
# Expected location (relative to the working directory):
#   <run>.aln/<run>Aligned.sortedByCoord.RD.RG.RC.out.bam
bam.paths <- paste0(samples, ".aln/", samples, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
bam.found <- file.exists(bam.paths)

# The analysis further down addresses the samples by column position
# (1:3 versus 4:6), so a silently dropped BAM would shift the groups.
# Keep the original behaviour of continuing, but make the gap visible
if(!all(bam.found)){
  warning("Missing BAM file(s) for: ", paste(samples[!bam.found], collapse = ", "), call. = FALSE)
}
files <- bam.paths[bam.found]
# Open the BAM files as mate-paired input, read with a yieldSize of 100000
bams <- BamFileList(files, yieldSize = 100000, asMates = TRUE)

# Count the fragments overlapping the exons of each gene
# ("Union" mode, paired-end, strand information ignored)
overlap <- summarizeOverlaps(
  exons, bams,
  mode = "Union",
  singleEnd = FALSE,
  ignore.strand = TRUE,
  fragments = TRUE
)

# Raw count matrix (genes x samples); strip the BAM suffix from the column
# names so that only the leading part of each file name remains
readcount <- assay(overlap)
colnames(readcount) <- gsub(
  "Aligned.sortedByCoord.RD.RG.RC.out.bam", "",
  colnames(readcount)
)
write.table(readcount, file = "readcount.raw.txt", sep = "\t", quote = FALSE)
# Calculate the RPKM values per gene
#   RPKM = (10^9 * C) / (N * L)
#   C = Number of reads mapped to a gene
#   N = Total mapped reads in the sample
#   L = gene length in base-pairs for a gene

# Total number of counted reads per sample (column sums of the count matrix)
N <- colSums(readcount)

# Gene lengths, matched to the rows of the count matrix by gene ID instead of
# by position, so the result does not rely on row order or a global counter
gene.bp <- as.numeric(unlist(gene.lengths[rownames(readcount)]))
stopifnot(length(gene.bp) == nrow(readcount))

# outer(gene.bp, N)[i, j] is L_i * N_j, the denominator for gene i in sample j.
# Working on the whole matrix keeps the genes x samples shape (and dimnames)
# even when there is only a single sample
RPKM <- round((10^9 * readcount) / outer(gene.bp, N), digits = 1)
# Larger bottom margin for the rotated (las = 2) sample labels
op <- par(mar = c(8, 4, 2, 2))

# Violin plot of the RPKM distribution per sample
# (first three columns orange, last three light blue)
vioplot(RPKM,
        col = rep(c("orange", "lightblue"), each = 3),
        ylab = "reads", las = 2)
legend("topleft", legend = c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM, file = "RPKM.txt", sep = "\t", quote = FALSE)

# Quantile normalization of RPKM values
# normalize.quantiles() returns a bare matrix, so the dimnames are copied back
RPKM.norm <- normalize.quantiles(as.matrix(RPKM))
RPKM.norm <- round(RPKM.norm, digits = 1)
rownames(RPKM.norm) <- rownames(RPKM)
colnames(RPKM.norm) <- colnames(RPKM)

# Violin plot of the normalized distribution per sample
vioplot(RPKM.norm,
        col = rep(c("orange", "lightblue"), each = 3),
        ylab = "normalized", las = 2)
legend("topleft", legend = c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM.norm, file = "RPKM.norm.txt", sep = "\t", quote = FALSE)

# LOG2 Qnorm RPKM (so we can treat it as microarray data)
# Negative results (values below 1, including log2(0) = -Inf) are set to 0
RPKM.l2 <- round(log2(RPKM.norm), digits = 1)
RPKM.l2[RPKM.l2 < 0] <- 0

# Violin plot of the log2 distribution per sample
vioplot(RPKM.l2,
        col = rep(c("orange", "lightblue"), each = 3),
        ylab = "log2(normalize)", las = 2)
legend("topleft", legend = c("SPRC", "CTRL"), fill = c("orange", "lightblue"))
write.table(RPKM.l2, file = "RPKM.norm.log2.txt", sep = "\t", quote = FALSE)
# P-values and Log2 fold change
# Columns 1:3 are compared against columns 4:6 (SPRC versus CTRL)

# Two-sample t-test per gene (t.test() defaults, i.e. unequal variances);
# genes for which the test raises an error get NA
pvals <- apply(RPKM.l2, 1, function(x){
  tryCatch(t.test(x[1:3], x[4:6])$p.value, error = function(e){ NA })
})

# RPKM.l2 is already on the log2 scale, so the log2 fold change is the
# difference between the group means. Taking log2() of the ratio of two log
# values is not a fold change and yields Inf/NaN when a group mean is 0
fc <- rowMeans(RPKM.l2[, 1:3, drop = FALSE]) - rowMeans(RPKM.l2[, 4:6, drop = FALSE])

# Assign colors based on P-values; later assignments overwrite earlier ones,
# so each gene ends up with the color of its strictest threshold
colz <- rep("black", length(pvals))
colz[which(pvals < 5e-2)] <- "red"
colz[which(pvals < 1e-2)] <- "gold"
colz[which(pvals < 1e-3)] <- "blue"

# Volcano plot (x = fc, y = -log10(P-values))
plot(fc, -log10(pvals), col = colz, pch = 18, main = "Volcano plot", xlab = "log2 Fold Change")
legend("topleft", pch = 18, c("<0.05", "<0.01", "<0.001"), col = c("red", "gold", "blue"))
# Down & Up regulated genes (P < 0.05 and fc below -0.3 / above 0.3)
# drop = FALSE keeps a matrix even when only a single gene passes the filter
down <- RPKM.l2[which(pvals < 5e-2 & fc < -0.3), , drop = FALSE]
up <- RPKM.l2[which(pvals < 5e-2 & fc > 0.3), , drop = FALSE]

# Order the rows of each set by hierarchical clustering. hclust() needs at
# least two rows, so smaller sets are kept in their current order
dclust <- down
if(nrow(down) > 1) dclust <- down[hclust(dist(down))$order, , drop = FALSE]
uclust <- up
if(nrow(up) > 1) uclust <- up[hclust(dist(up))$order, , drop = FALSE]

# Gene IDs of up/Down regulated genes (down-regulated first)
geneIDs <- c(rownames(dclust), rownames(uclust))
if(length(geneIDs) == 0){
  stop("No up- or down-regulated genes passed the thresholds, nothing to plot", call. = FALSE)
}

# Custom heatmap using the spectral colors
# x = samples, y = genes in the order of geneIDs
op <- par(mar = c(8, 6, 2, 1))
image(x = seq_len(ncol(RPKM.l2)),
      y = seq_along(geneIDs),
      z = t(rbind(dclust, uclust)),
      xaxt = 'n', yaxt = 'n', xlab = "", ylab = "",
      col = brewer.pal(11, "Spectral"))
axis(2, at = seq_along(geneIDs), labels = geneIDs, las = 2, cex.axis = 0.7)
axis(1, at = 1:3, labels = colnames(RPKM.l2)[1:3], las = 2, col.axis = "orange")
axis(1, at = 4:6, labels = colnames(RPKM.l2)[4:6], las = 2, col.axis = "blue")
# Per-gene group summaries of the log2 values, rounded to one decimal.
# Each row yields c(<columns 1:3>, <columns 4:6>), i.e. c(SPRC, CTRL)
means <- t(apply(RPKM.l2, 1, function(gene){
  tryCatch({
    sprc <- mean(gene[1:3])
    ctrl <- mean(gene[4:6])
    round(c(sprc, ctrl), 1)
  }, error = function(e){ NA })
}))
sds <- t(apply(RPKM.l2, 1, function(gene){
  tryCatch({
    sprc <- sd(gene[1:3])
    ctrl <- sd(gene[4:6])
    round(c(sprc, ctrl), 1)
  }, error = function(e){ NA })
}))
colnames(sds) <- c("SPRC", "CTRL")
colnames(means) <- c("SPRC", "CTRL")

# Overview table: mean and SD per group, fold change and P-value per gene
overview <- cbind(
  "CTRL" = means[, "CTRL"],
  "CTRL(SD)" = sds[, "CTRL"],
  "SPRC" = means[, "SPRC"],
  "SPRC(SD)" = sds[, "SPRC"],
  FC = round(fc, 1),
  P = round(pvals, 6)
)
# Show the first ten rows
overview[1:10, ]
# Annotate the selected genes through biomaRt: gene name, location, description
library(biomaRt)
bio.mart <- useMart("ensembl", "scerevisiae_gene_ensembl")

# Attributes to retrieve for every Ensembl gene ID in geneIDs
mattr <- c(
  "ensembl_gene_id",
  "external_gene_name",
  "chromosome_name",
  "start_position",
  "end_position",
  "description"
)
res.bm <- getBM(
  attributes = mattr,
  filters = "ensembl_gene_id",
  values = geneIDs,
  mart = bio.mart
)
# Index the result by gene ID so it can be looked up in geneIDs order
rownames(res.bm) <- res.bm[, "ensembl_gene_id"]

# Merge biomaRt results with the overview:
# name and location first, then the statistics, the description last
p1 <- res.bm[geneIDs, c("external_gene_name", "chromosome_name", "start_position", "end_position")]
overview <- cbind(p1, overview[geneIDs, ], res.bm[geneIDs, "description"])
colnames(overview)[1:4] <- c("GeneName", "Chr", "Start", "End")
colnames(overview)[11] <- "Description"
# Show the first ten rows without the description column
overview[1:10, 1:10]

# Write out the table
write.table(overview, file = "overview.ann.txt", sep = "\t", quote = FALSE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Align SRA reads to the Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
# Read the sample from the commandline
cmdlineargs <- commandArgs(trailingOnly = TRUE)

# Run an external command through system(), optionally skipping it when the
# file it produces is already there.
#
#   x           : the command line to execute
#   outputfile  : file produced by the command; when it already exists the
#                 command is not run and "" is returned (NA = always run)
#   intern      : passed on to system(); TRUE captures the command output
#   quitOnError : when TRUE, terminate R (exit status 1) if the command failed
#
# Returns "" when the step is skipped, otherwise (invisibly) the value of
# system(): the exit status, or the captured output when intern = TRUE.
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
  if(!is.na(outputfile) && file.exists(outputfile)){
    cat("Output for step exists, skipping this step\n")
    return("")
  }
  cat("----", x, "\n")
  res <- system(x, intern = intern)
  cat(">>>>", res[1], "\n")

  # With intern = TRUE, res is the captured output and the exit status is only
  # available as its "status" attribute (not set when the command succeeded).
  # Comparing the output text itself against 1 would misreport the result
  status <- if(intern) attr(res, "status") else res
  failed <- !is.null(status) && (is.na(status[1]) || status[1] >= 1)
  if(failed){
    cat("Error external process did not finish\n\n")
    if(quitOnError) q("no", status = 1)
  }
  invisible(res)
}
input.dir <- "/home/danny/data/raw"

# SRA run accession (e.g. "SRR13978643"), taken from the command line
input.base <- cmdlineargs[1]
# Without an accession every path and command below would be built from "NA"
if(is.na(input.base) || !nzchar(input.base)){
  stop("No sample given, usage: Rscript <script> <SRA run accession>", call. = FALSE)
}

output.dir <- paste0("/home/danny/data/output/", input.base, ".aln")
genome.path <- "/home/danny/genome/STAR"
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"

# Create the input and output folders when they do not exist yet
if(!dir.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
if(!dir.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }
# STEP 0 - SRA Download and Compress
# Runs inside the raw data folder, all file names below are relative to it
setwd(input.dir)

# Download the run; skipped when the compressed first mate is already there
execute(
  paste0("fasterq-dump ", input.base),
  outputfile = paste0(input.base, "_1.fastq.gz")
)
# Compress both mate files, each one skipped when its .gz already exists
execute(
  paste0("bgzip ", input.base, "_1.fastq"),
  outputfile = paste0(input.base, "_1.fastq.gz")
)
execute(
  paste0("bgzip ", input.base, "_2.fastq"),
  outputfile = paste0(input.base, "_2.fastq.gz")
)
# STEP 1 - READ Trimming
# File order on the Trimmomatic command line: the two inputs (mate 1, mate 2)
# from the raw folder, then the four outputs (1.P, 1.U, 2.P, 2.U) in the output folder
trim.files <- c(
  paste0(input.dir, "/", input.base, c("_1", "_2"), ".fastq.gz"),
  paste0(output.dir, "/", input.base, c("_1.P", "_1.U", "_2.P", "_2.U"), ".fastq.gz")
)
trim.path <- "/home/danny/software/Trimmomatic"
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
trim.opts <- paste0(
  "ILLUMINACLIP:", trim.path, "/adapters/TruSeq3-PE-2.fa:2:30:10",
  " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"
)
trim.cmd <- paste(trim.exec, "PE", paste(trim.files, collapse = " "), trim.opts)
# Skipped when the first output (1.P) already exists
execute(trim.cmd, outputfile = trim.files[3])

# STEP 1.1 - UNZIP for STAR
# Uncompressed names of the two .P files (elements 3 and 5), the input for the alignment
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])
execute(paste0("gunzip -k ", trim.files[3]), outputfile = files.in[1])
execute(paste0("gunzip -k ", trim.files[5]), outputfile = files.in[2])
# STEP 2 - Alignment using STAR
# STAR output is written under <output.dir>/<run> as file name prefix
star.outbase <- paste0(output.dir, "/", input.base)
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")

star.exec <- "STAR --runMode alignReads"
star.in <- paste("--readFilesIn", paste(files.in, collapse = " "))
star.opts <- paste0("--genomeDir=", genome.path, " --outSAMtype BAM SortedByCoordinate")
star.out <- paste("--outFileNamePrefix", star.outbase)
star.cmd <- paste(star.exec, star.in, star.opts, star.out)
# Skipped when the sorted BAM already exists
execute(star.cmd, outputfile = star.bam)

# STEP 2.1 - Create a samtools index
execute(paste0("samtools index ", star.bam), outputfile = paste0(star.bam, ".bai"))

# STEP 2.2 - Create mapping and coverage statistics
# No output file is given, so these two always run
execute(paste0("samtools flagstats ", star.bam))
execute(paste0("samtools coverage ", star.bam))
# STEP 3 - Remove duplicate reads using picard tools
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
metrics.out <- paste0(star.outbase, "_metrics.txt")

p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
p.opts <- "--REMOVE_DUPLICATES true"
p.in <- paste("-I", star.bam)
p.out <- paste("-O", p.bam, "-M", metrics.out)
p.cmd <- paste(p.exec, "MarkDuplicates", p.opts, p.in, p.out)
# Skipped when the de-duplicated BAM already exists
execute(p.cmd, outputfile = p.bam)

# STEP 3.1 - Create a samtools index
execute(paste0("samtools index ", p.bam), outputfile = paste0(p.bam, ".bai"))

# STEP 3.2 - Create mapping and coverage statistics
# No output file is given, so these two always run
execute(paste0("samtools flagstats ", p.bam))
execute(paste0("samtools coverage ", p.bam))
# STEP 4 - Add read group (1) and sample run, library, and name
# Library (-LB) is the run accession without its "SRR" prefix, sample (-SM) is the accession
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
# Like every other step that produces a BAM, skip it when the output exists.
# Otherwise the BAM is rewritten on each run while its index (STEP 4.1) is
# skipped because the .bai from the previous run is still there
execute(p.cmd, rg.bam)

# STEP 4.1 - Create a samtools index
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))
# STEP 5 - GATK prep
# Reference genome and known variant sites, shared by both BaseRecalibrator runs
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)

# STEP 5.1 - GATK BaseRecalibrator (on the read-group BAM)
gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
execute(gatk.cmd, gatk.cov1)

# STEP 5.2 - GATK ApplyBQSR (writes the recalibrated BAM)
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
execute(gatk.cmd, recal.bam)

# STEP 5.3 - GATK BaseRecalibrator (second pass, on the recalibrated BAM)
gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
execute(gatk.cmd, gatk.cov2)

# STEP 5.4 - GATK AnalyzeCovariates (before/after comparison plot)
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
# Skip when the plot exists, consistent with the other file-producing steps
execute(gatk.cmd, recal.plot)
# STEP 6 - Index the recalibrated bam files
# Skipped when the .bai index already exists
execute(
  paste0("samtools index ", recal.bam),
  outputfile = paste0(recal.bam, ".bai")
)

# STEP 6.1 - Create mapping and coverage statistics
# No output file is given, so these two always run
execute(paste0("samtools flagstats ", recal.bam))
execute(paste0("samtools coverage ", recal.bam))

# Done, leave R without saving the workspace
q("no")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the packages used by the analysis scripts

# BiocManager is only called through BiocManager::install() below, so test
# whether it is installed without attaching it to the search path
if(!requireNamespace("BiocManager", quietly = TRUE)){
  install.packages("BiocManager")
}

# Bioconductor packages
BiocManager::install(
  c("GenomicAlignments", "GenomicFeatures",
    "Rsamtools", "biomaRt", "preprocessCore")
)

# CRAN packages
install.packages(c("vioplot", "RColorBrewer"))
Author
DannyArends
commented
Oct 23, 2023
via email
That is weird, because if anything goes wrong it would throw an error e.g.
"bgzip not found" or such. The fasterq-dump should not take too long, since
the data is reasonably small, 30 minutes should be enough to download them.
Did both files SRR13978644_1.fastq & SRR13978644_2.fastq download ?
Danny
Op zo 22 okt 2023 om 14:04 schreef jaypaty ***@***.***>:
… ***@***.**** commented on this gist.
------------------------------
Hi Danny!
Firstly, thanks a lot for the enriching, informative, and engaging
content. I learnt a lot from your videos.
Everything worked for the 1st SRR data like you showed. Then I was
executing the pipeline via the cmd line for the second SRR data, something
like you showed: Rscript --vanilla pipeline_bulk.r SRR13978644, however it
got stuck here and could not proceed with the compression in step 0. It
kept running without returning any errors.
In my output folder I saw that a new fastrq folder was created with 12
files in it. However no compression happened.
Terminal got stuck with ---- fasterq-dump SRR13978644 for hours...
Any help would be greatly appreciated! Thank you so much for all your time
and inputs.
—
Reply to this email directly, view it on GitHub
<https://gist.github.com/DannyArends/c70f21208438cd1305162f25435922f7#gistcomment-4734631>
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AAALWHSW7QJ23QAG3DX7EDLYAUKXPBFKMF2HI4TJMJ2XIZLTSKBKK5TBNR2WLJDHNFZXJJDOMFWWLK3UNBZGKYLEL52HS4DFQKSXMYLMOVS2I5DSOVS2I3TBNVS3W5DIOJSWCZC7OBQXE5DJMNUXAYLOORPWCY3UNF3GS5DZVRZXKYTKMVRXIX3UPFYGLK2HNFZXIQ3PNVWWK3TUUZ2G64DJMNZZDAVEOR4XAZNEM5UXG5FFOZQWY5LFVEYTCOJVGQ3TQNZQU52HE2LHM5SXFJTDOJSWC5DF>
.
You are receiving this email because you authored the thread.
Triage notifications on the go with GitHub Mobile for iOS
<https://apps.apple.com/app/apple-store/id1477376905?ct=notification-email&mt=8&pt=524675>
or Android
<https://play.google.com/store/apps/details?id=com.github.android&referrer=utm_campaign%3Dnotification-email%26utm_medium%3Demail%26utm_source%3Dgithub>
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment