DannyArends · May 14, 2024 14:01 · DannyArends · Oct 23, 2023
diff --git a/1_expression.R b/1_expression.R
 #
 # Call expression from aligned SRA reads
 # copyright (c) 2022 - Danny Arends
 #

 library("GenomicAlignments")
 library("GenomicFeatures")
 library("Rsamtools")
 library("preprocessCore")
 library("vioplot")
 library("RColorBrewer")
 library("biomaRt")

 # Work from the shared root so the relative genome path below resolves
 setwd("c:/Shared/")

 # Build the transcript database from the yeast GTF annotation
 db <- makeTxDbFromGFF(
   file = "genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf",
   format = "gtf",
   dataSource = "https://ftp.ensembl.org/pub/release-108/",
   organism = "Saccharomyces"
 )

 # Exons grouped per gene; the length of a gene in bp is the summed width of
 # its exons after reduce()
 exons <- exonsBy(db, by = "gene")
 reduced.exon.bp <- function(gene.exons){
   sum(width(reduce(gene.exons)))
 }
 gene.lengths <- lapply(exons, reduced.exon.bp)

 # Everything below reads from / writes to the output folder
 setwd("c:/Shared/output")

 # Samples (SRA run accessions) and their simple names
 samples <- c("SRR13978640", "SRR13978641", "SRR13978642", "SRR13978643", "SRR13978644", "SRR13978645")
 names(samples) <- c("SPRC_1", "SPRC_2", "SPRC_3", "CTRL_1", "CTRL_2", "CTRL_3")

 # Expected location of the final (recalibrated) BAM file of every sample
 bam.paths <- paste0(samples, ".aln/", samples, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
 bam.found <- file.exists(bam.paths)

 # The rest of this script addresses the samples by column position (1:3 and
 # 4:6), so report missing BAM files instead of dropping them silently
 if(!all(bam.found)){
   warning("No BAM file found for: ", paste(samples[!bam.found], collapse = ", "), call. = FALSE)
 }

 # Keep only the BAM files that exist
 files <- bam.paths[bam.found]

 # Load in the BAM files (paired-end, read in chunks of 100000 records)
 bams <- BamFileList(files, yieldSize = 100000, asMates = TRUE)

 # Overlap BAM reads and genes
 overlap <- summarizeOverlaps(exons, bams, mode = "Union", singleEnd = FALSE,
                              ignore.strand = TRUE, fragments = TRUE)

 # Extract the raw-reads per gene
 readcount <- assay(overlap)

 # Strip the BAM suffix from the column names; fixed = TRUE makes the dots in
 # the suffix match literally instead of acting as regex wildcards
 colnames(readcount) <- gsub("Aligned.sortedByCoord.RD.RG.RC.out.bam", "", colnames(readcount), fixed = TRUE)
 write.table(readcount, "readcount.raw.txt", sep = "\t", quote = FALSE)

 # Calculate the RPKM values per gene
 # RPKM = (10^9 * C)/(N * L)
 # C = Number of reads mapped to a gene
 # N = Total mapped reads in the sample
 # L = gene length in base-pairs for a gene

 # Get the total number of reads per sample (colSums returns doubles, so large
 # libraries cannot overflow the integer range)
 N <- colSums(readcount)

 # Gene lengths, matched to the rows of readcount by gene ID rather than by
 # position
 L <- as.numeric(unlist(gene.lengths)[rownames(readcount)])
 if(anyNA(L)){
   stop("No gene length available for: ", paste(rownames(readcount)[is.na(L)], collapse = ", "), call. = FALSE)
 }

 # outer(L, N)[i, j] = L[i] * N[j], so all genes and samples are computed at
 # once; this also keeps the genes x samples shape when there is one sample
 RPKM <- round((10^9 * readcount) / outer(L, N), digits = 1)

 # Colours per sample: columns 1-3 are drawn as SPRC, columns 4-6 as CTRL
 group.cols <- rep(c("orange", "lightblue"), each = 3)
 group.fill <- c("orange", "lightblue")
 group.labs <- c("SPRC", "CTRL")

 # Violin distribution plot of the RPKM values (bottom margin of 8 lines)
 op <- par(mar = c(8,4,2,2))
 vioplot(RPKM, col = group.cols, ylab = "reads", las = 2)
 legend("topleft", legend = group.labs, fill = group.fill)

 write.table(RPKM, file = "RPKM.txt", quote = FALSE, sep = "\t")

 # Quantile normalization of RPKM values, row / column names taken from RPKM
 RPKM.norm <- normalize.quantiles(as.matrix(RPKM))
 RPKM.norm <- round(RPKM.norm, digits = 1)
 rownames(RPKM.norm) <- rownames(RPKM)
 colnames(RPKM.norm) <- colnames(RPKM)

 # Violin distribution plot of the normalized values
 vioplot(RPKM.norm, col = group.cols, ylab = "normalized", las = 2)
 legend("topleft", legend = group.labs, fill = group.fill)

 write.table(RPKM.norm, file = "RPKM.norm.txt", quote = FALSE, sep = "\t")

 # LOG2 Qnorm RPKM (so we can treat it as microarray data), negative values
 # (including -Inf from log2(0)) are set to 0
 RPKM.l2 <- round(log2(RPKM.norm), digits = 1)
 below.zero <- RPKM.l2 < 0
 RPKM.l2[below.zero] <- 0

 # Violin distribution plot of the log2 values
 vioplot(RPKM.l2, col = group.cols, ylab = "log2(normalize)", las = 2)
 legend("topleft", legend = group.labs, fill = group.fill)

 write.table(RPKM.l2, file = "RPKM.norm.log2.txt", quote = FALSE, sep = "\t")

 # P-values and Log2 fold change (columns 1-3 = SPRC, columns 4-6 = CTRL)
 # t.test() fails on e.g. constant data, such genes get a P-value of NA
 pvals <- apply(RPKM.l2, 1, function(x){
   tryCatch(t.test(x[1:3], x[4:6])$p.value, error = function(e){ return(NA) })
 })

 # RPKM.l2 is already on the log2 scale, so the log2 fold change is the
 # difference between the group means. Taking log2() of the ratio of two log2
 # means is not a fold change and yields NaN / Inf when a mean is 0
 fc <- apply(RPKM.l2, 1, function(x){
   mean(x[1:3]) - mean(x[4:6])
 })

 # Assign colors based on P-values, the most significant class is assigned last
 colz <- rep("black", length(pvals))
 colz[which(pvals < 5e-2)] <- "red"
 colz[which(pvals < 1e-2)] <- "gold"
 colz[which(pvals < 1e-3)] <- "blue"

 # Volcano plot (x = fc, y = -log10(P-values))
 plot(fc, -log10(pvals), col = colz, pch = 18, main = "Volcano plot",
      xlab = "log2 Fold Change", ylab = "-log10(P-value)")
 legend("topleft", pch = 18, c("<0.05", "<0.01", "<0.001"), col = c("red", "gold", "blue"))

 # Order the rows of a matrix by hierarchical clustering; hclust() needs at
 # least two rows, smaller matrices are returned as they are
 cluster.rows <- function(m){
   if(nrow(m) < 2) return(m)
   m[hclust(dist(m))$order, , drop = FALSE]
 }

 # Down & Up regulated genes (drop = FALSE keeps a matrix when one gene is left)
 down <- RPKM.l2[which(pvals < 5e-2 & fc < -0.3), , drop = FALSE]
 dclust <- cluster.rows(down)
 up <- RPKM.l2[which(pvals < 5e-2 & fc > 0.3), , drop = FALSE]
 uclust <- cluster.rows(up)

 # Gene IDs of up/Down regulated genes
 geneIDs <- c(rownames(dclust), rownames(uclust))

 # Custom heatmap using the spectral colors
 op <- par(mar = c(8, 6, 2,1))
 if(length(geneIDs) > 0){
   image(x = seq_len(ncol(RPKM.l2)),
         y = seq_along(geneIDs),
         z = t(rbind(dclust, uclust)),
         xaxt = 'n', yaxt = 'n', xlab = "", ylab = "",
         col = brewer.pal(11, "Spectral"))

   axis(2, at = seq_along(geneIDs), labels = geneIDs, las = 2, cex.axis = 0.7)
   axis(1, at = 1:3, labels = colnames(RPKM.l2)[1:3], las = 2, col.axis = "orange")
   axis(1, at = 4:6, labels = colnames(RPKM.l2)[4:6], las = 2, col.axis = "blue")
 }else{
   cat("No up or down regulated genes found, skipping the heatmap\n")
 }

 # Apply a summary statistic to both groups (columns 1-3 and 4-6) of every
 # gene in RPKM.l2, rounded to 1 digit; one row per gene, one column per group
 group.summary <- function(stat){
   t(apply(RPKM.l2, 1, function(expr){
     tryCatch(round(c(stat(expr[1:3]), stat(expr[4:6])), 1), error = function(e){ return(NA) })
   }))
 }

 # Mean expression and standard deviation per group
 means <- group.summary(mean)
 sds <- group.summary(sd)

 group.names <- c("SPRC", "CTRL")
 colnames(means) <- group.names
 colnames(sds) <- group.names

 # Create an overview table: mean and SD per group, fold change and P-value
 overview <- cbind(means[, "CTRL"], sds[, "CTRL"],
                   means[, "SPRC"], sds[, "SPRC"],
                   round(fc, 1), round(pvals, 6))
 colnames(overview) <- c("CTRL", "CTRL(SD)", "SPRC", "SPRC(SD)", "FC", "P")

 # Show the first 10 genes
 overview[1:10,]

 # Use biomaRt to retrieve gene names, location, and description
 library(biomaRt)
 bio.mart <- useMart("ensembl", "scerevisiae_gene_ensembl")

 mattr <- c("ensembl_gene_id", "external_gene_name",
            "chromosome_name", "start_position", "end_position",
            "description")

 res.bm <- getBM(attributes = mattr,
                 filters = c("ensembl_gene_id"),
                 values = geneIDs, mart = bio.mart)

 # Row names have to be unique, so keep the first record of every gene
 res.bm <- res.bm[!duplicated(res.bm[, "ensembl_gene_id"]), , drop = FALSE]
 rownames(res.bm) <- res.bm[, "ensembl_gene_id"]

 # Merge biomaRt results with the overview; genes without a biomaRt record get
 # NA annotation. The description column is named here, so it does not depend
 # on a hard-coded column position
 p1 <- res.bm[geneIDs, c("external_gene_name", "chromosome_name", "start_position", "end_position")]
 overview <- cbind(p1, overview[geneIDs, , drop = FALSE], Description = res.bm[geneIDs, "description"])
 colnames(overview)[1:4] <- c("GeneName", "Chr", "Start", "End")
 rownames(overview) <- geneIDs

 # Show (at most) the first 10 genes, without the description
 head(overview[, 1:10], 10)

 # Write out the table
 write.table(overview, "overview.ann.txt", sep = "\t", quote = FALSE)
diff --git a/2_Updated_pipeline.R b/2_Updated_pipeline.R
 #
 # Align SRA reads to the Saccharomyces Cerevisiae genome
 # copyright (c) 2022 - Danny Arends
 #

 # Read the sample from the commandline
 cmdlineargs <- commandArgs(trailingOnly = TRUE)

 # Run an external command through system(), unless its output already exists
 #
 # x           : the command line to run
 # outputfile  : file produced by this step, when it exists the step is skipped
 # intern      : passed on to system(), TRUE captures the output of the command
 # quitOnError : when TRUE, quit R with exit status 1 if the command failed
 #
 # Returns "" when the step was skipped, otherwise (invisibly) the result of
 # system(): the exit status, or the captured output when intern = TRUE
 execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
   if(!is.na(outputfile) && file.exists(outputfile)){
     cat("Output for step exists, skipping this step\n")
     return("")
   }
   cat("----", x, "\n")
   res <- system(x, intern = intern)
   cat(">>>>", res[1], "\n")

   # With intern = TRUE res holds the captured output, a non-zero exit status
   # is then only available through the "status" attribute
   status <- if(intern) attr(res, "status") else res[1]
   if(is.null(status)) status <- 0

   if(status != 0){
     cat("Error external process did not finish\n\n")
     # Exit with a non-zero status so calling scripts can detect the failure
     if(quitOnError) q("no", status = 1)
   }
   invisible(res)
 }

 # The sample (SRA run accession) is required, without it every path below
 # would silently be built from NA
 if(length(cmdlineargs) < 1 || is.na(cmdlineargs[1]) || !nzchar(cmdlineargs[1])){
   stop("No sample specified, pass an SRA run accession (e.g. SRR13978643) as the first argument", call. = FALSE)
 }

 input.dir <- "/home/danny/data/raw"
 input.base <- cmdlineargs[1] #"SRR13978643"  # Now from the command line
 output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
 genome.path <- "/home/danny/genome/STAR"
 ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
 ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"

 # Create the input and output folders when they do not exist yet
 if(!dir.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
 if(!dir.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }

 # STEP 0 - SRA Download and Compress
 setwd(input.dir)

 # Compressed fastq files of read 1 and read 2, each step is skipped when its
 # compressed file is already there
 fastq.gz <- paste0(input.base, "_", 1:2, ".fastq.gz")

 execute(paste0("fasterq-dump ", input.base), fastq.gz[1])
 execute(paste0("bgzip ", input.base, "_1.fastq"), fastq.gz[1])
 execute(paste0("bgzip ", input.base, "_2.fastq"), fastq.gz[2])

 # STEP 1 - READ Trimming
 # Files 1-2 are the raw reads, files 3-6 the paired (P) and unpaired (U)
 # output for read 1 and read 2
 trim.files <- c(
   paste0(input.dir, "/", input.base, c("_1", "_2"), ".fastq.gz"),
   paste0(output.dir, "/", input.base, c("_1.P", "_1.U", "_2.P", "_2.U"), ".fastq.gz")
 )
 trim.path <- "/home/danny/software/Trimmomatic"
 trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
 trim.clip <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
 trim.opts <- paste0(trim.clip, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
 trim.cmd <- paste(trim.exec, "PE", paste(trim.files, collapse = " "), trim.opts)

 # Skipped when the paired output of read 1 exists
 execute(trim.cmd, trim.files[3])

 # STEP 1.1 - UNZIP for STAR
 # Uncompressed names of the two paired files, gunzip -k keeps the .gz files
 files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])

 execute(paste0("gunzip -k ", trim.files[3]), files.in[1])
 execute(paste0("gunzip -k ", trim.files[5]), files.in[2])

 # STEP 2 - Alignment using STAR
 # STAR output is prefixed with <output.dir>/<sample>
 star.outbase <- paste0(output.dir, "/", input.base)
 star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")

 star.exec <- "STAR --runMode alignReads"
 star.in <- paste("--readFilesIn", paste(files.in, collapse = " "))
 star.opts <- paste0("--genomeDir=", genome.path, " --outSAMtype BAM SortedByCoordinate")
 star.out <- paste("--outFileNamePrefix", star.outbase)
 star.cmd <- paste(star.exec, star.in, star.opts, star.out)

 execute(star.cmd, star.bam)

 # STEP 2.1 - Create a samtools index
 star.bai <- paste0(star.bam, ".bai")
 execute(paste0("samtools index ", star.bam), star.bai)

 # STEP 2.2 - Create mapping and coverage statistics
 for(stats.cmd in c("samtools flagstats ", "samtools coverage ")){
   execute(paste0(stats.cmd, star.bam))
 }

 # STEP 3 - Remove duplicate reads using picard tools
 p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
 metrics.out <- paste0(star.outbase, "_metrics.txt")

 # Assemble the MarkDuplicates call: options, input BAM, output BAM and metrics
 p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
 p.opts <- "--REMOVE_DUPLICATES true"
 p.in <- paste("-I", star.bam)
 p.out <- paste("-O", p.bam, "-M", metrics.out)
 p.cmd <- paste(p.exec, "MarkDuplicates", p.opts, p.in, p.out)

 execute(p.cmd, p.bam)

 # STEP 3.1 - Create a samtools index
 p.bai <- paste0(p.bam, ".bai")
 execute(paste0("samtools index ", p.bam), p.bai)

 # STEP 3.2 - Create mapping and coverage statistics
 for(stats.cmd in c("samtools flagstats ", "samtools coverage ")){
   execute(paste0(stats.cmd, p.bam))
 }

 # STEP 4 - Add read group (1) and sample run, library, and name
 # The library is the run accession without its SRR prefix, the sample name is
 # the run accession itself
 rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
 rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
 p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
 # Pass the output file, so this step is skipped on a re-run like the other steps
 execute(p.cmd, rg.bam)

 # STEP 4.1 - Create a samtools index
 execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))

 # STEP 5 - GATK prep
 gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
 gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)

 # STEP 5.1 - GATK BaseRecalibrator (on the read group BAM)
 gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
 gatk.cmd  <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
 execute(gatk.cmd, gatk.cov1)

 # STEP 5.2 - GATK ApplyBQSR
 recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
 gatk.cmd  <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
 execute(gatk.cmd, recal.bam)

 # STEP 5.3 - GATK BaseRecalibrator (second pass, on the recalibrated BAM)
 gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
 gatk.cmd  <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
 execute(gatk.cmd, gatk.cov2)

 # STEP 5.4 - GATK AnalyzeCovariates
 recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
 gatk.cmd  <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, "  -plots ", recal.plot)
 # Pass the plot file, so this step is skipped on a re-run like the other steps
 execute(gatk.cmd, recal.plot)

 # STEP 6 - Index the recalibrated bam files
 recal.bai <- paste0(recal.bam, ".bai")
 execute(paste0("samtools index ", recal.bam), recal.bai)

 # STEP 6.1 - Create mapping and coverage statistics
 for(stats.cmd in c("samtools flagstats ", "samtools coverage ")){
   execute(paste0(stats.cmd, recal.bam))
 }

 # Done, quit without saving the workspace
 q("no")
diff --git a/install.R b/install.R
 # Install BiocManager when it is not available; requireNamespace() only checks
 # for the package, it does not attach it to the search path
 if(!requireNamespace("BiocManager", quietly = TRUE)){
   install.packages("BiocManager")
 }

 # Bioconductor packages used by the expression script
 BiocManager::install(
   c("GenomicAlignments", "GenomicFeatures",
     "Rsamtools", "biomaRt", "preprocessCore")
 )

 # CRAN packages used for plotting
 install.packages(c("vioplot", "RColorBrewer"))
	#
	# Call expression from aligned SRA reads
	# copyright (c) 2022 - Danny Arends
	#

	library("GenomicAlignments")
	library("GenomicFeatures")
	library("Rsamtools")
	library("preprocessCore")
	library("vioplot")
	library("RColorBrewer")
	library("biomaRt")

	# Work from the shared root so the relative genome path below resolves
	setwd("c:/Shared/")

	# Build the transcript database from the yeast GTF annotation
	db <- makeTxDbFromGFF(
	  file = "genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf",
	  format = "gtf",
	  dataSource = "https://ftp.ensembl.org/pub/release-108/",
	  organism = "Saccharomyces"
	)

	# Exons grouped per gene; the length of a gene in bp is the summed width of
	# its exons after reduce()
	exons <- exonsBy(db, by = "gene")
	reduced.exon.bp <- function(gene.exons){
	  sum(width(reduce(gene.exons)))
	}
	gene.lengths <- lapply(exons, reduced.exon.bp)

	# Everything below reads from / writes to the output folder
	setwd("c:/Shared/output")

	# Samples (SRA run accessions) and their simple names
	samples <- c("SRR13978640", "SRR13978641", "SRR13978642", "SRR13978643", "SRR13978644", "SRR13978645")
	names(samples) <- c("SPRC_1", "SPRC_2", "SPRC_3", "CTRL_1", "CTRL_2", "CTRL_3")

	# Expected location of the final (recalibrated) BAM file of every sample
	bam.paths <- paste0(samples, ".aln/", samples, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
	bam.found <- file.exists(bam.paths)

	# The rest of this script addresses the samples by column position (1:3 and
	# 4:6), so report missing BAM files instead of dropping them silently
	if(!all(bam.found)){
	  warning("No BAM file found for: ", paste(samples[!bam.found], collapse = ", "), call. = FALSE)
	}

	# Keep only the BAM files that exist
	files <- bam.paths[bam.found]

	# Load in the BAM files (paired-end, read in chunks of 100000 records)
	bams <- BamFileList(files, yieldSize = 100000, asMates = TRUE)

	# Overlap BAM reads and genes
	overlap <- summarizeOverlaps(exons, bams, mode = "Union", singleEnd = FALSE,
	                             ignore.strand = TRUE, fragments = TRUE)

	# Extract the raw-reads per gene
	readcount <- assay(overlap)

	# Strip the BAM suffix from the column names; fixed = TRUE makes the dots in
	# the suffix match literally instead of acting as regex wildcards
	colnames(readcount) <- gsub("Aligned.sortedByCoord.RD.RG.RC.out.bam", "", colnames(readcount), fixed = TRUE)
	write.table(readcount, "readcount.raw.txt", sep = "\t", quote = FALSE)

	# Calculate the RPKM values per gene
	# RPKM = (10^9 * C)/(N * L)
	# C = Number of reads mapped to a gene
	# N = Total mapped reads in the sample
	# L = gene length in base-pairs for a gene

	# Get the total number of reads per sample (colSums returns doubles, so large
	# libraries cannot overflow the integer range)
	N <- colSums(readcount)

	# Gene lengths, matched to the rows of readcount by gene ID rather than by
	# position
	L <- as.numeric(unlist(gene.lengths)[rownames(readcount)])
	if(anyNA(L)){
	  stop("No gene length available for: ", paste(rownames(readcount)[is.na(L)], collapse = ", "), call. = FALSE)
	}

	# outer(L, N)[i, j] = L[i] * N[j], so all genes and samples are computed at
	# once; this also keeps the genes x samples shape when there is one sample
	RPKM <- round((10^9 * readcount) / outer(L, N), digits = 1)

	# Colours per sample: columns 1-3 are drawn as SPRC, columns 4-6 as CTRL
	group.cols <- rep(c("orange", "lightblue"), each = 3)
	group.fill <- c("orange", "lightblue")
	group.labs <- c("SPRC", "CTRL")

	# Violin distribution plot of the RPKM values (bottom margin of 8 lines)
	op <- par(mar = c(8,4,2,2))
	vioplot(RPKM, col = group.cols, ylab = "reads", las = 2)
	legend("topleft", legend = group.labs, fill = group.fill)

	write.table(RPKM, file = "RPKM.txt", quote = FALSE, sep = "\t")

	# Quantile normalization of RPKM values, row / column names taken from RPKM
	RPKM.norm <- normalize.quantiles(as.matrix(RPKM))
	RPKM.norm <- round(RPKM.norm, digits = 1)
	rownames(RPKM.norm) <- rownames(RPKM)
	colnames(RPKM.norm) <- colnames(RPKM)

	# Violin distribution plot of the normalized values
	vioplot(RPKM.norm, col = group.cols, ylab = "normalized", las = 2)
	legend("topleft", legend = group.labs, fill = group.fill)

	write.table(RPKM.norm, file = "RPKM.norm.txt", quote = FALSE, sep = "\t")

	# LOG2 Qnorm RPKM (so we can treat it as microarray data), negative values
	# (including -Inf from log2(0)) are set to 0
	RPKM.l2 <- round(log2(RPKM.norm), digits = 1)
	below.zero <- RPKM.l2 < 0
	RPKM.l2[below.zero] <- 0

	# Violin distribution plot of the log2 values
	vioplot(RPKM.l2, col = group.cols, ylab = "log2(normalize)", las = 2)
	legend("topleft", legend = group.labs, fill = group.fill)

	write.table(RPKM.l2, file = "RPKM.norm.log2.txt", quote = FALSE, sep = "\t")

	# P-values and Log2 fold change (columns 1-3 = SPRC, columns 4-6 = CTRL)
	# t.test() fails on e.g. constant data, such genes get a P-value of NA
	pvals <- apply(RPKM.l2, 1, function(x){
	  tryCatch(t.test(x[1:3], x[4:6])$p.value, error = function(e){ return(NA) })
	})

	# RPKM.l2 is already on the log2 scale, so the log2 fold change is the
	# difference between the group means. Taking log2() of the ratio of two log2
	# means is not a fold change and yields NaN / Inf when a mean is 0
	fc <- apply(RPKM.l2, 1, function(x){
	  mean(x[1:3]) - mean(x[4:6])
	})

	# Assign colors based on P-values, the most significant class is assigned last
	colz <- rep("black", length(pvals))
	colz[which(pvals < 5e-2)] <- "red"
	colz[which(pvals < 1e-2)] <- "gold"
	colz[which(pvals < 1e-3)] <- "blue"

	# Volcano plot (x = fc, y = -log10(P-values))
	plot(fc, -log10(pvals), col = colz, pch = 18, main = "Volcano plot",
	     xlab = "log2 Fold Change", ylab = "-log10(P-value)")
	legend("topleft", pch = 18, c("<0.05", "<0.01", "<0.001"), col = c("red", "gold", "blue"))

	# Order the rows of a matrix by hierarchical clustering; hclust() needs at
	# least two rows, smaller matrices are returned as they are
	cluster.rows <- function(m){
	  if(nrow(m) < 2) return(m)
	  m[hclust(dist(m))$order, , drop = FALSE]
	}

	# Down & Up regulated genes (drop = FALSE keeps a matrix when one gene is left)
	down <- RPKM.l2[which(pvals < 5e-2 & fc < -0.3), , drop = FALSE]
	dclust <- cluster.rows(down)
	up <- RPKM.l2[which(pvals < 5e-2 & fc > 0.3), , drop = FALSE]
	uclust <- cluster.rows(up)

	# Gene IDs of up/Down regulated genes
	geneIDs <- c(rownames(dclust), rownames(uclust))

	# Custom heatmap using the spectral colors
	op <- par(mar = c(8, 6, 2,1))
	if(length(geneIDs) > 0){
	  image(x = seq_len(ncol(RPKM.l2)),
	        y = seq_along(geneIDs),
	        z = t(rbind(dclust, uclust)),
	        xaxt = 'n', yaxt = 'n', xlab = "", ylab = "",
	        col = brewer.pal(11, "Spectral"))

	  axis(2, at = seq_along(geneIDs), labels = geneIDs, las = 2, cex.axis = 0.7)
	  axis(1, at = 1:3, labels = colnames(RPKM.l2)[1:3], las = 2, col.axis = "orange")
	  axis(1, at = 4:6, labels = colnames(RPKM.l2)[4:6], las = 2, col.axis = "blue")
	}else{
	  cat("No up or down regulated genes found, skipping the heatmap\n")
	}

	# Apply a summary statistic to both groups (columns 1-3 and 4-6) of every
	# gene in RPKM.l2, rounded to 1 digit; one row per gene, one column per group
	group.summary <- function(stat){
	  t(apply(RPKM.l2, 1, function(expr){
	    tryCatch(round(c(stat(expr[1:3]), stat(expr[4:6])), 1), error = function(e){ return(NA) })
	  }))
	}

	# Mean expression and standard deviation per group
	means <- group.summary(mean)
	sds <- group.summary(sd)

	group.names <- c("SPRC", "CTRL")
	colnames(means) <- group.names
	colnames(sds) <- group.names

	# Create an overview table: mean and SD per group, fold change and P-value
	overview <- cbind(means[, "CTRL"], sds[, "CTRL"],
	                  means[, "SPRC"], sds[, "SPRC"],
	                  round(fc, 1), round(pvals, 6))
	colnames(overview) <- c("CTRL", "CTRL(SD)", "SPRC", "SPRC(SD)", "FC", "P")

	# Show the first 10 genes
	overview[1:10,]

	# Use biomaRt to retrieve gene names, location, and description
	library(biomaRt)
	bio.mart <- useMart("ensembl", "scerevisiae_gene_ensembl")

	mattr <- c("ensembl_gene_id", "external_gene_name",
	           "chromosome_name", "start_position", "end_position",
	           "description")

	res.bm <- getBM(attributes = mattr,
	                filters = c("ensembl_gene_id"),
	                values = geneIDs, mart = bio.mart)

	# Row names have to be unique, so keep the first record of every gene
	res.bm <- res.bm[!duplicated(res.bm[, "ensembl_gene_id"]), , drop = FALSE]
	rownames(res.bm) <- res.bm[, "ensembl_gene_id"]

	# Merge biomaRt results with the overview; genes without a biomaRt record get
	# NA annotation. The description column is named here, so it does not depend
	# on a hard-coded column position
	p1 <- res.bm[geneIDs, c("external_gene_name", "chromosome_name", "start_position", "end_position")]
	overview <- cbind(p1, overview[geneIDs, , drop = FALSE], Description = res.bm[geneIDs, "description"])
	colnames(overview)[1:4] <- c("GeneName", "Chr", "Start", "End")
	rownames(overview) <- geneIDs

	# Show (at most) the first 10 genes, without the description
	head(overview[, 1:10], 10)

	# Write out the table
	write.table(overview, "overview.ann.txt", sep = "\t", quote = FALSE)
	#
	# Align SRA reads to the Saccharomyces Cerevisiae genome
	# copyright (c) 2022 - Danny Arends
	#

	# Read the sample from the commandline
	cmdlineargs <- commandArgs(trailingOnly = TRUE)

	# Run an external command through system(), unless its output already exists
	#
	# x           : the command line to run
	# outputfile  : file produced by this step, when it exists the step is skipped
	# intern      : passed on to system(), TRUE captures the output of the command
	# quitOnError : when TRUE, quit R with exit status 1 if the command failed
	#
	# Returns "" when the step was skipped, otherwise (invisibly) the result of
	# system(): the exit status, or the captured output when intern = TRUE
	execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
	  if(!is.na(outputfile) && file.exists(outputfile)){
	    cat("Output for step exists, skipping this step\n")
	    return("")
	  }
	  cat("----", x, "\n")
	  res <- system(x, intern = intern)
	  cat(">>>>", res[1], "\n")

	  # With intern = TRUE res holds the captured output, a non-zero exit status
	  # is then only available through the "status" attribute
	  status <- if(intern) attr(res, "status") else res[1]
	  if(is.null(status)) status <- 0

	  if(status != 0){
	    cat("Error external process did not finish\n\n")
	    # Exit with a non-zero status so calling scripts can detect the failure
	    if(quitOnError) q("no", status = 1)
	  }
	  invisible(res)
	}

	# The sample (SRA run accession) is required, without it every path below
	# would silently be built from NA
	if(length(cmdlineargs) < 1 || is.na(cmdlineargs[1]) || !nzchar(cmdlineargs[1])){
	  stop("No sample specified, pass an SRA run accession (e.g. SRR13978643) as the first argument", call. = FALSE)
	}

	input.dir <- "/home/danny/data/raw"
	input.base <- cmdlineargs[1] #"SRR13978643" # Now from the command line
	output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
	genome.path <- "/home/danny/genome/STAR"
	ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
	ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"

	# Create the input and output folders when they do not exist yet
	if(!dir.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
	if(!dir.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }

	# STEP 0 - SRA Download and Compress
	setwd(input.dir)

	# Compressed fastq files of read 1 and read 2, each step is skipped when its
	# compressed file is already there
	fastq.gz <- paste0(input.base, "_", 1:2, ".fastq.gz")

	execute(paste0("fasterq-dump ", input.base), fastq.gz[1])
	execute(paste0("bgzip ", input.base, "_1.fastq"), fastq.gz[1])
	execute(paste0("bgzip ", input.base, "_2.fastq"), fastq.gz[2])

	# STEP 1 - READ Trimming
	# Files 1-2 are the raw reads, files 3-6 the paired (P) and unpaired (U)
	# output for read 1 and read 2
	trim.files <- c(
	  paste0(input.dir, "/", input.base, c("_1", "_2"), ".fastq.gz"),
	  paste0(output.dir, "/", input.base, c("_1.P", "_1.U", "_2.P", "_2.U"), ".fastq.gz")
	)
	trim.path <- "/home/danny/software/Trimmomatic"
	trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
	trim.clip <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
	trim.opts <- paste0(trim.clip, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
	trim.cmd <- paste(trim.exec, "PE", paste(trim.files, collapse = " "), trim.opts)

	# Skipped when the paired output of read 1 exists
	execute(trim.cmd, trim.files[3])

	# STEP 1.1 - UNZIP for STAR
	# Uncompressed names of the two paired files, gunzip -k keeps the .gz files
	files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])

	execute(paste0("gunzip -k ", trim.files[3]), files.in[1])
	execute(paste0("gunzip -k ", trim.files[5]), files.in[2])

	# STEP 2 - Alignment using STAR
	# STAR output is prefixed with <output.dir>/<sample>
	star.outbase <- paste0(output.dir, "/", input.base)
	star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")

	star.exec <- "STAR --runMode alignReads"
	star.in <- paste("--readFilesIn", paste(files.in, collapse = " "))
	star.opts <- paste0("--genomeDir=", genome.path, " --outSAMtype BAM SortedByCoordinate")
	star.out <- paste("--outFileNamePrefix", star.outbase)
	star.cmd <- paste(star.exec, star.in, star.opts, star.out)

	execute(star.cmd, star.bam)

	# STEP 2.1 - Create a samtools index
	star.bai <- paste0(star.bam, ".bai")
	execute(paste0("samtools index ", star.bam), star.bai)

	# STEP 2.2 - Create mapping and coverage statistics
	for(stats.cmd in c("samtools flagstats ", "samtools coverage ")){
	  execute(paste0(stats.cmd, star.bam))
	}

	# STEP 3 - Remove duplicate reads using picard tools
	p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
	metrics.out <- paste0(star.outbase, "_metrics.txt")

	# Assemble the MarkDuplicates call: options, input BAM, output BAM and metrics
	p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
	p.opts <- "--REMOVE_DUPLICATES true"
	p.in <- paste("-I", star.bam)
	p.out <- paste("-O", p.bam, "-M", metrics.out)
	p.cmd <- paste(p.exec, "MarkDuplicates", p.opts, p.in, p.out)

	execute(p.cmd, p.bam)

	# STEP 3.1 - Create a samtools index
	p.bai <- paste0(p.bam, ".bai")
	execute(paste0("samtools index ", p.bam), p.bai)

	# STEP 3.2 - Create mapping and coverage statistics
	for(stats.cmd in c("samtools flagstats ", "samtools coverage ")){
	  execute(paste0(stats.cmd, p.bam))
	}

	# STEP 4 - Add read group (1) and sample run, library, and name
	# The library is the run accession without its SRR prefix, the sample name is
	# the run accession itself
	rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
	rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
	p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
	# Pass the output file, so this step is skipped on a re-run like the other steps
	execute(p.cmd, rg.bam)

	# STEP 4.1 - Create a samtools index
	execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))

	# STEP 5 - GATK prep
	gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
	gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)

	# STEP 5.1 - GATK BaseRecalibrator (on the read group BAM)
	gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
	gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
	execute(gatk.cmd, gatk.cov1)

	# STEP 5.2 - GATK ApplyBQSR
	recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
	gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
	execute(gatk.cmd, recal.bam)

	# STEP 5.3 - GATK BaseRecalibrator (second pass, on the recalibrated BAM)
	gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
	gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
	execute(gatk.cmd, gatk.cov2)

	# STEP 5.4 - GATK AnalyzeCovariates
	recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
	gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
	# Pass the plot file, so this step is skipped on a re-run like the other steps
	execute(gatk.cmd, recal.plot)

	# STEP 6 - Index the recalibrated bam files
	recal.bai <- paste0(recal.bam, ".bai")
	execute(paste0("samtools index ", recal.bam), recal.bai)

	# STEP 6.1 - Create mapping and coverage statistics
	for(stats.cmd in c("samtools flagstats ", "samtools coverage ")){
	  execute(paste0(stats.cmd, recal.bam))
	}

	# Done, quit without saving the workspace
	q("no")
	# Install BiocManager when it is not available; requireNamespace() only checks
	# for the package, it does not attach it to the search path
	if(!requireNamespace("BiocManager", quietly = TRUE)){
	  install.packages("BiocManager")
	}

	# Bioconductor packages used by the expression script
	BiocManager::install(
	  c("GenomicAlignments", "GenomicFeatures",
	    "Rsamtools", "biomaRt", "preprocessCore")
	)

	# CRAN packages used for plotting
	install.packages(c("vioplot", "RColorBrewer"))