DannyArends · July 21, 2025 00:08 · DannyArends · Dec 16, 2024
diff --git a/0_installSoftware.sh b/0_installSoftware.sh
 # Add yourself to the sudo group
 su -
 usermod -aG sudo danny
 exit

 # Install the virtual box guest additions
 cd /media/cdrom0
 sudo sh ./VBoxLinuxAdditions.run

 # Install R and deps
 sudo apt install r-base
 sudo nano /etc/apt/sources.list
 sudo apt install libssl-dev
 sudo apt install libxml2-dev
 sudo apt install libcurl4-openssl-dev

 # Install R packages
 sudo R
 if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
 BiocManager::install("GenomicFeatures")
 BiocManager::install("preprocessCore")
 q("no")

 # Install Trimmomatic
 sudo apt install git
 sudo apt install ant

 mkdir software
 cd software

 git clone https://github.com/usadellab/Trimmomatic.git
 cd Trimmomatic
 ant

 # Install STAR
 git clone https://github.com/alexdobin/STAR.git
 cd STAR/source
 # [UPDATE 2024] - Checkout an older version of STAR
 git checkout ee50484
 make

 # Install picard
 git clone https://github.com/broadinstitute/picard.git
 cd picard
 # [UPDATED 2024] - Checkout an older version of PICARD
 git checkout 5db8017
 ./gradlew shadowJar

 # Install HTSlib / samtools / bcftools
 sudo apt install autoconf
 git clone https://github.com/samtools/htslib.git
 git clone https://github.com/samtools/samtools.git
 git clone https://github.com/samtools/bcftools.git

 # Install HTSlib
 cd htslib
 git submodule update --init --recursive
 autoreconf -i
 ./configure
 make

 # Install samtools
 cd samtools
 autoheader
 autoconf -Wno-syntax
 ./configure
 make

 # Install bcftools
 cd bcftools
 autoheader
 autoconf -Wno-syntax
 ./configure
 make

 # GATK install
 wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip
 unzip gatk-4.2.6.1.zip

 # SRA
 wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz
 mkdir sratoolkit
 tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit
 ./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive

 # Make symbolic links
 mkdir bin
 cd bin
 ln -s /home/danny/software/STAR/source/STAR STAR
 ln -s /home/danny/software/htslib/bgzip bgzip
 ln -s /home/danny/software/samtools/samtools samtools
 ln -s /home/danny/software/bcftools/bcftools bcftools
 ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump

 #Update the bashrc file
 nano ~/.bashrc

 # Add at the end:
 export PATH="$HOME/bin:$PATH"

 # Additional link:
 ln -s /home/danny/software/htslib/tabix tabix
 # Additional R package:
 sudo R
 install.packages("ggplot2")
 install.packages("gplots")
 install.packages("gsalib")
 q("no")

 wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip
 unzip IGV_Linux_2.14.1_WithJava.zip
diff --git a/1_buildGenome.R b/1_buildGenome.R
 #
 # Download Saccharomyces Cerevisiae genome
 # copyright (c) 2022 - Danny Arends
 #

 uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/"
 base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome."
 chrs <- c(as.character(as.roman(seq(1:16))), "Mito")

 # Download
 for(chr in chrs){
  fname <- paste0(base, chr, ".fa.gz")

  # Download command
  cmd <- paste0("wget ", uri, fname)
  #cat(cmd, "\n")
  system(cmd)
 }


 # Create an empty the file
 cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
 for(chr in chrs){
  fname <- paste0(base, chr, ".fa.gz")
  # Extract and merge into a fast file
  cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
  #cat(cmd, "\n")
  system(cmd)
 }

 # Compress the fasta file using bgzip (keep original)
 cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
 #cat(cmd, "\n")
 system(cmd)

 # Delete the chromosomes
 for(chr in chrs){
  fname <- paste0(base, chr, ".fa.gz")
  # Extract and merge into a fast file
  cmd <- paste0("rm ", fname)
  #cat(cmd, "\n")
  system(cmd)
 }
diff --git a/2_prepGenome.sh b/2_prepGenome.sh
 # Get the reference transcriptome and GTF for STAR
 wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
 gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
 wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz

 # Index the genome using samtools
 samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz

 # Generate genome/transcriptome index using STAR
 STAR --runThreadN 2 --runMode genomeGenerate \
     --genomeDir ~/genome/STAR \
     --genomeSAindexNbases 10 \
     --sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \
     --genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa

 # Get the reference SNPs and index using tabix
 tabix saccharomyces_cerevisiae.vcf.gz

 # Index the genome using picard
 java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \
     -R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
diff --git a/3_pipeline.R b/3_pipeline.R
 #
 # Align SRA reads to the Saccharomyces Cerevisiae genome
 # copyright (c) 2022 - Danny Arends
 #

 execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
  if(!is.na(outputfile) && file.exists(outputfile)){
    cat("Output for step exists, skipping this step\n");
    invisible("")
  }
  cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
  if(res[1] >= 1){ 
    cat("Error external process did not finish\n\n");
    if(quitOnError) q("no")
  }
 }

 input.dir <- "/home/danny/data/raw"
 input.base <- "SRR13978643"  #Get from the command line
 output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
 genome.path <- "/home/danny/genome/STAR"
 ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
 ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"

 # Create an output folder
 if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
 if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }

 # STEP 0 - SRA Download and Compress
 setwd(input.dir)

 execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq"))
 execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
 execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))

 # STEP 1 - READ Trimming
 trim.files  <- c(
                  paste0(input.dir, "/", input.base,"_1.fastq.gz"),
                  paste0(input.dir, "/", input.base,"_2.fastq.gz"),
                  paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
                  paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
                  paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
                  paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
                )
 trim.path <- "/home/danny/software/Trimmomatic"
 trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
 trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
 trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
 trim.cmd  <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)

 execute(trim.cmd, trim.files[3])

 # STEP 1.1 - UNZIP for STAR
 execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
 execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))

 files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])

 # STEP 2 - Alignment using STAR
 star.outbase <- paste0(output.dir, "/", input.base)
 star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")

 star.exec <- "STAR --runMode alignReads"
 star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate")
 star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
 star.out <- paste0("--outFileNamePrefix ", star.outbase)
 star.cmd  <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)

 execute(star.cmd, star.bam)

 # STEP 2.1 - Create a samtools index
 execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
 # STEP 2.2 - Create mapping and coverage statistics
 execute(paste0("samtools flagstats ", star.bam))
 execute(paste0("samtools coverage ", star.bam))

 #STEP 3 - Remove duplicate reads using picard tools
 p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
 metrics.out <- paste0(star.outbase, "_metrics.txt")

 p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
 p.in <- paste0("-I ", star.bam)
 p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
 p.opts <- paste0("--REMOVE_DUPLICATES true")
 p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)

 execute(p.cmd, p.bam)

 # STEP 3.1 - Create a samtools index
 execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
 # STEP 3.2 - Create mapping and coverage statistics
 execute(paste0("samtools flagstats ", p.bam))
 execute(paste0("samtools coverage ", p.bam))

 # STEP 4 - Add read group (1) and sample run, library, and name
 rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
 rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
 p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
 execute(p.cmd)

 # STEP 4.1 - Create a samtools index
 execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))

 # STEP 5 - GATK prep
 gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
 gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)

 # STEP 5.1 - GATK BaseRecalibrator
 gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
 gatk.cmd  <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
 execute(gatk.cmd, gatk.cov1)

 # STEP 5.2 - GATK ApplyBQSR
 recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
 gatk.cmd  <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
 execute(gatk.cmd, recal.bam)

 # STEP 5.3 - GATK BaseRecalibrator
 gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
 gatk.cmd  <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
 execute(gatk.cmd, gatk.cov2)

 # STEP 5.4 - GATK AnalyzeCovariates
 recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
 gatk.cmd  <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, "  -plots ", recal.plot)
 execute(gatk.cmd)

 # STEP 6 - Index the recalibrated bam files
 execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))

 # STEP 6.1 - Create mapping and coverage statistics
 execute(paste0("samtools flagstats ", recal.bam))
 execute(paste0("samtools coverage ", recal.bam))

 q("no")
	# Add yourself to the sudo group
	su -
	usermod -aG sudo danny
	exit

	# Install the virtual box guest additions
	cd /media/cdrom0
	sudo sh ./VBoxLinuxAdditions.run

	# Install R and deps
	sudo apt install r-base
	sudo nano /etc/apt/sources.list
	sudo apt install libssl-dev
	sudo apt install libxml2-dev
	sudo apt install libcurl4-openssl-dev

	# Install R packages
	sudo R
	if (!require("BiocManager", quietly = TRUE))
	install.packages("BiocManager")
	BiocManager::install("GenomicFeatures")
	BiocManager::install("preprocessCore")
	q("no")

	# Install Trimmomatic
	sudo apt install git
	sudo apt install ant

	mkdir software
	cd software

	git clone https://github.com/usadellab/Trimmomatic.git
	cd Trimmomatic
	ant

	# Install STAR
	git clone https://github.com/alexdobin/STAR.git
	cd STAR/source
	# [UPDATE 2024] - Checkout an older version of STAR
	git checkout ee50484
	make

	# Install picard
	git clone https://github.com/broadinstitute/picard.git
	cd picard
	# [UPDATED 2024] - Checkout an older version of PICARD
	git checkout 5db8017
	./gradlew shadowJar

	# Install HTSlib / samtools / bcftools
	sudo apt install autoconf
	git clone https://github.com/samtools/htslib.git
	git clone https://github.com/samtools/samtools.git
	git clone https://github.com/samtools/bcftools.git

	# Install HTSlib
	cd htslib
	git submodule update --init --recursive
	autoreconf -i
	./configure
	make

	# Install samtools
	cd samtools
	autoheader
	autoconf -Wno-syntax
	./configure
	make

	# Install bcftools
	cd bcftools
	autoheader
	autoconf -Wno-syntax
	./configure
	make

	# GATK install
	wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip
	unzip gatk-4.2.6.1.zip

	# SRA
	wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz
	mkdir sratoolkit
	tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit
	./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive

	# Make symbolic links
	mkdir bin
	cd bin
	ln -s /home/danny/software/STAR/source/STAR STAR
	ln -s /home/danny/software/htslib/bgzip bgzip
	ln -s /home/danny/software/samtools/samtools samtools
	ln -s /home/danny/software/bcftools/bcftools bcftools
	ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump

	#Update the bashrc file
	nano ~/.bashrc

	# Add at the end:
	export PATH="$HOME/bin:$PATH"

	# Additional link:
	ln -s /home/danny/software/htslib/tabix tabix
	# Additional R package:
	sudo R
	install.packages("ggplot2")
	install.packages("gplots")
	install.packages("gsalib")
	q("no")

	wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip
	unzip IGV_Linux_2.14.1_WithJava.zip
	#
	# Download Saccharomyces Cerevisiae genome
	# copyright (c) 2022 - Danny Arends
	#

	uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/"
	base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome."
	chrs <- c(as.character(as.roman(seq(1:16))), "Mito")

	# Download
	for(chr in chrs){
	fname <- paste0(base, chr, ".fa.gz")

	# Download command
	cmd <- paste0("wget ", uri, fname)
	#cat(cmd, "\n")
	system(cmd)
	}


	# Create an empty the file
	cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
	for(chr in chrs){
	fname <- paste0(base, chr, ".fa.gz")
	# Extract and merge into a fast file
	cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
	#cat(cmd, "\n")
	system(cmd)
	}

	# Compress the fasta file using bgzip (keep original)
	cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
	#cat(cmd, "\n")
	system(cmd)

	# Delete the chromosomes
	for(chr in chrs){
	fname <- paste0(base, chr, ".fa.gz")
	# Extract and merge into a fast file
	cmd <- paste0("rm ", fname)
	#cat(cmd, "\n")
	system(cmd)
	}
	# Get the reference transcriptome and GTF for STAR
	wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
	gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
	wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz

	# Index the genome using samtools
	samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz

	# Generate genome/transcriptome index using STAR
	STAR --runThreadN 2 --runMode genomeGenerate \
	--genomeDir ~/genome/STAR \
	--genomeSAindexNbases 10 \
	--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \
	--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa

	# Get the reference SNPs and index using tabix
	tabix saccharomyces_cerevisiae.vcf.gz

	# Index the genome using picard
	java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \
	-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
	#
	# Align SRA reads to the Saccharomyces Cerevisiae genome
	# copyright (c) 2022 - Danny Arends
	#

	execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
	if(!is.na(outputfile) && file.exists(outputfile)){
	cat("Output for step exists, skipping this step\n");
	invisible("")
	}
	cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
	if(res[1] >= 1){
	cat("Error external process did not finish\n\n");
	if(quitOnError) q("no")
	}
	}

	input.dir <- "/home/danny/data/raw"
	input.base <- "SRR13978643" #Get from the command line
	output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
	genome.path <- "/home/danny/genome/STAR"
	ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
	ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"

	# Create an output folder
	if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
	if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }

	# STEP 0 - SRA Download and Compress
	setwd(input.dir)

	execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq"))
	execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
	execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))

	# STEP 1 - READ Trimming
	trim.files <- c(
	paste0(input.dir, "/", input.base,"_1.fastq.gz"),
	paste0(input.dir, "/", input.base,"_2.fastq.gz"),
	paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
	paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
	paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
	paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
	)
	trim.path <- "/home/danny/software/Trimmomatic"
	trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
	trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
	trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
	trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)

	execute(trim.cmd, trim.files[3])

	# STEP 1.1 - UNZIP for STAR
	execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
	execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))

	files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])

	# STEP 2 - Alignment using STAR
	star.outbase <- paste0(output.dir, "/", input.base)
	star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")

	star.exec <- "STAR --runMode alignReads"
	star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate")
	star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
	star.out <- paste0("--outFileNamePrefix ", star.outbase)
	star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)

	execute(star.cmd, star.bam)

	# STEP 2.1 - Create a samtools index
	execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
	# STEP 2.2 - Create mapping and coverage statistics
	execute(paste0("samtools flagstats ", star.bam))
	execute(paste0("samtools coverage ", star.bam))

	#STEP 3 - Remove duplicate reads using picard tools
	p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
	metrics.out <- paste0(star.outbase, "_metrics.txt")

	p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
	p.in <- paste0("-I ", star.bam)
	p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
	p.opts <- paste0("--REMOVE_DUPLICATES true")
	p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)

	execute(p.cmd, p.bam)

	# STEP 3.1 - Create a samtools index
	execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
	# STEP 3.2 - Create mapping and coverage statistics
	execute(paste0("samtools flagstats ", p.bam))
	execute(paste0("samtools coverage ", p.bam))

	# STEP 4 - Add read group (1) and sample run, library, and name
	rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
	rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
	p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
	execute(p.cmd)

	# STEP 4.1 - Create a samtools index
	execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))

	# STEP 5 - GATK prep
	gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
	gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)

	# STEP 5.1 - GATK BaseRecalibrator
	gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
	gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
	execute(gatk.cmd, gatk.cov1)

	# STEP 5.2 - GATK ApplyBQSR
	recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
	gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
	execute(gatk.cmd, recal.bam)

	# STEP 5.3 - GATK BaseRecalibrator
	gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
	gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
	execute(gatk.cmd, gatk.cov2)

	# STEP 5.4 - GATK AnalyzeCovariates
	recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
	gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
	execute(gatk.cmd)

	# STEP 6 - Index the recalibrated bam files
	execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))

	# STEP 6.1 - Create mapping and coverage statistics
	execute(paste0("samtools flagstats ", recal.bam))
	execute(paste0("samtools coverage ", recal.bam))

	q("no")