Skip to content

Instantly share code, notes, and snippets.

@sunilnahata
Forked from DannyArends/0_installSoftware.sh
Created April 21, 2024 10:03
Show Gist options
  • Save sunilnahata/6be392e0649351291e145e7b28793b2a to your computer and use it in GitHub Desktop.
Save sunilnahata/6be392e0649351291e145e7b28793b2a to your computer and use it in GitHub Desktop.
Scripts and other things for RNA-Seq
# Add yourself to the sudo group
su -
usermod -aG sudo danny
exit
# Install the virtual box guest additions
cd /media/cdrom0
sudo sh ./VBoxLinuxAdditions.run
# Install R and deps
sudo apt install r-base
sudo nano /etc/apt/sources.list
sudo apt install libssl-dev
sudo apt install libxml2-dev
sudo apt install libcurl4-openssl-dev
# Install R packages
sudo R
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("GenomicFeatures")
BiocManager::install("preprocessCore")
q("no")
# Install Trimmomatic
sudo apt install git
sudo apt install ant
mkdir software
cd software
git clone https://github.com/usadellab/Trimmomatic.git
cd Trimmomatic
ant
# Install STAR
git clone https://github.com/alexdobin/STAR.git
cd STAR/source
# [UPDATE 2024] - Checkout an older version of STAR
git checkout ee50484
make
# Install picard
git clone https://github.com/broadinstitute/picard.git
cd picard
# [UPDATED 2024] - Checkout an older version of PICARD
git checkout 5db8017
./gradlew shadowJar
# Install HTSlib / samtools / bcftools
sudo apt install autoconf
git clone https://github.com/samtools/htslib.git
git clone https://github.com/samtools/samtools.git
git clone https://github.com/samtools/bcftools.git
# Install HTSlib
cd htslib
git submodule update --init --recursive
autoreconf -i
./configure
make
# Install samtools
cd samtools
autoheader
autoconf -Wno-syntax
./configure
make
# Install bcftools
cd bcftools
autoheader
autoconf -Wno-syntax
./configure
make
# GATK install
wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip
unzip gatk-4.2.6.1.zip
# SRA
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz
mkdir sratoolkit
tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit
./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive
# Make symbolic links
mkdir bin
cd bin
ln -s /home/danny/software/STAR/source/STAR STAR
ln -s /home/danny/software/htslib/bgzip bgzip
ln -s /home/danny/software/samtools/samtools samtools
ln -s /home/danny/software/bcftools/bcftools bcftools
ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump
#Update the bashrc file
nano ~/.bashrc
# Add at the end:
export PATH="$HOME/bin:$PATH"
# Additional link:
ln -s /home/danny/software/htslib/tabix tabix
# Additional R package:
sudo R
install.packages("ggplot2")
install.packages("gplots")
install.packages("gsalib")
q("no")
wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip
unzip IGV_Linux_2.14.1_WithJava.zip
#
# Download Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/"
base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome."
chrs <- c(as.character(as.roman(seq(1:16))), "Mito")
# Download
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Download command
cmd <- paste0("wget ", uri, fname)
#cat(cmd, "\n")
system(cmd)
}
# Create an empty the file
cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Extract and merge into a fast file
cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
#cat(cmd, "\n")
system(cmd)
}
# Compress the fasta file using bgzip (keep original)
cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
#cat(cmd, "\n")
system(cmd)
# Delete the chromosomes
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Extract and merge into a fast file
cmd <- paste0("rm ", fname)
#cat(cmd, "\n")
system(cmd)
}
# Get the reference transcriptome and GTF for STAR
wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz
# Index the genome using samtools
samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
# Generate genome/transcriptome index using STAR
STAR --runThreadN 2 --runMode genomeGenerate \
--genomeDir ~/genome/STAR \
--genomeSAindexNbases 10 \
--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \
--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa
# Get the reference SNPs and index using tabix
tabix saccharomyces_cerevisiae.vcf.gz
# Index the genome using picard
java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \
-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
#
# Align SRA reads to the Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
if(!is.na(outputfile) && file.exists(outputfile)){
cat("Output for step exists, skipping this step\n");
invisible("")
}
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
if(res[1] >= 1){
cat("Error external process did not finish\n\n");
if(quitOnError) q("no")
}
}
input.dir <- "/home/danny/data/raw"
input.base <- "SRR13978643" #Get from the command line
output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
genome.path <- "/home/danny/genome/STAR"
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"
# Create an output folder
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }
# STEP 0 - SRA Download and Compress
setwd(input.dir)
execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq"))
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))
# STEP 1 - READ Trimming
trim.files <- c(
paste0(input.dir, "/", input.base,"_1.fastq.gz"),
paste0(input.dir, "/", input.base,"_2.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
)
trim.path <- "/home/danny/software/Trimmomatic"
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)
execute(trim.cmd, trim.files[3])
# STEP 1.1 - UNZIP for STAR
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])
# STEP 2 - Alignment using STAR
star.outbase <- paste0(output.dir, "/", input.base)
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")
star.exec <- "STAR --runMode alignReads"
star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate")
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
star.out <- paste0("--outFileNamePrefix ", star.outbase)
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)
execute(star.cmd, star.bam)
# STEP 2.1 - Create a samtools index
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
# STEP 2.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", star.bam))
execute(paste0("samtools coverage ", star.bam))
#STEP 3 - Remove duplicate reads using picard tools
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
metrics.out <- paste0(star.outbase, "_metrics.txt")
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
p.in <- paste0("-I ", star.bam)
p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
p.opts <- paste0("--REMOVE_DUPLICATES true")
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)
execute(p.cmd, p.bam)
# STEP 3.1 - Create a samtools index
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
# STEP 3.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", p.bam))
execute(paste0("samtools coverage ", p.bam))
# STEP 4 - Add read group (1) and sample run, library, and name
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
execute(p.cmd)
# STEP 4.1 - Create a samtools index
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))
# STEP 5 - GATK prep
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)
# STEP 5.1 - GATK BaseRecalibrator
gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
execute(gatk.cmd, gatk.cov1)
# STEP 5.2 - GATK ApplyBQSR
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
execute(gatk.cmd, recal.bam)
# STEP 5.3 - GATK BaseRecalibrator
gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
execute(gatk.cmd, gatk.cov2)
# STEP 5.4 - GATK AnalyzeCovariates
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
execute(gatk.cmd)
# STEP 6 - Index the recalibrated bam files
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))
# STEP 6.1 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", recal.bam))
execute(paste0("samtools coverage ", recal.bam))
q("no")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment