Skip to content

Instantly share code, notes, and snippets.

@DannyArends
Last active December 16, 2024 09:31
Show Gist options
  • Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
Scripts and other things for RNA-Seq
# Add yourself to the sudo group
su -
usermod -aG sudo danny
exit
# Install the virtual box guest additions
cd /media/cdrom0
sudo sh ./VBoxLinuxAdditions.run
# Install R and deps
sudo apt install r-base
sudo nano /etc/apt/sources.list
sudo apt install libssl-dev
sudo apt install libxml2-dev
sudo apt install libcurl4-openssl-dev
# Install R packages
sudo R
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("GenomicFeatures")
BiocManager::install("preprocessCore")
q("no")
# Install Trimmomatic
sudo apt install git
sudo apt install ant
mkdir software
cd software
git clone https://github.com/usadellab/Trimmomatic.git
cd Trimmomatic
ant
# Install STAR
git clone https://github.com/alexdobin/STAR.git
cd STAR/source
# [UPDATE 2024] - Checkout an older version of STAR
git checkout ee50484
make
# Install picard
git clone https://github.com/broadinstitute/picard.git
cd picard
# [UPDATED 2024] - Checkout an older version of PICARD
git checkout 5db8017
./gradlew shadowJar
# Install HTSlib / samtools / bcftools
sudo apt install autoconf
git clone https://github.com/samtools/htslib.git
git clone https://github.com/samtools/samtools.git
git clone https://github.com/samtools/bcftools.git
# Install HTSlib
cd htslib
git submodule update --init --recursive
autoreconf -i
./configure
make
# Install samtools
cd samtools
autoheader
autoconf -Wno-syntax
./configure
make
# Install bcftools
cd bcftools
autoheader
autoconf -Wno-syntax
./configure
make
# GATK install
wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip
unzip gatk-4.2.6.1.zip
# SRA
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz
mkdir sratoolkit
tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit
./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive
# Make symbolic links
mkdir bin
cd bin
ln -s /home/danny/software/STAR/source/STAR STAR
ln -s /home/danny/software/htslib/bgzip bgzip
ln -s /home/danny/software/samtools/samtools samtools
ln -s /home/danny/software/bcftools/bcftools bcftools
ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump
#Update the bashrc file
nano ~/.bashrc
# Add at the end:
export PATH="$HOME/bin:$PATH"
# Additional link:
ln -s /home/danny/software/htslib/tabix tabix
# Additional R package:
sudo R
install.packages("ggplot2")
install.packages("gplots")
install.packages("gsalib")
q("no")
wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip
unzip IGV_Linux_2.14.1_WithJava.zip
#
# Download Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/"
base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome."
chrs <- c(as.character(as.roman(seq(1:16))), "Mito")
# Download
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Download command
cmd <- paste0("wget ", uri, fname)
#cat(cmd, "\n")
system(cmd)
}
# Create an empty the file
cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Extract and merge into a fast file
cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
#cat(cmd, "\n")
system(cmd)
}
# Compress the fasta file using bgzip (keep original)
cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa")
#cat(cmd, "\n")
system(cmd)
# Delete the chromosomes
for(chr in chrs){
fname <- paste0(base, chr, ".fa.gz")
# Extract and merge into a fast file
cmd <- paste0("rm ", fname)
#cat(cmd, "\n")
system(cmd)
}
# Get the reference transcriptome and GTF for STAR
wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz
wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz
# Index the genome using samtools
samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
# Generate genome/transcriptome index using STAR
STAR --runThreadN 2 --runMode genomeGenerate \
--genomeDir ~/genome/STAR \
--genomeSAindexNbases 10 \
--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \
--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa
# Get the reference SNPs and index using tabix
tabix saccharomyces_cerevisiae.vcf.gz
# Index the genome using picard
java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \
-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz
#
# Align SRA reads to the Saccharomyces Cerevisiae genome
# copyright (c) 2022 - Danny Arends
#
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){
if(!is.na(outputfile) && file.exists(outputfile)){
cat("Output for step exists, skipping this step\n");
invisible("")
}
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n")
if(res[1] >= 1){
cat("Error external process did not finish\n\n");
if(quitOnError) q("no")
}
}
input.dir <- "/home/danny/data/raw"
input.base <- "SRR13978643" #Get from the command line
output.dir <- paste0("/home/danny/data/output/", input.base,".aln")
genome.path <- "/home/danny/genome/STAR"
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz"
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz"
# Create an output folder
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) }
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) }
# STEP 0 - SRA Download and Compress
setwd(input.dir)
execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq"))
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz"))
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz"))
# STEP 1 - READ Trimming
trim.files <- c(
paste0(input.dir, "/", input.base,"_1.fastq.gz"),
paste0(input.dir, "/", input.base,"_2.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"),
paste0(output.dir, "/", input.base,"_2.U.fastq.gz")
)
trim.path <- "/home/danny/software/Trimmomatic"
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar")
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10")
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts)
execute(trim.cmd, trim.files[3])
# STEP 1.1 - UNZIP for STAR
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3]))
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5]))
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)])
# STEP 2 - Alignment using STAR
star.outbase <- paste0(output.dir, "/", input.base)
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam")
star.exec <- "STAR --runMode alignReads"
star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate")
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" "))
star.out <- paste0("--outFileNamePrefix ", star.outbase)
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out)
execute(star.cmd, star.bam)
# STEP 2.1 - Create a samtools index
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai"))
# STEP 2.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", star.bam))
execute(paste0("samtools coverage ", star.bam))
#STEP 3 - Remove duplicate reads using picard tools
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam")
metrics.out <- paste0(star.outbase, "_metrics.txt")
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar"
p.in <- paste0("-I ", star.bam)
p.out <- paste0("-O ", p.bam, " -M ", metrics.out)
p.opts <- paste0("--REMOVE_DUPLICATES true")
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out)
execute(p.cmd, p.bam)
# STEP 3.1 - Create a samtools index
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai"))
# STEP 3.2 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", p.bam))
execute(paste0("samtools coverage ", p.bam))
# STEP 4 - Add read group (1) and sample run, library, and name
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam")
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base)
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts)
execute(p.cmd)
# STEP 4.1 - Create a samtools index
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai"))
# STEP 5 - GATK prep
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar"
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps)
# STEP 5.1 - GATK BaseRecalibrator
gatk.cov1 <- paste0(star.outbase, "_cov1.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1)
execute(gatk.cmd, gatk.cov1)
# STEP 5.2 - GATK ApplyBQSR
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam")
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam)
execute(gatk.cmd, recal.bam)
# STEP 5.3 - GATK BaseRecalibrator
gatk.cov2 <- paste0(star.outbase, "_cov2.txt")
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2)
execute(gatk.cmd, gatk.cov2)
# STEP 5.4 - GATK AnalyzeCovariates
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf")
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot)
execute(gatk.cmd)
# STEP 6 - Index the recalibrated bam files
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai"))
# STEP 6.1 - Create mapping and coverage statistics
execute(paste0("samtools flagstats ", recal.bam))
execute(paste0("samtools coverage ", recal.bam))
q("no")
@DannyArends
Copy link
Author

Hi Danny, I am currently a master's student in bioinformatics and have watched all your RNA-seq analysis videos. I found them incredibly insightful and greatly appreciate the effort you put into them. I have a question regarding RNA-seq data analysis: if you have any Linux command codes related to the analysis, could you please upload them to your GitHub repository? I would like to practice on my Linux machine.

Hey @aberakenea, the shell commands generated by R, are specific to Linux (the sh bash). That is why we run them in a Debian virtual machine under Windows. The scripts should work without much modification on e.g. Ubuntu or CentOS as long as the shell used to run them is the bash / sh shell.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment