-
-
Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
# Add yourself to the sudo group | |
su - | |
usermod -aG sudo danny | |
exit | |
# Install the virtual box guest additions | |
cd /media/cdrom0 | |
sudo sh ./VBoxLinuxAdditions.run | |
# Install R and deps | |
sudo apt install r-base | |
sudo nano /etc/apt/sources.list | |
sudo apt install libssl-dev | |
sudo apt install libxml2-dev | |
sudo apt install libcurl4-openssl-dev | |
# Install R packages | |
sudo R | |
if (!require("BiocManager", quietly = TRUE)) | |
install.packages("BiocManager") | |
BiocManager::install("GenomicFeatures") | |
BiocManager::install("preprocessCore") | |
q("no") | |
# Install Trimmomatic | |
sudo apt install git | |
sudo apt install ant | |
mkdir software | |
cd software | |
git clone https://github.com/usadellab/Trimmomatic.git | |
cd Trimmomatic | |
ant | |
# Install STAR | |
git clone https://github.com/alexdobin/STAR.git | |
cd STAR/source | |
# [UPDATE 2024] - Checkout an older version of STAR | |
git checkout ee50484 | |
make | |
# Install picard | |
git clone https://github.com/broadinstitute/picard.git | |
cd picard | |
# [UPDATED 2024] - Checkout an older version of PICARD | |
git checkout 5db8017 | |
./gradlew shadowJar | |
# Install HTSlib / samtools / bcftools | |
sudo apt install autoconf | |
git clone https://github.com/samtools/htslib.git | |
git clone https://github.com/samtools/samtools.git | |
git clone https://github.com/samtools/bcftools.git | |
# Install HTSlib | |
cd htslib | |
git submodule update --init --recursive | |
autoreconf -i | |
./configure | |
make | |
# Install samtools | |
cd samtools | |
autoheader | |
autoconf -Wno-syntax | |
./configure | |
make | |
# Install bcftools | |
cd bcftools | |
autoheader | |
autoconf -Wno-syntax | |
./configure | |
make | |
# GATK install | |
wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip | |
unzip gatk-4.2.6.1.zip | |
# SRA | |
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz | |
mkdir sratoolkit | |
tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit | |
./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive | |
# Make symbolic links | |
mkdir bin | |
cd bin | |
ln -s /home/danny/software/STAR/source/STAR STAR | |
ln -s /home/danny/software/htslib/bgzip bgzip | |
ln -s /home/danny/software/samtools/samtools samtools | |
ln -s /home/danny/software/bcftools/bcftools bcftools | |
ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump | |
#Update the bashrc file | |
nano ~/.bashrc | |
# Add at the end: | |
export PATH="$HOME/bin:$PATH" | |
# Additional link: | |
ln -s /home/danny/software/htslib/tabix tabix | |
# Additional R package: | |
sudo R | |
install.packages("ggplot2") | |
install.packages("gplots") | |
install.packages("gsalib") | |
q("no") | |
wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip | |
unzip IGV_Linux_2.14.1_WithJava.zip |
# | |
# Download Saccharomyces Cerevisiae genome | |
# copyright (c) 2022 - Danny Arends | |
# | |
uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/" | |
base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome." | |
chrs <- c(as.character(as.roman(seq(1:16))), "Mito") | |
# Download | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Download command | |
cmd <- paste0("wget ", uri, fname) | |
#cat(cmd, "\n") | |
system(cmd) | |
} | |
# Create an empty the file | |
cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Extract and merge into a fast file | |
cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
#cat(cmd, "\n") | |
system(cmd) | |
} | |
# Compress the fasta file using bgzip (keep original) | |
cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
#cat(cmd, "\n") | |
system(cmd) | |
# Delete the chromosomes | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Extract and merge into a fast file | |
cmd <- paste0("rm ", fname) | |
#cat(cmd, "\n") | |
system(cmd) | |
} |
# Get the reference transcriptome and GTF for STAR | |
wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz | |
# Index the genome using samtools | |
samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz | |
# Generate genome/transcriptome index using STAR | |
STAR --runThreadN 2 --runMode genomeGenerate \ | |
--genomeDir ~/genome/STAR \ | |
--genomeSAindexNbases 10 \ | |
--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \ | |
--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa | |
# Get the reference SNPs and index using tabix | |
tabix saccharomyces_cerevisiae.vcf.gz | |
# Index the genome using picard | |
java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \ | |
-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz |
# | |
# Align SRA reads to the Saccharomyces Cerevisiae genome | |
# copyright (c) 2022 - Danny Arends | |
# | |
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){ | |
if(!is.na(outputfile) && file.exists(outputfile)){ | |
cat("Output for step exists, skipping this step\n"); | |
invisible("") | |
} | |
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n") | |
if(res[1] >= 1){ | |
cat("Error external process did not finish\n\n"); | |
if(quitOnError) q("no") | |
} | |
} | |
input.dir <- "/home/danny/data/raw" | |
input.base <- "SRR13978643" #Get from the command line | |
output.dir <- paste0("/home/danny/data/output/", input.base,".aln") | |
genome.path <- "/home/danny/genome/STAR" | |
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz" | |
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz" | |
# Create an output folder | |
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) } | |
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) } | |
# STEP 0 - SRA Download and Compress | |
setwd(input.dir) | |
execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq")) | |
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz")) | |
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz")) | |
# STEP 1 - READ Trimming | |
trim.files <- c( | |
paste0(input.dir, "/", input.base,"_1.fastq.gz"), | |
paste0(input.dir, "/", input.base,"_2.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_2.U.fastq.gz") | |
) | |
trim.path <- "/home/danny/software/Trimmomatic" | |
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar") | |
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10") | |
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36") | |
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts) | |
execute(trim.cmd, trim.files[3]) | |
# STEP 1.1 - UNZIP for STAR | |
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3])) | |
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5])) | |
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)]) | |
# STEP 2 - Alignment using STAR | |
star.outbase <- paste0(output.dir, "/", input.base) | |
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam") | |
star.exec <- "STAR --runMode alignReads" | |
star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate") | |
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" ")) | |
star.out <- paste0("--outFileNamePrefix ", star.outbase) | |
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out) | |
execute(star.cmd, star.bam) | |
# STEP 2.1 - Create a samtools index | |
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai")) | |
# STEP 2.2 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", star.bam)) | |
execute(paste0("samtools coverage ", star.bam)) | |
#STEP 3 - Remove duplicate reads using picard tools | |
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam") | |
metrics.out <- paste0(star.outbase, "_metrics.txt") | |
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar" | |
p.in <- paste0("-I ", star.bam) | |
p.out <- paste0("-O ", p.bam, " -M ", metrics.out) | |
p.opts <- paste0("--REMOVE_DUPLICATES true") | |
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out) | |
execute(p.cmd, p.bam) | |
# STEP 3.1 - Create a samtools index | |
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai")) | |
# STEP 3.2 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", p.bam)) | |
execute(paste0("samtools coverage ", p.bam)) | |
# STEP 4 - Add read group (1) and sample run, library, and name | |
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam") | |
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base) | |
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts) | |
execute(p.cmd) | |
# STEP 4.1 - Create a samtools index | |
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai")) | |
# STEP 5 - GATK prep | |
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar" | |
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps) | |
# STEP 5.1 - GATK BaseRecalibrator | |
gatk.cov1 <- paste0(star.outbase, "_cov1.txt") | |
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1) | |
execute(gatk.cmd, gatk.cov1) | |
# STEP 5.2 - GATK ApplyBQSR | |
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam") | |
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam) | |
execute(gatk.cmd, recal.bam) | |
# STEP 5.3 - GATK BaseRecalibrator | |
gatk.cov2 <- paste0(star.outbase, "_cov2.txt") | |
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2) | |
execute(gatk.cmd, gatk.cov2) | |
# STEP 5.4 - GATK AnalyzeCovariates | |
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf") | |
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot) | |
execute(gatk.cmd) | |
# STEP 6 - Index the recalibrated bam files | |
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai")) | |
# STEP 6.1 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", recal.bam)) | |
execute(paste0("samtools coverage ", recal.bam)) | |
q("no") |
Hi Danny,
I am currently a master's student in bioinformatics and have watched all your RNA-seq analysis videos. I found them incredibly insightful and greatly appreciate the effort you put into them. I have a question regarding RNA-seq data analysis: if you have any Linux command codes related to the analysis, could you please upload them to your GitHub repository? I would like to practice on my Linux machine.
Hi Danny, I am currently a master's student in bioinformatics and have watched all your RNA-seq analysis videos. I found them incredibly insightful and greatly appreciate the effort you put into them. I have a question regarding RNA-seq data analysis: if you have any Linux command codes related to the analysis, could you please upload them to your GitHub repository? I would like to practice on my Linux machine.
Hey @aberakenea, the shell commands generated by R, are specific to Linux (the sh bash). That is why we run them in a Debian virtual machine under Windows. The scripts should work without much modification on e.g. Ubuntu or CentOS as long as the shell used to run them is the bash / sh shell.
Hello,
when I do:
git clone https://github.com/broadinstitute/picard.git
I see:
then I do:
cd picard/
git checkout 5db8017
and afterwards see:
then I do:
./gradlew shadowJar
and see:
How to fix this?
Thank you