Last active
December 16, 2024 09:31
-
-
Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
Scripts and other things for RNA-Seq
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Add yourself to the sudo group | |
su - | |
usermod -aG sudo danny | |
exit | |
# Install the virtual box guest additions | |
cd /media/cdrom0 | |
sudo sh ./VBoxLinuxAdditions.run | |
# Install R and deps | |
sudo apt install r-base | |
sudo nano /etc/apt/sources.list | |
sudo apt install libssl-dev | |
sudo apt install libxml2-dev | |
sudo apt install libcurl4-openssl-dev | |
# Install R packages | |
sudo R | |
if (!require("BiocManager", quietly = TRUE)) | |
install.packages("BiocManager") | |
BiocManager::install("GenomicFeatures") | |
BiocManager::install("preprocessCore") | |
q("no") | |
# Install Trimmomatic | |
sudo apt install git | |
sudo apt install ant | |
mkdir software | |
cd software | |
git clone https://github.com/usadellab/Trimmomatic.git | |
cd Trimmomatic | |
ant | |
# Install STAR | |
git clone https://github.com/alexdobin/STAR.git | |
cd STAR/source | |
# [UPDATE 2024] - Checkout an older version of STAR | |
git checkout ee50484 | |
make | |
# Install picard | |
git clone https://github.com/broadinstitute/picard.git | |
cd picard | |
# [UPDATED 2024] - Checkout an older version of PICARD | |
git checkout 5db8017 | |
./gradlew shadowJar | |
# Install HTSlib / samtools / bcftools | |
sudo apt install autoconf | |
git clone https://github.com/samtools/htslib.git | |
git clone https://github.com/samtools/samtools.git | |
git clone https://github.com/samtools/bcftools.git | |
# Install HTSlib | |
cd htslib | |
git submodule update --init --recursive | |
autoreconf -i | |
./configure | |
make | |
# Install samtools | |
cd samtools | |
autoheader | |
autoconf -Wno-syntax | |
./configure | |
make | |
# Install bcftools | |
cd bcftools | |
autoheader | |
autoconf -Wno-syntax | |
./configure | |
make | |
# GATK install | |
wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip | |
unzip gatk-4.2.6.1.zip | |
# SRA | |
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz | |
mkdir sratoolkit | |
tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit | |
./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive | |
# Make symbolic links | |
mkdir bin | |
cd bin | |
ln -s /home/danny/software/STAR/source/STAR STAR | |
ln -s /home/danny/software/htslib/bgzip bgzip | |
ln -s /home/danny/software/samtools/samtools samtools | |
ln -s /home/danny/software/bcftools/bcftools bcftools | |
ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump | |
#Update the bashrc file | |
nano ~/.bashrc | |
# Add at the end: | |
export PATH="$HOME/bin:$PATH" | |
# Additional link: | |
ln -s /home/danny/software/htslib/tabix tabix | |
# Additional R package: | |
sudo R | |
install.packages("ggplot2") | |
install.packages("gplots") | |
install.packages("gsalib") | |
q("no") | |
wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip | |
unzip IGV_Linux_2.14.1_WithJava.zip |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Download Saccharomyces Cerevisiae genome | |
# copyright (c) 2022 - Danny Arends | |
# | |
uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/" | |
base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome." | |
chrs <- c(as.character(as.roman(seq(1:16))), "Mito") | |
# Download | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Download command | |
cmd <- paste0("wget ", uri, fname) | |
#cat(cmd, "\n") | |
system(cmd) | |
} | |
# Create an empty the file | |
cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Extract and merge into a fast file | |
cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
#cat(cmd, "\n") | |
system(cmd) | |
} | |
# Compress the fasta file using bgzip (keep original) | |
cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
#cat(cmd, "\n") | |
system(cmd) | |
# Delete the chromosomes | |
for(chr in chrs){ | |
fname <- paste0(base, chr, ".fa.gz") | |
# Extract and merge into a fast file | |
cmd <- paste0("rm ", fname) | |
#cat(cmd, "\n") | |
system(cmd) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Get the reference transcriptome and GTF for STAR | |
wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz | |
# Index the genome using samtools | |
samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz | |
# Generate genome/transcriptome index using STAR | |
STAR --runThreadN 2 --runMode genomeGenerate \ | |
--genomeDir ~/genome/STAR \ | |
--genomeSAindexNbases 10 \ | |
--sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \ | |
--genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa | |
# Get the reference SNPs and index using tabix | |
tabix saccharomyces_cerevisiae.vcf.gz | |
# Index the genome using picard | |
java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \ | |
-R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Align SRA reads to the Saccharomyces Cerevisiae genome | |
# copyright (c) 2022 - Danny Arends | |
# | |
execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){ | |
if(!is.na(outputfile) && file.exists(outputfile)){ | |
cat("Output for step exists, skipping this step\n"); | |
invisible("") | |
} | |
cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n") | |
if(res[1] >= 1){ | |
cat("Error external process did not finish\n\n"); | |
if(quitOnError) q("no") | |
} | |
} | |
input.dir <- "/home/danny/data/raw" | |
input.base <- "SRR13978643" #Get from the command line | |
output.dir <- paste0("/home/danny/data/output/", input.base,".aln") | |
genome.path <- "/home/danny/genome/STAR" | |
ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz" | |
ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz" | |
# Create an output folder | |
if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) } | |
if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) } | |
# STEP 0 - SRA Download and Compress | |
setwd(input.dir) | |
execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq")) | |
execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz")) | |
execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz")) | |
# STEP 1 - READ Trimming | |
trim.files <- c( | |
paste0(input.dir, "/", input.base,"_1.fastq.gz"), | |
paste0(input.dir, "/", input.base,"_2.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_1.P.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_1.U.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_2.P.fastq.gz"), | |
paste0(output.dir, "/", input.base,"_2.U.fastq.gz") | |
) | |
trim.path <- "/home/danny/software/Trimmomatic" | |
trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar") | |
trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10") | |
trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36") | |
trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts) | |
execute(trim.cmd, trim.files[3]) | |
# STEP 1.1 - UNZIP for STAR | |
execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3])) | |
execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5])) | |
files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)]) | |
# STEP 2 - Alignment using STAR | |
star.outbase <- paste0(output.dir, "/", input.base) | |
star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam") | |
star.exec <- "STAR --runMode alignReads" | |
star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate") | |
star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" ")) | |
star.out <- paste0("--outFileNamePrefix ", star.outbase) | |
star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out) | |
execute(star.cmd, star.bam) | |
# STEP 2.1 - Create a samtools index | |
execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai")) | |
# STEP 2.2 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", star.bam)) | |
execute(paste0("samtools coverage ", star.bam)) | |
#STEP 3 - Remove duplicate reads using picard tools | |
p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam") | |
metrics.out <- paste0(star.outbase, "_metrics.txt") | |
p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar" | |
p.in <- paste0("-I ", star.bam) | |
p.out <- paste0("-O ", p.bam, " -M ", metrics.out) | |
p.opts <- paste0("--REMOVE_DUPLICATES true") | |
p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out) | |
execute(p.cmd, p.bam) | |
# STEP 3.1 - Create a samtools index | |
execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai")) | |
# STEP 3.2 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", p.bam)) | |
execute(paste0("samtools coverage ", p.bam)) | |
# STEP 4 - Add read group (1) and sample run, library, and name | |
rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam") | |
rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base) | |
p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts) | |
execute(p.cmd) | |
# STEP 4.1 - Create a samtools index | |
execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai")) | |
# STEP 5 - GATK prep | |
gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar" | |
gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps) | |
# STEP 5.1 - GATK BaseRecalibrator | |
gatk.cov1 <- paste0(star.outbase, "_cov1.txt") | |
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1) | |
execute(gatk.cmd, gatk.cov1) | |
# STEP 5.2 - GATK ApplyBQSR | |
recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam") | |
gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam) | |
execute(gatk.cmd, recal.bam) | |
# STEP 5.3 - GATK BaseRecalibrator | |
gatk.cov2 <- paste0(star.outbase, "_cov2.txt") | |
gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2) | |
execute(gatk.cmd, gatk.cov2) | |
# STEP 5.4 - GATK AnalyzeCovariates | |
recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf") | |
gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot) | |
execute(gatk.cmd) | |
# STEP 6 - Index the recalibrated bam files | |
execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai")) | |
# STEP 6.1 - Create mapping and coverage statistics | |
execute(paste0("samtools flagstats ", recal.bam)) | |
execute(paste0("samtools coverage ", recal.bam)) | |
q("no") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hey @aberakenea, the shell commands generated by R, are specific to Linux (the sh bash). That is why we run them in a Debian virtual machine under Windows. The scripts should work without much modification on e.g. Ubuntu or CentOS as long as the shell used to run them is the bash / sh shell.