Created
October 2, 2015 19:29
-
-
Save k3yavi/ec251ef47f400bac11bc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
########################################################## | |
#requires snakemake, python3, pyfasta to be installed | |
#save this file and provide all the binaries and their path | |
#in variables below. | |
#to run flux pipeline: | |
#snakemake run_flux_pipeline | |
#to run rsem pipeline: | |
#snakemake run_rsem_pipeline | |
########################################################## | |
#------------------------------------------------- | |
#Initial variable | |
#------------------------------------------------- | |
pwd = os.getcwd() | |
########################################################## | |
#Paths for binaries | |
#We assume these binaries are available and their path can | |
#be specifies below: | |
#1. flux simulator. | |
#2. rsem-prepare-reference | |
#3. sailfish (commit 8ec6df16dd5af1a2278d875c143cb7f736339771) | |
#4. kallisto (version 0.42.3) | |
#5. bowtie2-build | |
#6. bowtie2 | |
#7. parseAlignment (Bitseq) (version 0.7.5) | |
#8. estimateVBExpression (Bitseq) (version 0.7.5) | |
########################################################## | |
flux_binary="{}/binaries/flux-simulator-1.2.1/bin/flux-simulator".format(pwd) | |
rsem_prepare_ref_binary="{}/binaries/rsem-1.2.22/rsem-prepare-reference".format(pwd) | |
sailfish_binary="{}/binaries/SailfishBeta-latest_ubuntu-12.04/bin/sailfish".format(pwd) | |
kallisto_binary="{}/binaries/kallisto/build/src/kallisto".format(pwd) | |
bowtie2_build_binary="/mnt/scratch3/avi/bowtie2-master/bowtie2-build" | |
bowtie2_binary="/mnt/scratch3/avi/bowtie2-master/bowtie2" | |
bitseq_parse_binary="{}/binaries/BitSeq/parseAlignment".format(pwd) | |
bitseq_estimate_binary="{}/binaries/BitSeq/estimateVBExpression".format(pwd) | |
########################################################## | |
########################################################## | |
#rsem data | |
#We assume rsem has been run with kallisto's pipeline | |
#Need to give path of reference and PE reads generated | |
########################################################## | |
#Transcriptome | |
rsem_transcriptome_fullpath="/mnt/scratch1/simulated_data/rsem_sim_data/refs/Homo_sapiens.GRCh37.75.cdna.pc.fa" | |
rsem_first_mate_path = "/mnt/scratch1/simulated_data/rsem_sim_data/11_1.fq" | |
rsem_second_mate_path = "/mnt/scratch1/simulated_data/rsem_sim_data/11_2.fq" | |
########################################################## | |
#flux data | |
########################################################## | |
#Genome | |
flux_genome_file="Homo_sapiens.GRCh38.dna.toplevel.fa" | |
flux_genome_ftp_path="ftp://ftp.ensembl.org/pub/release-80/fasta/homo_sapiens/dna/{}.gz".format(flux_genome_file) | |
flux_genome_path="{}/data/Human_Genome".format(pwd) | |
flux_genome_fullpath="{}/{}".format(flux_genome_path, flux_genome_file) | |
#Gene annotation | |
flux_annotations_file="Homo_sapiens.GRCh38.80.gtf" | |
flux_annotations_ftp_prefix="ftp://ftp.ensembl.org/pub/release-80/gtf/homo_sapiens" | |
flux_annotations_path="{}/data/HumanGenomeAnnotations".format(pwd) | |
flux_annotations_fullpath = "{}/{}".format(flux_annotations_path, flux_annotations_file) | |
#Transcriptome | |
flux_transcriptome_path = "{}/data/Transcriptome".format(pwd) | |
flux_transcriptome_file="Human_Genome_filtered.transcripts.fa" | |
flux_transcriptome_prefix="Human_Genome" | |
flux_raw_transcriptome_path = "{}/data/Transcriptome/{}.transcripts.fa".format(pwd, flux_transcriptome_prefix) | |
flux_transcriptome_fullpath="{}/{}".format(flux_transcriptome_path, flux_transcriptome_file) | |
#flux_transcriptome_fullpath="/mnt/scratch1/avi/data/bigdata/hg19/reference.transcripts.fa" | |
########################################################### | |
#flux variables | |
flux_data_path = "{}/data/FluxSim".format(pwd) | |
flux_reads_path = "{}/reads".format(flux_data_path) | |
flux_first_mate_path = "{}/r1.fq".format(flux_reads_path) | |
flux_second_mate_path = "{}/r2.fq".format(flux_reads_path) | |
#flux_first_mate_path = "/mnt/scratch1/avi/data/25M/1.fastq" | |
#flux_second_mate_path = "/mnt/scratch1/avi/data/25M/2.fastq" | |
flux_config = """TMP_DIR {}/tmp | |
### Expression ### | |
REF_FILE_NAME {}/{} | |
GEN_DIR {} | |
## Library preparation | |
# # Expression | |
# | |
NB_MOLECULES 5000000 | |
TSS_MEAN 50 | |
POLYA_SCALE 100 | |
POLYA_SHAPE 1.5 | |
# | |
#### Fragmentation ### | |
FRAG_SUBSTRATE RNA | |
FRAG_METHOD UR | |
FRAG_UR_ETA 350 | |
# | |
#### Reverse Transcription ### | |
RTRANSCRIPTION YES | |
RT_MOTIF default | |
# | |
#### Amplification ### | |
GC_MEAN NaN | |
PCR_PROBABILITY 0.05 | |
PCR_DISTRIBUTION default | |
#### Size Filtering ### | |
FILTERING YES | |
#### Sequencing ### | |
READ_NUMBER 60000000 | |
READ_LENGTH 76 | |
PAIRED_END YES | |
ERR_FILE 76 | |
FASTA YES | |
""".format(flux_data_path, flux_annotations_path, flux_annotations_file, flux_genome_path) | |
################################################### | |
# Quantifier variables | |
################################################### | |
sailfish_flux_index_path = "{}/data/sailfish/flux/index".format(pwd) | |
sailfish_flux_quant_path = "{}/data/sailfish/flux/quant".format(pwd) | |
sailfish_threads = 20 | |
kallisto_flux_index_path = "{}/data/kallisto/flux/index".format(pwd) | |
kallisto_flux_quant_path = "{}/data/kallisto/flux/quant".format(pwd) | |
kallisto_flux_index_prefix = "index" | |
bowtie2_flux_index_path = "{}/data/bowtie2/flux/index".format(pwd) | |
bowtie2_flux_align_path = "{}/data/bowtie2/flux/align".format(pwd) | |
bowtie2_flux_sam = "{}/data1-1.sam".format(bowtie2_flux_align_path) | |
bowtie2_index_prefix="index" | |
bowtie2_threads = 20 | |
bitseq_flux_parse_path = "{}/data/bitseq/flux/parse".format(pwd) | |
bitseq_flux_prob_path = "{}/data.prob".format(bitseq_flux_parse_path) | |
bitseq_flux_trinfo_path = "{}/ensemblGenes.tr".format(bitseq_flux_parse_path) | |
bitseq_flux_quant_path = "{}/data/bitseq/flux/quant".format(pwd) | |
bitseq_threads = 20 | |
sailfish_rsem_index_path = "{}/data/sailfish/rsem/index".format(pwd) | |
sailfish_rsem_quant_path = "{}/data/sailfish/rsem/quant".format(pwd) | |
kallisto_rsem_index_path = "{}/data/kallisto/rsem/index".format(pwd) | |
kallisto_rsem_quant_path = "{}/data/kallisto/rsem/quant".format(pwd) | |
kallisto_rsem_index_prefix = "index" | |
bowtie2_rsem_index_path = "{}/data/bowtie2/rsem/index".format(pwd) | |
bowtie2_rsem_align_path = "{}/data/bowtie2/rsem/align".format(pwd) | |
bowtie2_rsem_sam = "{}/data1-1.sam".format(bowtie2_rsem_align_path) | |
bitseq_rsem_parse_path = "{}/data/bitseq/rsem/parse".format(pwd) | |
bitseq_rsem_prob_path = "{}/data.prob".format(bitseq_rsem_parse_path) | |
bitseq_rsem_trinfo_path = "{}/ensemblGenes.tr".format(bitseq_rsem_parse_path) | |
bitseq_rsem_quant_path = "{}/data/bitseq/rsem/quant".format(pwd) | |
bitseq_prefix = "quant" | |
################################################### | |
simulator = ["fluxsim", "rsem"] | |
#-------------------------------------------------- | |
#Rules for running test | |
#-------------------------------------------------- | |
rule obtain_flux_genome: | |
output: | |
flux_genome_path | |
shell: | |
""" | |
mkdir -p {flux_genome_path} | |
cd {flux_genome_path} | |
wget {flux_genome_ftp_path} | |
gunzip {flux_genome_file}.gz | |
pyfasta split --header="%(seqid)s.fasta" {flux_genome_file} | |
rename -v 's/(\w)\s+.*\.fasta/$1.fa/' *.fasta | |
""" | |
rule obtain_flux_annotations: | |
output: | |
flux_annotations_fullpath | |
shell: | |
""" | |
mkdir -p {flux_annotations_path} | |
cd {flux_annotations_path} | |
wget {flux_annotations_ftp_prefix}/{flux_annotations_file}.gz | |
gunzip {flux_annotations_file}.gz | |
""" | |
rule create_flux_transcriptome: | |
input: | |
genome={flux_genome_path}, | |
gtf= {flux_annotations_fullpath} | |
output: | |
flux_raw_transcriptome_path | |
shell: | |
""" | |
mkdir -p {flux_transcriptome_path} | |
{rsem_prepare_ref_binary} --gtf {input.gtf} {input.genome} {flux_transcriptome_path}/{flux_transcriptome_prefix} | |
""" | |
rule filter_flux_transcriptome: | |
input: | |
flux_raw_transcriptome_path | |
output: | |
flux_transcriptome_fullpath | |
run: | |
#taken from https://www.biostars.org/p/667/ | |
from pyfasta import Fasta | |
import re | |
f = Fasta("{}".format(flux_raw_transcriptome_path)) | |
ncar = 60 | |
with open("{}".format(flux_transcriptome_fullpath), "w") as out_file: | |
for header in f.keys(): | |
name = str(header) | |
seq = str(f[header]) | |
if not bool(re.match('[n]*$', seq, re.IGNORECASE)): | |
out_file.write('>'+ name + "\n") | |
while len(seq) > 0: | |
out_file.write(seq[:ncar] + "\n") | |
seq = seq[ncar:] | |
rule run_flux: | |
input: | |
flux_genome_path, | |
flux_annotations_fullpath | |
output: | |
flux_data_path | |
run: | |
shell("mkdir -p {}".format(flux_data_path)) | |
shell("rm -rf {}/tmp".format(flux_data_path)) | |
shell("mkdir -p {}/tmp".format(flux_data_path)) | |
flux_config_fname = "{}/config.par".format(flux_data_path) | |
with open(flux_config_fname, "w") as f: | |
f.write(flux_config) | |
#might have to change set -x FLUX_MEM '20G' to export FLUX_MEM='20G' if using bash | |
shell("set -x FLUX_MEM '20G'; {} -p {} -l -s -x".format(flux_binary, flux_config_fname)) | |
rule generate_flux_reads: | |
input: | |
flux_data_path | |
output: | |
flux_first_mate_path, | |
flux_second_mate_path | |
run: | |
shell("mkdir -p {}".format(flux_reads_path)) | |
r1 = open('{}/r1.fq'.format(flux_reads_path), 'w') | |
r2 = open('{}/r2.fq'.format(flux_reads_path), 'w') | |
[r1.write(line) if (i % 8 < 4) else r2.write(line) for i, line in enumerate(open("{}/config.fastq".format(flux_data_path), 'r'))] | |
r1.close() | |
r2.close() | |
rule run_sailfish_flux_index: | |
input: | |
transcripts={flux_transcriptome_fullpath} | |
output: | |
sailfish_flux_index_path | |
threads: | |
sailfish_threads | |
shell: | |
""" | |
mkdir -p {sailfish_flux_index_path} | |
{sailfish_binary} index -t {input.transcripts} -k 31 -o {sailfish_flux_index_path} -p {sailfish_threads} | |
""" | |
rule run_sailfish_flux_quant: | |
input: | |
first = {flux_first_mate_path}, | |
second = {flux_second_mate_path}, | |
index = {sailfish_flux_index_path} | |
output: | |
sailfish_flux_quant_path | |
threads: | |
sailfish_threads | |
shell: | |
""" | |
mkdir -p {sailfish_flux_quant_path} | |
{sailfish_binary} quant -i {input.index} -l IU -1 {input.first} -2 {input.second} -p {sailfish_threads} -o {sailfish_flux_quant_path} --useVBOpt | |
""" | |
rule run_kallisto_flux_index: | |
input: | |
transcripts = {flux_transcriptome_fullpath} | |
output: | |
kallisto_flux_index_path | |
shell: | |
""" | |
mkdir -p {kallisto_flux_index_path} | |
{kallisto_binary} index -i {kallisto_flux_index_path}/{kallisto_flux_index_prefix} -k 31 {input.transcripts} | |
""" | |
rule run_kallisto_flux_quant: | |
input: | |
first = {flux_first_mate_path}, | |
second = {flux_second_mate_path}, | |
index = {kallisto_flux_index_path} | |
output: | |
kallisto_flux_quant_path | |
shell: | |
""" | |
mkdir -p {kallisto_flux_quant_path} | |
{kallisto_binary} quant -i {kallisto_flux_index_path}/{kallisto_flux_index_prefix} -o {kallisto_flux_quant_path} {input.first} {input.second} | |
""" | |
rule run_bowtie2_flux_index: | |
input: | |
transcripts = {flux_transcriptome_fullpath} | |
output: | |
bowtie2_flux_index_path | |
threads: | |
bowtie2_threads | |
shell: | |
""" | |
mkdir -p {bowtie2_flux_index_path} | |
{bowtie2_build_binary} -f {input.transcripts} {bowtie2_flux_index_path}/{bowtie2_index_prefix} | |
""" | |
rule run_bowtie2_flux_align: | |
input: | |
first = {flux_first_mate_path}, | |
second = {flux_second_mate_path}, | |
index = {bowtie2_flux_index_path} | |
output: | |
bowtie2_flux_sam | |
threads: | |
bowtie2_threads | |
shell: | |
""" | |
mkdir -p {bowtie2_flux_align_path} | |
{bowtie2_binary} -q -k 100 --threads {bowtie2_threads} --no-mixed --no-discordant -x {input.index}/{bowtie2_index_prefix} -1 {input.first} -2 {input.second} -S {bowtie2_flux_sam} | |
""" | |
rule run_bitseq_flux_parse: | |
input: | |
sam = {bowtie2_flux_sam}, | |
transcripts = {flux_transcriptome_fullpath} | |
output: | |
bitseq_flux_prob_path | |
shell: | |
""" | |
mkdir -p {bitseq_flux_parse_path} | |
{bitseq_parse_binary} {input.sam} -o {bitseq_flux_prob_path} --trSeqFile {input.transcripts} --trInfoFile {bitseq_flux_trinfo_path} --uniform --verbose | |
""" | |
rule run_bitseq_flux_quant: | |
input: | |
bitseq_flux_prob_path | |
output: | |
bitseq_flux_quant_path | |
shell: | |
""" | |
mkdir -p {bitseq_flux_quant_path} | |
{bitseq_estimate_binary} -o {bitseq_flux_quant_path}/{bitseq_prefix} {bitseq_flux_prob_path} | |
""" | |
rule run_flux_pipeline: | |
input: | |
sailfish = sailfish_flux_quant_path, | |
kallisto = kallisto_flux_quant_path, | |
bitseq = bitseq_flux_quant_path | |
################################################################################## | |
# rsem rules | |
################################################################################## | |
rule run_sailfish_rsem_index: | |
input: | |
transcripts={rsem_transcriptome_fullpath} | |
output: | |
sailfish_rsem_index_path | |
threads: | |
sailfish_threads | |
shell: | |
""" | |
mkdir -p {sailfish_rsem_index_path} | |
{sailfish_binary} index -t {input.transcripts} -k 31 -o {sailfish_rsem_index_path} -p {sailfish_threads} | |
""" | |
rule run_sailfish_rsem_quant: | |
input: | |
first = {rsem_first_mate_path}, | |
second = {rsem_second_mate_path}, | |
index = {sailfish_rsem_index_path} | |
output: | |
sailfish_rsem_quant_path | |
threads: | |
sailfish_threads | |
shell: | |
""" | |
mkdir -p {sailfish_rsem_quant_path} | |
{sailfish_binary} quant -i {input.index} -l IU -1 {input.first} -2 {input.second} -p {sailfish_threads} -o {sailfish_rsem_quant_path} --useVBOpt | |
""" | |
rule run_kallisto_rsem_index: | |
input: | |
transcripts = {rsem_transcriptome_fullpath} | |
output: | |
kallisto_rsem_index_path | |
shell: | |
""" | |
mkdir -p {kallisto_rsem_index_path} | |
{kallisto_binary} index -i {kallisto_rsem_index_path}/{kallisto_rsem_index_prefix} -k 31 {input.transcripts} | |
""" | |
rule run_kallisto_rsem_quant: | |
input: | |
first = {rsem_first_mate_path}, | |
second = {rsem_second_mate_path}, | |
index = {kallisto_rsem_index_path} | |
output: | |
kallisto_rsem_quant_path | |
shell: | |
""" | |
mkdir -p {kallisto_rsem_quant_path} | |
{kallisto_binary} quant -i {kallisto_rsem_index_path}/{kallisto_rsem_index_prefix} -o {kallisto_rsem_quant_path} {input.first} {input.second} | |
""" | |
rule run_bowtie2_rsem_index: | |
input: | |
transcripts = {rsem_transcriptome_fullpath} | |
output: | |
bowtie2_rsem_index_path | |
threads: | |
bowtie2_threads | |
shell: | |
""" | |
mkdir -p {bowtie2_rsem_index_path} | |
{bowtie2_build_binary} -f {input.transcripts} {bowtie2_rsem_index_path}/{bowtie2_index_prefix} | |
""" | |
rule run_bowtie2_rsem_align: | |
input: | |
first = {rsem_first_mate_path}, | |
second = {rsem_second_mate_path}, | |
index = {bowtie2_rsem_index_path} | |
output: | |
bowtie2_rsem_sam | |
threads: | |
bowtie2_threads | |
shell: | |
""" | |
mkdir -p {bowtie2_rsem_align_path} | |
{bowtie2_binary} -q -k 100 --threads {bowtie2_threads} --no-mixed --no-discordant -x {input.index}/{bowtie2_index_prefix} -1 {input.first} -2 {input.second} -S {bowtie2_rsem_sam} | |
""" | |
rule run_bitseq_rsem_parse: | |
input: | |
sam = {bowtie2_rsem_sam}, | |
transcripts = {rsem_transcriptome_fullpath} | |
output: | |
bitseq_rsem_prob_path | |
shell: | |
""" | |
mkdir -p {bitseq_rsem_parse_path} | |
{bitseq_parse_binary} {input.sam} -o {bitseq_rsem_prob_path} --trSeqFile {input.transcripts} --trInfoFile {bitseq_rsem_trinfo_path} --uniform --verbose | |
""" | |
rule run_bitseq_rsem_quant: | |
input: | |
bitseq_rsem_prob_path | |
output: | |
bitseq_rsem_quant_path | |
shell: | |
""" | |
mkdir -p {bitseq_rsem_quant_path} | |
{bitseq_estimate_binary} -o {bitseq_rsem_quant_path}/{bitseq_prefix} {bitseq_rsem_prob_path} | |
""" | |
rule run_rsem_pipeline: | |
input: | |
sailfish = sailfish_rsem_quant_path, | |
kallisto = kallisto_rsem_quant_path, | |
bitseq = bitseq_rsem_quant_path |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment