Skip to content

Instantly share code, notes, and snippets.

Created June 9, 2013 20:49

Revisions

  1. @invalid-email-address Anonymous created this gist Jun 9, 2013.
    100 changes: 100 additions & 0 deletions pipeline.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,100 @@
    import os
    import re
    from ruffus import *
    import subprocess
    import sqlite3

    parser = cmdline.get_argparse(description='Midlands-1 analysis pipeline')

    parser.add_argument("-s", "--sqlite_db", type=str, help="Name and path of SQLite3 database")
    parser.add_argument("-c", "--sql_command", type=str, help="SQL command to return rows")

    options = parser.parse_args()

    # optional logger which can be passed to ruffus tasks
    logger, logger_mutex = cmdline.setup_logging (__name__, options.log_file, options.verbose)

    #_____________________________________________________________________________________
    # pipelined functions go here
    #_____________________________________________________________________________________

    def cmd_exists(cmd):
    try:
    ret = subprocess.call([cmd],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return True
    except OSError, e:
    return 0

    binaries = dict((b, b) for b in ['bwa', 'samtools'])

    missing_files = [binary_file for binary_file in binaries if not cmd_exists(binary_file)]
    if missing_files:
    raise SystemExit("%s not on path" % (", ".join(missing_files)))

    def read_run_details_sqlite3(fn, cmd):
    conn = sqlite3.connect(fn)
    conn.row_factory=sqlite3.Row
    c = conn.cursor()
    c.execute(cmd)
    rows = c.fetchall()
    return rows

    runs = read_run_details_sqlite3(options.sqlite_db, options.sql_command)
    print "%d runs selected " % (len(runs),)

    input_prefix = '/mnt/phatso/nick/alnQC'

    usefulhash = {}

    def make_input_files(outfile):
    files = []
    for run in runs:
    files.append((
    ('%s/%s/%s_1_trimmed.fastq.gz' % (input_prefix, run['sequencing_plate'], run['label']),
    '%s/%s/%s_2_trimmed.fastq.gz' % (input_prefix, run['sequencing_plate'], run['label'])),
    outfile % run['label'],
    run['label']
    ))
    usefulhash[outfile % run['label']] = run['label']
    return files

    initial_input_files = make_input_files('%s.sorted.bam')
    denovo_assembly_files = make_input_files('%s.assembly')

    @files(initial_input_files)
    def align_to_les(infiles, outfile, label):
    os.system("""%s mem -t16 refs/NC_011770.fna %s %s |
    %s view -bS - |
    %s sort -f -@ 8 - %s""" % (binaries['bwa'],
    infiles[0], infiles[1],
    binaries['samtools'],
    binaries['samtools'],
    outfile))

    @merge(align_to_les, "variants.vcf")
    #@merge(['CF40-P31.sorted.bam', 'CF60-P44.sorted.bam'], "variants.vcf")
    def generate_vcf(infiles, outfile):
    fh = open("labels.txt", "w")
    for fn in infiles:
    print >>fh, usefulhash[fn]
    fh.close()
    cmd = "%s mpileup -q 50 -f refs/NC_011770.fna %s \
    | java -jar bin/VarScan.jar mpileup2snp \
    --min-coverage 2 --min-var-freq 0.9 --p-value 0.005 \
    --output-vcf --vcf-sample-list labels.txt > %s" % (binaries['samtools'],
    " ".join(infiles), outfile)
    os.system(cmd)

    @transform(align_to_les, suffix(".sorted.bam"), r"\1_assembly/contigs.fa")
    def velvet_assembly(infile, outfile):
    label = usefulhash[infile]
    os.system("bin/velvet_1.2.10/velveth %s_assembly 63 -bam %s" % (label, infile))
    os.system("bin/velvet_1.2.10/velvetg %s_assembly -exp_cov auto -cov_cutoff auto" % (label))

    @transform(velvet_assembly, suffix("contigs.fa"), "mlst.json")
    def mlst_profile(infile, outfile):
    os.system("curl -o %s -F file=@%s -F name=Pseudomonas_aeruginosa http://192.168.1.31:8001/mlst/?format=json" % (outfile, infile))

    cmdline.run(options)