June 9, 2013 20:49 · Jun 9, 2013
diff --git a/pipeline.py b/pipeline.py
@@ -0,0 +1,100 @@
+import os
+import re
+from ruffus import *
+import subprocess
+import sqlite3
+
+parser = cmdline.get_argparse(description='Midlands-1 analysis pipeline')
+
+parser.add_argument("-s", "--sqlite_db", type=str, help="Name and path of SQLite3 database")
+parser.add_argument("-c", "--sql_command", type=str, help="SQL command to return rows")
+
+options = parser.parse_args()
+
+#  optional logger which can be passed to ruffus tasks
+logger, logger_mutex = cmdline.setup_logging (__name__, options.log_file, options.verbose)
+
+#_____________________________________________________________________________________
+#   pipelined functions go here
+#_____________________________________________________________________________________
+
+def cmd_exists(cmd):
+    try:
+        ret = subprocess.call([cmd], 
+            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return True
+    except OSError, e:
+        return 0
+
+binaries = dict((b, b) for b in ['bwa', 'samtools'])
+
+missing_files = [binary_file for binary_file in binaries if not cmd_exists(binary_file)]
+if missing_files:
+	raise SystemExit("%s not on path" % (", ".join(missing_files)))
+
+def read_run_details_sqlite3(fn, cmd):
+	conn = sqlite3.connect(fn)
+	conn.row_factory=sqlite3.Row
+	c = conn.cursor()
+	c.execute(cmd)
+	rows = c.fetchall()
+	return rows
+
+runs = read_run_details_sqlite3(options.sqlite_db, options.sql_command)
+print "%d runs selected " % (len(runs),)
+
+input_prefix = '/mnt/phatso/nick/alnQC'
+
+usefulhash = {}
+
+def make_input_files(outfile):
+    files = []
+    for run in runs:
+        files.append((
+            ('%s/%s/%s_1_trimmed.fastq.gz' % (input_prefix, run['sequencing_plate'], run['label']),
+             '%s/%s/%s_2_trimmed.fastq.gz' % (input_prefix, run['sequencing_plate'], run['label'])),
+             outfile % run['label'],
+             run['label']
+        ))
+        usefulhash[outfile % run['label']] = run['label']
+    return files
+
+initial_input_files = make_input_files('%s.sorted.bam')
+denovo_assembly_files = make_input_files('%s.assembly')
+
+@files(initial_input_files)
+def align_to_les(infiles, outfile, label):
+    os.system("""%s mem -t16 refs/NC_011770.fna %s %s |
+                 %s view -bS - |
+                 %s sort -f -@ 8 - %s""" % (binaries['bwa'],
+                                    infiles[0], infiles[1],
+                                    binaries['samtools'],
+                                    binaries['samtools'],
+                                    outfile))
+
+@merge(align_to_les, "variants.vcf")
+#@merge(['CF40-P31.sorted.bam', 'CF60-P44.sorted.bam'], "variants.vcf")
+def generate_vcf(infiles, outfile):
+    fh = open("labels.txt", "w")
+    for fn in infiles:
+        print >>fh, usefulhash[fn]
+    fh.close()
+    cmd = "%s mpileup -q 50 -f refs/NC_011770.fna %s \
+        | java -jar bin/VarScan.jar mpileup2snp \
+          --min-coverage 2 --min-var-freq 0.9 --p-value 0.005 \
+          --output-vcf --vcf-sample-list labels.txt > %s" % (binaries['samtools'],
+                " ".join(infiles), outfile)
+    os.system(cmd)
+
+@transform(align_to_les, suffix(".sorted.bam"), r"\1_assembly/contigs.fa")
+def velvet_assembly(infile, outfile):
+    label = usefulhash[infile]
+    os.system("bin/velvet_1.2.10/velveth %s_assembly 63 -bam %s" % (label, infile))
+    os.system("bin/velvet_1.2.10/velvetg %s_assembly -exp_cov auto -cov_cutoff auto" % (label))
+
+@transform(velvet_assembly, suffix("contigs.fa"), "mlst.json")
+def mlst_profile(infile, outfile):
+    os.system("curl -o %s -F file=@%s -F name=Pseudomonas_aeruginosa http://192.168.1.31:8001/mlst/?format=json" % (outfile, infile))
+
+cmdline.run(options)
+