Created
May 9, 2018 21:57
-
-
Save dmckean/1fb9455c7cbcb8703f461f8e2a819025 to your computer and use it in GitHub Desktop.
Tool for binning and tabulating integer FORMAT data from VCF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hts | |
import docopt | |
from sequtils import toSeq | |
from strutils import parseInt, join | |
let doc = """ | |
Bin and tabulate FORMAT data for each VCF record. | |
Usage: | |
tabulate_fmt (-h | --help) | |
tabulate_fmt [options] FMT_ID [INPUT_VCF] | |
Options: | |
-h --help Show this screen. | |
-b b --begin=b Lowest interval for binned output [default: 0]. | |
-s s --step=s Bin width for binned output [default: 25]. | |
-e e --end=e Highest interval for binned output [default: 100]. | |
For example, `./tab -s 25 DP in.bcf` will output a tab-delimited table of | |
per-row DP counts binned into adjacent intervals 25 wide. | |
Example output (pretty printed): | |
#CHROM POS ID REF ALT DP_missing (,0) [0,25) [25,50) [50,75) [75,100) [100,) | |
chr1 51765 . AC A 2 0 8 15 89 65 21 | |
chr3 651045 . A G 1 0 6 56 84 45 8 | |
chr12 10075 . CCCTAA C 0 0 4 15 83 58 40 | |
""" | |
# Argument parsing and checks | |
let args = docopt(doc, optionsFirst=true) | |
let rMin = parseInt($args["--begin"]) | |
let rStep = parseInt($args["--step"]) | |
let rMax = parseInt($args["--end"]) | |
let fmt_id = $args["FMT_ID"] | |
var in_name = $args["INPUT_VCF"] | |
if(args["INPUT_VCF"].kind == vkNone): | |
in_name = "-" | |
assert rMin <= rMax | |
assert rStep > 0 | |
assert fmt_id is string | |
assert len(fmt_id) > 0 | |
assert in_name is string | |
assert len(in_name) > 0 | |
# Determine bin count and setup binning proc | |
let nBins = (rMax - rMin) div rStep + (if((rMax - rMin) mod rStep > 0): 1 else: 0) | |
proc bin_int(num: int32): int = | |
result = if num == low(int32): 0 | |
elif num < rMin: 1 | |
elif num >= rMax: nBins + 2 | |
else: ((num - rMin) div rStep) + 2 | |
# Print header line | |
iterator interval_labels(): string = | |
yield "(,"& $rMin&")" | |
var x = rMin | |
while x < rMax: | |
yield "["& $x&","& $(min(x+rStep, rMax))&")" | |
x += rStep | |
yield "["& $rMax&",)" | |
let interval_seq = toSeq(interval_labels()) | |
echo "#CHROM\tPOS\tID\tREF\tALT\tmissing\t"&interval_seq.join("\t") | |
# Parse VCF and tabulate each variant | |
var v: VCF | |
assert open(v, in_name) | |
for rec in v: | |
var fmt_bins = newSeq[int](nBins + 3) | |
var fmt_vals = newSeq[int32](len(v.samples)) | |
# Read in values for FORMAT, check for errors | |
let fmt_status = rec.format.ints(fmt_id, fmt_vals) | |
case fmt_status: | |
of Status.NotFound: continue | |
of Status.UndefinedTag: | |
echo fmt_id, "not found in the Header" | |
of Status.UnexpectedType: | |
echo fmt_id, "is not an int" | |
else: discard | |
assert fmt_status == Status.OK | |
# Tabulate and print record with interval counts | |
for i in fmt_vals: | |
fmt_bins[bin_int(i)] += 1 | |
let output = @[rec.CHROM, $rec.POS, rec.ID, rec.REF, rec.ALT.join(","), fmt_bins.join("\t")] | |
echo output.join("\t") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment