Skip to content

Instantly share code, notes, and snippets.

@dmckean
Created May 9, 2018 21:57
Show Gist options
  • Save dmckean/1fb9455c7cbcb8703f461f8e2a819025 to your computer and use it in GitHub Desktop.
Save dmckean/1fb9455c7cbcb8703f461f8e2a819025 to your computer and use it in GitHub Desktop.
Tool for binning and tabulating integer FORMAT data from VCF
import hts
import docopt
from sequtils import toSeq
from strutils import parseInt, join
let doc = """
Bin and tabulate FORMAT data for each VCF record.
Usage:
tabulate_fmt (-h | --help)
tabulate_fmt [options] FMT_ID [INPUT_VCF]
Options:
-h --help Show this screen.
-b b --begin=b Lowest interval for binned output [default: 0].
-s s --step=s Bin width for binned output [default: 25].
-e e --end=e Highest interval for binned output [default: 100].
For example, `./tab -s 25 DP in.bcf` will output a tab-delimited table of
per-row DP counts binned into adjacent intervals 25 wide.
Example output (pretty printed):
#CHROM POS ID REF ALT DP_missing (,0) [0,25) [25,50) [50,75) [75,100) [100,)
chr1 51765 . AC A 2 0 8 15 89 65 21
chr3 651045 . A G 1 0 6 56 84 45 8
chr12 10075 . CCCTAA C 0 0 4 15 83 58 40
"""
# Argument parsing and checks
let args = docopt(doc, optionsFirst=true)
let rMin = parseInt($args["--begin"])
let rStep = parseInt($args["--step"])
let rMax = parseInt($args["--end"])
let fmt_id = $args["FMT_ID"]
var in_name = $args["INPUT_VCF"]
if(args["INPUT_VCF"].kind == vkNone):
in_name = "-"
assert rMin <= rMax
assert rStep > 0
assert fmt_id is string
assert len(fmt_id) > 0
assert in_name is string
assert len(in_name) > 0
# Determine bin count and setup binning proc
let nBins = (rMax - rMin) div rStep + (if((rMax - rMin) mod rStep > 0): 1 else: 0)
proc bin_int(num: int32): int =
result = if num == low(int32): 0
elif num < rMin: 1
elif num >= rMax: nBins + 2
else: ((num - rMin) div rStep) + 2
# Print header line
iterator interval_labels(): string =
yield "(,"& $rMin&")"
var x = rMin
while x < rMax:
yield "["& $x&","& $(min(x+rStep, rMax))&")"
x += rStep
yield "["& $rMax&",)"
let interval_seq = toSeq(interval_labels())
echo "#CHROM\tPOS\tID\tREF\tALT\tmissing\t"&interval_seq.join("\t")
# Parse VCF and tabulate each variant
var v: VCF
assert open(v, in_name)
for rec in v:
var fmt_bins = newSeq[int](nBins + 3)
var fmt_vals = newSeq[int32](len(v.samples))
# Read in values for FORMAT, check for errors
let fmt_status = rec.format.ints(fmt_id, fmt_vals)
case fmt_status:
of Status.NotFound: continue
of Status.UndefinedTag:
echo fmt_id, "not found in the Header"
of Status.UnexpectedType:
echo fmt_id, "is not an int"
else: discard
assert fmt_status == Status.OK
# Tabulate and print record with interval counts
for i in fmt_vals:
fmt_bins[bin_int(i)] += 1
let output = @[rec.CHROM, $rec.POS, rec.ID, rec.REF, rec.ALT.join(","), fmt_bins.join("\t")]
echo output.join("\t")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment