Created
May 9, 2018 21:58
-
-
Save dmckean/9b641956fde4bd710e0be559d701f6fd to your computer and use it in GitHub Desktop.
bcftools plugin for binning and tabulating integer FORMAT data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <assert.h> | |
#include <getopt.h> | |
#include <unistd.h> | |
#include <inttypes.h> | |
#include <htslib/vcf.h> | |
#include "bcftools.h" | |
#include "version.c" | |
int nsmpl, rMin, rStep, rMax, missing, minima, maxima; | |
size_t nbins; | |
char * fmt_str; | |
bcf_hdr_t *hdr; | |
const char *about(void) | |
{ | |
return "Bins and tabulates FORMAT data for each VCF record.\n"; | |
} | |
const char *usage(void) { | |
return | |
"\n" | |
"About: Bin and tabulate integer FORMAT fields for each VCF record..\n" | |
"Usage: bcftools +tabulate <in.bcf/.vcf.gz> [General Options] -- [Plugin Options]\n" | |
"\n" | |
"Options:\n" | |
" run \"bcftools plugin\" for a list of common options\n" | |
"\n" | |
"Plugin options:\n" | |
" -f --format FMT_ID ID for format field (required)\n" | |
"\n" | |
" -b, --begin BEGIN_INT Lowest interval for binned output [0]\n" | |
" -s, --step STEP_INT Bin width for binned output [1]\n" | |
" -e --end END_INT Max interval for binned output [100]\n" | |
"\n" | |
" --no-missing Ignore missing values (e.g. non numeric values, like '.')\n" | |
" --no-minima Ignores values below `BEGIN_INT`\n" | |
" --no-maxima Ignores values greater than or equal to `END_INT`\n" | |
"\n" | |
"Example:\n" | |
" bcftools +tabulate in.bcf -- -f DP\n" | |
"\n"; | |
} | |
int init(int argc, char **argv, bcf_hdr_t *in_hdr, bcf_hdr_t *out_hdr) | |
{ | |
hdr = in_hdr; | |
nsmpl = bcf_hdr_nsamples(hdr); | |
rMin = 0; | |
rStep = 1; | |
rMax = 100; | |
missing=1; | |
minima=1; | |
maxima=1; | |
int c; | |
static struct option loptions[] = | |
{ | |
{"format", required_argument, NULL, 'f'}, | |
{"begin", optional_argument, NULL, 'b'}, | |
{"step", optional_argument, NULL, 's'}, | |
{"end", optional_argument, NULL, 'e'}, | |
{"no-missing", no_argument, NULL, 'm'}, | |
{"no-minima", no_argument, NULL, 'n'}, | |
{"no-maxima", no_argument, NULL, 'p'}, | |
{NULL, no_argument, NULL, 0} | |
}; | |
while ((c = getopt_long(argc, argv, "f:b:s:e:amnp", loptions, NULL)) >= 0) { | |
switch (c) { | |
case 'f': | |
fmt_str = optarg; | |
case 'b': | |
rMin = (int) strtol(optarg, NULL, 10); | |
break; | |
case 's': | |
rStep = (int) strtol(optarg, NULL, 10); | |
break; | |
case 'e': | |
rMax = (int) strtol(optarg, NULL, 10); | |
break; | |
case 'm': | |
missing=0; | |
break; | |
case 'n': | |
minima=0; | |
break; | |
case 'p': | |
maxima=0; | |
break; | |
default: | |
fprintf(stderr, "%s\n", usage()); | |
exit(EXIT_FAILURE); | |
} | |
} | |
if( fmt_str == NULL ) { | |
fprintf(stderr, "[error]: missing --format FMT_STR\n"); | |
exit(EXIT_FAILURE); | |
} | |
if( rMax < rMin ) { | |
fprintf(stderr, "[error]: END_INT: %d must be greater than or equal to BEGIN_INT: %d\n", rMax, rMin); | |
exit(EXIT_FAILURE); | |
} | |
if( rStep < 1 ) { | |
fprintf(stderr, "[error]: STEP_INT: %d must be greater than 0.\n", rStep); | |
exit(EXIT_FAILURE); | |
} | |
int fmt_type = bcf_hdr_id2type(hdr, BCF_HL_FMT, bcf_hdr_id2int(hdr, BCF_DT_ID, (const char*) fmt_str)); | |
if ( fmt_type != BCF_HT_INT ) { | |
char fmt_type_str[10] = ""; | |
if(fmt_type == BCF_HT_FLAG) { | |
strcat(fmt_type_str, "flag"); | |
} else if ( fmt_type == BCF_HT_REAL ) { | |
strcat(fmt_type_str, "float"); | |
} else if(fmt_type == BCF_HT_STR) { | |
strcat(fmt_type_str, "string"); | |
} else { | |
strcat(fmt_type_str, "unknown"); | |
} | |
fprintf(stderr, "[error]: Invalid type '%s' for FORMAT ID=%s\n", fmt_type_str, fmt_str); | |
exit(EXIT_FAILURE); | |
} | |
int r = rMax - rMin; | |
nbins = (unsigned int) r / rStep + ((r % rStep) > 0); | |
printf("#CHROM\tPOS\tID\tREF\tALT"); | |
if(missing == 1) printf("\t%s_missing", fmt_str); | |
if(minima == 1) printf("\t(,%d)", rMin); | |
if(rStep == 1) { | |
for(int i = rMin; i < rMax; i++) printf("\t%d", i); | |
} else { | |
for(int i = rMin; i < rMax; i += rStep) printf("\t[%d,%d)", i, ((i + rStep < rMax) ? i + rStep : rMax)); | |
} | |
if(maxima == 1) printf("\t[%d,)", rMax); | |
printf("\n"); | |
return 1; | |
} | |
void fprintf_rec(bcf1_t *rec, FILE *stream) { | |
fprintf(stream, "%s\t%d\t%s\t%s\t%s", bcf_seqname(hdr, rec), rec->pos + 1, rec->d.id, rec->d.allele[0], rec->d.allele[1]); | |
for(int i = 2; i < rec->n_allele; i++) fprintf(stream, ",%s", rec->d.allele[i]); | |
} | |
void process_int(bcf1_t *rec, int *val_totals) { | |
int nval, arr_len = 0; | |
int32_t *val_arr = NULL; | |
nval = bcf_get_format_int32(hdr, rec, fmt_str, (void**)(&val_arr), &arr_len); | |
if( nval != nsmpl ) { | |
if (nval < nsmpl) {fprintf(stderr, "Values missing on "); } | |
else {fprintf(stderr, "Extra values on ");} | |
fprintf_rec(rec, stderr); | |
fprintf(stderr, "\n %d samples found, %d values for FMT ID=%s found\n", nsmpl, nval, fmt_str); | |
exit(EXIT_FAILURE); | |
} | |
int fmt_val; | |
for(int i = 0; i < nsmpl; i++) { | |
fmt_val = val_arr[i]; | |
if (fmt_val == bcf_int32_missing) { | |
if (missing == 1) { | |
fmt_val = 0; | |
} else { | |
continue; | |
} | |
} else { | |
if(fmt_val >= rMax) { | |
if (maxima) { | |
fmt_val = (int) nbins + missing + minima; | |
} else { | |
continue; | |
} | |
} | |
else if (fmt_val < rMin) { | |
if (minima) { | |
fmt_val = 0; | |
} else { | |
continue; | |
} | |
} else { | |
fmt_val = (fmt_val - rMin) / rStep + missing + minima; | |
} | |
} | |
val_totals[fmt_val]++; | |
} | |
free(val_arr); | |
} | |
// Called for each VCF record, return NULL to suppress the output | |
bcf1_t *process(bcf1_t *rec) | |
{ | |
bcf_unpack(rec, BCF_UN_FMT); | |
int *val_totals = calloc(missing + minima + nbins + maxima, sizeof(int)); | |
process_int(rec, val_totals); | |
fprintf_rec(rec, stdout); | |
for(int i = 0; i < (missing + minima + nbins + maxima); i++) printf("\t%d", val_totals[i]); | |
printf("\n"); | |
free(val_totals); | |
return NULL; | |
} | |
void destroy(void) { | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment