Created
February 21, 2018 14:31
-
-
Save mbreese/03acfa65b72f68cafa05df41671ebfb9 to your computer and use it in GitHub Desktop.
Given a VCF file annotated with VEP (and expanded), calculate the worst consequence, gene, SIFT, PolyPhen, etc...
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import itertools | |
cons_key = "VEP_Consequence" | |
impact_key = "VEP_IMPACT" | |
gene_key = "VEP_SYMBOL" | |
csn_key = "VEP_CSN" | |
sift_key = "VEP_SIFT" | |
polyphen_key = "VEP_PolyPhen" | |
new_cons_key = "VEP_Worst_Consequence" | |
new_impact_key = "VEP_Worst_Impact" | |
new_gene_key = "VEP_Worst_Gene" | |
new_csn_key = "VEP_Worst_CSN" | |
new_sift_key = "VEP_Worst_SIFT" | |
new_polyphen_key = "VEP_Worst_PolyPhen" | |
impact_ranked = ['HIGH', 'MODERATE','LOW','MODIFIER'] | |
consequences = '''1 transcript_ablation | |
3 splice_acceptor_variant | |
3 splice_donor_variant | |
4 stop_gained | |
5 frameshift_variant | |
6 stop_lost | |
7 start_lost | |
8 transcript_amplification | |
10 inframe_insertion | |
11 inframe_deletion | |
12 missense_variant | |
12 protein_altering_variant | |
13 splice_region_variant | |
14 incomplete_terminal_codon_variant | |
15 start_retained_variant | |
15 stop_retained_variant | |
15 synonymous_variant | |
16 coding_sequence_variant | |
17 mature_miRNA_variant | |
18 5_prime_UTR_variant | |
19 3_prime_UTR_variant | |
20 non_coding_transcript_exon_variant | |
21 intron_variant | |
22 NMD_transcript_variant | |
23 non_coding_transcript_variant | |
24 upstream_gene_variant | |
25 downstream_gene_variant | |
26 TFBS_ablation | |
28 TFBS_amplification | |
30 TF_binding_site_variant | |
31 regulatory_region_ablation | |
33 regulatory_region_amplification | |
36 feature_elongation | |
36 regulatory_region_variant | |
37 feature_truncation | |
38 intergenic_variant | |
39 sequence_variant''' | |
cons_ranked = [x[1] for x in [y.split(' ') for y in consequences.split('\n')]] | |
def parse_pos(line): | |
cols = line.strip().split('\t') | |
outcols = cols[:7] | |
info = cols[7] | |
info_vals = info.split(';') | |
info_out = [] | |
genes = [] | |
csns = [] | |
gene_idx = -1 | |
for ival in info_vals: | |
if '=' not in ival: | |
info_out.append(ival) | |
else: | |
info_out.append(ival) | |
left, right = ival.split('=') | |
if left == gene_key: | |
genes = right.split(',') | |
elif left == csn_key: | |
csns = right.split(',') | |
elif left == polyphen_key: | |
worst = '' | |
worst_val = 0 | |
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')])) | |
for s in spl: | |
s2 = s.replace('(', ' ').replace(')','') | |
if s2: | |
name, val = s2.split(' ') | |
val = float(val) | |
if val > worst_val: | |
worst = name | |
worst_val = val | |
if worst: | |
info_out.append('%s=%s' % (new_polyphen_key, worst)) | |
elif left == sift_key: | |
worst = '' | |
worst_val = 1 | |
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')])) | |
for s in spl: | |
s2 = s.replace('(', ' ').replace(')','') | |
if s2: | |
name, val = s2.split(' ') | |
val = float(val) | |
if val < worst_val: | |
worst = name | |
worst_val = val | |
if worst: | |
info_out.append('%s=%s' % (new_sift_key, worst)) | |
elif left == impact_key: | |
worst = '' | |
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')])) | |
for c in impact_ranked: | |
if c in spl: | |
worst = c | |
break | |
if worst: | |
info_out.append('%s=%s' % (new_impact_key, worst)) | |
elif left == cons_key: | |
worst = '' | |
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')])) | |
for c in cons_ranked: | |
if c in spl: | |
worst = c | |
break | |
if worst: | |
info_out.append('%s=%s' % (new_cons_key, worst)) | |
for i,spl in enumerate(right.split(',')) : | |
for spl2 in spl.split('&'): | |
if worst in spl2: | |
gene_idx = i | |
if genes and gene_idx > -1: | |
if len(genes) > 1: | |
info_out.append('%s=%s' % (new_gene_key, genes[gene_idx])) | |
else: | |
info_out.append('%s=%s' % (new_gene_key, genes[0])) | |
if csns and gene_idx > -1: | |
if len(csns) > 1: | |
info_out.append('%s=%s' % (new_csn_key, csns[gene_idx])) | |
else: | |
info_out.append('%s=%s' % (new_csn_key, genes[0])) | |
outcols.append(';'.join(info_out)) | |
outcols.extend(cols[8:]) | |
return outcols | |
def parse_main(fin=sys.stdin, fout=sys.stdout): | |
printed_header = False | |
for line in fin: | |
if line[0] == '#' and line[1] == '#': | |
fout.write(line) | |
elif line[0] == '#': | |
if not printed_header: | |
printed_header = True | |
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Consequence">\n' % (new_cons_key)) | |
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Impact">\n' % (new_impact_key)) | |
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Gene">\n' % (new_gene_key)) | |
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst SIFT">\n' % (new_sift_key)) | |
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst PolyPhen">\n' % (new_polyphen_key)) | |
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst CSN">\n' % (new_csn_key)) | |
fout.write(line) | |
else: | |
fout.write('%s\n' % '\t'.join(parse_pos(line))) | |
if __name__ == '__main__': | |
parse_main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: this doesn't work right... it needs to split the consequences and process each value separately for each transcript (these are comma separated)