mbreese · February 21, 2018 14:31 · mbreese · Jan 7, 2022
diff --git a/vep_vcf_worst_consequence.py b/vep_vcf_worst_consequence.py
 #!/usr/bin/env python

 import sys
 import itertools

 cons_key = "VEP_Consequence"
 impact_key = "VEP_IMPACT"
 gene_key = "VEP_SYMBOL"
 csn_key = "VEP_CSN"
 sift_key = "VEP_SIFT"
 polyphen_key = "VEP_PolyPhen"

 new_cons_key = "VEP_Worst_Consequence"
 new_impact_key = "VEP_Worst_Impact"
 new_gene_key = "VEP_Worst_Gene"
 new_csn_key = "VEP_Worst_CSN"
 new_sift_key = "VEP_Worst_SIFT"
 new_polyphen_key = "VEP_Worst_PolyPhen"

 impact_ranked = ['HIGH', 'MODERATE','LOW','MODIFIER']

 consequences = '''1 transcript_ablation
 3 splice_acceptor_variant
 3 splice_donor_variant
 4 stop_gained
 5 frameshift_variant
 6 stop_lost
 7 start_lost
 8 transcript_amplification
 10 inframe_insertion
 11 inframe_deletion
 12 missense_variant
 12 protein_altering_variant
 13 splice_region_variant
 14 incomplete_terminal_codon_variant
 15 start_retained_variant
 15 stop_retained_variant
 15 synonymous_variant
 16 coding_sequence_variant
 17 mature_miRNA_variant
 18 5_prime_UTR_variant
 19 3_prime_UTR_variant
 20 non_coding_transcript_exon_variant
 21 intron_variant
 22 NMD_transcript_variant
 23 non_coding_transcript_variant
 24 upstream_gene_variant
 25 downstream_gene_variant
 26 TFBS_ablation
 28 TFBS_amplification
 30 TF_binding_site_variant
 31 regulatory_region_ablation
 33 regulatory_region_amplification
 36 feature_elongation
 36 regulatory_region_variant
 37 feature_truncation
 38 intergenic_variant
 39 sequence_variant'''

 cons_ranked = [x[1] for x in [y.split(' ') for y in consequences.split('\n')]]

 def parse_pos(line):
    cols = line.strip().split('\t')
    outcols = cols[:7]
    
    info = cols[7]
    
    info_vals = info.split(';')
    info_out = []

    genes = []
    csns = []
    gene_idx = -1

    for ival in info_vals:
        if '=' not in ival:
            info_out.append(ival)
        else:
            info_out.append(ival)
            left, right = ival.split('=')
            if left == gene_key:
                genes = right.split(',')

            elif left == csn_key:
                csns = right.split(',')

            elif left == polyphen_key:
                worst = ''
                worst_val = 0
                
                spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))

                for s in spl:
                    s2 = s.replace('(', ' ').replace(')','')
                    if s2:
                        name, val = s2.split(' ')
                        val = float(val)
                        if val > worst_val:
                            worst = name
                            worst_val = val

                if worst:
                    info_out.append('%s=%s' % (new_polyphen_key, worst))

            elif left == sift_key:
                worst = ''
                worst_val = 1
                
                spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))

                for s in spl:
                    s2 = s.replace('(', ' ').replace(')','')
                    if s2:
                        name, val = s2.split(' ')
                        val = float(val)
                        if val < worst_val:
                            worst = name
                            worst_val = val

                if worst:
                    info_out.append('%s=%s' % (new_sift_key, worst))

            elif left == impact_key:
                worst = ''
                
                spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
                for c in impact_ranked:
                    if c in spl:
                        worst = c
                        break
                if worst:
                    info_out.append('%s=%s' % (new_impact_key, worst))

            elif left == cons_key:
                worst = ''
                
                spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
                for c in cons_ranked:
                    if c in spl:
                        worst = c
                        break
                if worst:
                    info_out.append('%s=%s' % (new_cons_key, worst))
                    for i,spl in enumerate(right.split(',')) :
                        for spl2 in spl.split('&'):
                            if worst in spl2:
                                gene_idx = i

    if genes and gene_idx > -1:
        if len(genes) > 1:
            info_out.append('%s=%s' % (new_gene_key, genes[gene_idx]))
        else:
            info_out.append('%s=%s' % (new_gene_key, genes[0]))

    if csns and gene_idx > -1:
        if len(csns) > 1:
            info_out.append('%s=%s' % (new_csn_key, csns[gene_idx]))
        else:
            info_out.append('%s=%s' % (new_csn_key, genes[0]))

    outcols.append(';'.join(info_out))
    outcols.extend(cols[8:])

    return outcols



 def parse_main(fin=sys.stdin, fout=sys.stdout):

    printed_header = False

    for line in fin:
        if line[0] == '#' and line[1] == '#':
            fout.write(line)
        elif line[0] == '#':
            if not printed_header:
                printed_header = True
                fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Consequence">\n' % (new_cons_key))
                fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Impact">\n' % (new_impact_key))
                fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Gene">\n' % (new_gene_key))
                fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst SIFT">\n' % (new_sift_key))
                fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst PolyPhen">\n' % (new_polyphen_key))
                fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst CSN">\n' % (new_csn_key))
            fout.write(line)
        else:
            fout.write('%s\n' % '\t'.join(parse_pos(line)))
    

 if __name__ == '__main__':
    parse_main()
	#!/usr/bin/env python

	import sys
	import itertools

	cons_key = "VEP_Consequence"
	impact_key = "VEP_IMPACT"
	gene_key = "VEP_SYMBOL"
	csn_key = "VEP_CSN"
	sift_key = "VEP_SIFT"
	polyphen_key = "VEP_PolyPhen"

	new_cons_key = "VEP_Worst_Consequence"
	new_impact_key = "VEP_Worst_Impact"
	new_gene_key = "VEP_Worst_Gene"
	new_csn_key = "VEP_Worst_CSN"
	new_sift_key = "VEP_Worst_SIFT"
	new_polyphen_key = "VEP_Worst_PolyPhen"

	impact_ranked = ['HIGH', 'MODERATE','LOW','MODIFIER']

	consequences = '''1 transcript_ablation
	3 splice_acceptor_variant
	3 splice_donor_variant
	4 stop_gained
	5 frameshift_variant
	6 stop_lost
	7 start_lost
	8 transcript_amplification
	10 inframe_insertion
	11 inframe_deletion
	12 missense_variant
	12 protein_altering_variant
	13 splice_region_variant
	14 incomplete_terminal_codon_variant
	15 start_retained_variant
	15 stop_retained_variant
	15 synonymous_variant
	16 coding_sequence_variant
	17 mature_miRNA_variant
	18 5_prime_UTR_variant
	19 3_prime_UTR_variant
	20 non_coding_transcript_exon_variant
	21 intron_variant
	22 NMD_transcript_variant
	23 non_coding_transcript_variant
	24 upstream_gene_variant
	25 downstream_gene_variant
	26 TFBS_ablation
	28 TFBS_amplification
	30 TF_binding_site_variant
	31 regulatory_region_ablation
	33 regulatory_region_amplification
	36 feature_elongation
	36 regulatory_region_variant
	37 feature_truncation
	38 intergenic_variant
	39 sequence_variant'''

	cons_ranked = [x[1] for x in [y.split(' ') for y in consequences.split('\n')]]

	def parse_pos(line):
	cols = line.strip().split('\t')
	outcols = cols[:7]

	info = cols[7]

	info_vals = info.split(';')
	info_out = []

	genes = []
	csns = []
	gene_idx = -1

	for ival in info_vals:
	if '=' not in ival:
	info_out.append(ival)
	else:
	info_out.append(ival)
	left, right = ival.split('=')
	if left == gene_key:
	genes = right.split(',')

	elif left == csn_key:
	csns = right.split(',')

	elif left == polyphen_key:
	worst = ''
	worst_val = 0

	spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))

	for s in spl:
	s2 = s.replace('(', ' ').replace(')','')
	if s2:
	name, val = s2.split(' ')
	val = float(val)
	if val > worst_val:
	worst = name
	worst_val = val

	if worst:
	info_out.append('%s=%s' % (new_polyphen_key, worst))

	elif left == sift_key:
	worst = ''
	worst_val = 1

	spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))

	for s in spl:
	s2 = s.replace('(', ' ').replace(')','')
	if s2:
	name, val = s2.split(' ')
	val = float(val)
	if val < worst_val:
	worst = name
	worst_val = val

	if worst:
	info_out.append('%s=%s' % (new_sift_key, worst))

	elif left == impact_key:
	worst = ''

	spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
	for c in impact_ranked:
	if c in spl:
	worst = c
	break
	if worst:
	info_out.append('%s=%s' % (new_impact_key, worst))

	elif left == cons_key:
	worst = ''

	spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
	for c in cons_ranked:
	if c in spl:
	worst = c
	break
	if worst:
	info_out.append('%s=%s' % (new_cons_key, worst))
	for i,spl in enumerate(right.split(',')) :
	for spl2 in spl.split('&'):
	if worst in spl2:
	gene_idx = i

	if genes and gene_idx > -1:
	if len(genes) > 1:
	info_out.append('%s=%s' % (new_gene_key, genes[gene_idx]))
	else:
	info_out.append('%s=%s' % (new_gene_key, genes[0]))

	if csns and gene_idx > -1:
	if len(csns) > 1:
	info_out.append('%s=%s' % (new_csn_key, csns[gene_idx]))
	else:
	info_out.append('%s=%s' % (new_csn_key, genes[0]))

	outcols.append(';'.join(info_out))
	outcols.extend(cols[8:])

	return outcols



	def parse_main(fin=sys.stdin, fout=sys.stdout):

	printed_header = False

	for line in fin:
	if line[0] == '#' and line[1] == '#':
	fout.write(line)
	elif line[0] == '#':
	if not printed_header:
	printed_header = True
	fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Consequence">\n' % (new_cons_key))
	fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Impact">\n' % (new_impact_key))
	fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Gene">\n' % (new_gene_key))
	fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst SIFT">\n' % (new_sift_key))
	fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst PolyPhen">\n' % (new_polyphen_key))
	fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst CSN">\n' % (new_csn_key))
	fout.write(line)
	else:
	fout.write('%s\n' % '\t'.join(parse_pos(line)))


	if __name__ == '__main__':
	parse_main()