Skip to content

Instantly share code, notes, and snippets.

@mbreese
Last active February 10, 2025 17:50
Show Gist options
  • Save mbreese/ec3727004d72c4661783ce26e0697cea to your computer and use it in GitHub Desktop.
Save mbreese/ec3727004d72c4661783ce26e0697cea to your computer and use it in GitHub Desktop.
Convert SIFT scores to VCF format
#!/usr/bin/env python3
#
# Converts SIFT format files to VCF format for easier annotation lookup
#
# Sift input gz files are tab-delimited gzip compressed files. Coordinates are 1-based, so
# this is a pretty easy conversion.
#
import os
import sys
import gzip
#see https://useast.ensembl.org/info/genome/variation/prediction/protein_function.html
deleterious_threshold = 0.05
def convert(fnames):
sys.stdout.write('##fileformat=VCFv4.2\n')
sys.stdout.write('##INFO=<ID=SIFT_score,Number=A,Type=Float,Description="SIFT score">\n')
sys.stdout.write('##INFO=<ID=SIFT_pred,Number=A,Type=String,Description="SIFT prediction">\n')
sys.stdout.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
for fname in fnames:
chrom = "chr%s" % os.path.basename(fname).replace(".gz", "")
if chrom == "chrMT":
chrom = "chrM"
sys.stderr.write("Converting %s => %s\n" % (fname, chrom))
f = gzip.open(fname, 'rt')
col_idx = None
last_pos_ref_alt = None
last_sift = None
for line in f:
cols = line.strip('\n').split('\t')
if not col_idx:
col_idx = {}
for i,v in enumerate(cols):
col_idx[v] = i
continue
pos = cols[col_idx['#Position']]
ref = cols[col_idx['Ref_allele']]
alt = cols[col_idx['New_allele']]
sift = cols[col_idx['SIFT_score']]
k = (pos, ref, alt)
if last_pos_ref_alt != k:
if last_pos_ref_alt:
if last_pos_ref_alt[1] != last_pos_ref_alt[2] and last_sift:
write_base(chrom, last_pos_ref_alt[0], last_pos_ref_alt[1], last_pos_ref_alt[2], last_sift)
last_pos_ref_alt = k
last_sift = None
if not last_sift and sift != 'NA':
last_sift = sift
f.close()
def write_base(chrom, pos, ref, alt, sift):
pred = 'tolerated'
if float(sift) < deleterious_threshold:
pred = 'deleterious'
sys.stdout.write('%s\t%s\t.\t%s\t%s\t.\t.\tSIFT_score=%s;SIFT_pred=%s\n' % (chrom, pos, ref, alt, sift, pred))
if __name__ == '__main__':
if len(sys.argv) < 2 or not sys.argv[1]:
sys.stderr.write("Missing argument: filename.gz")
sys.exit(1)
fnames = []
for arg in sys.argv[1:]:
fnames.append(arg)
convert(fnames)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment