Last active
February 10, 2025 17:50
-
-
Save mbreese/ec3727004d72c4661783ce26e0697cea to your computer and use it in GitHub Desktop.
Convert SIFT scores to VCF format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
# Converts SIFT format files to VCF format for easier annotation lookup | |
# | |
# Sift input gz files are tab-delimited gzip compressed files. Coordinates are 1-based, so | |
# this is a pretty easy conversion. | |
# | |
import os | |
import sys | |
import gzip | |
#see https://useast.ensembl.org/info/genome/variation/prediction/protein_function.html | |
deleterious_threshold = 0.05 | |
def convert(fnames): | |
sys.stdout.write('##fileformat=VCFv4.2\n') | |
sys.stdout.write('##INFO=<ID=SIFT_score,Number=A,Type=Float,Description="SIFT score">\n') | |
sys.stdout.write('##INFO=<ID=SIFT_pred,Number=A,Type=String,Description="SIFT prediction">\n') | |
sys.stdout.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n') | |
for fname in fnames: | |
chrom = "chr%s" % os.path.basename(fname).replace(".gz", "") | |
if chrom == "chrMT": | |
chrom = "chrM" | |
sys.stderr.write("Converting %s => %s\n" % (fname, chrom)) | |
f = gzip.open(fname, 'rt') | |
col_idx = None | |
last_pos_ref_alt = None | |
last_sift = None | |
for line in f: | |
cols = line.strip('\n').split('\t') | |
if not col_idx: | |
col_idx = {} | |
for i,v in enumerate(cols): | |
col_idx[v] = i | |
continue | |
pos = cols[col_idx['#Position']] | |
ref = cols[col_idx['Ref_allele']] | |
alt = cols[col_idx['New_allele']] | |
sift = cols[col_idx['SIFT_score']] | |
k = (pos, ref, alt) | |
if last_pos_ref_alt != k: | |
if last_pos_ref_alt: | |
if last_pos_ref_alt[1] != last_pos_ref_alt[2] and last_sift: | |
write_base(chrom, last_pos_ref_alt[0], last_pos_ref_alt[1], last_pos_ref_alt[2], last_sift) | |
last_pos_ref_alt = k | |
last_sift = None | |
if not last_sift and sift != 'NA': | |
last_sift = sift | |
f.close() | |
def write_base(chrom, pos, ref, alt, sift): | |
pred = 'tolerated' | |
if float(sift) < deleterious_threshold: | |
pred = 'deleterious' | |
sys.stdout.write('%s\t%s\t.\t%s\t%s\t.\t.\tSIFT_score=%s;SIFT_pred=%s\n' % (chrom, pos, ref, alt, sift, pred)) | |
if __name__ == '__main__': | |
if len(sys.argv) < 2 or not sys.argv[1]: | |
sys.stderr.write("Missing argument: filename.gz") | |
sys.exit(1) | |
fnames = [] | |
for arg in sys.argv[1:]: | |
fnames.append(arg) | |
convert(fnames) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment