Last active
March 29, 2024 20:49
-
-
Save budsonjelmont/08db8ded1c59f1a384828b204ef93648 to your computer and use it in GitHub Desktop.
Batch import gnomAD VCFs into an Omics annotation store
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3
# --- Import configuration ---------------------------------------------------
# gnomAD call set and release to import.
dataset = 'genomes'
semver = '2.1.1'
# IAM role passed to the import jobs (fill in the bracketed placeholders).
import_arn = 'arn:aws:iam::[account_id]:role/service-role/[role_name]'
# Destination annotation store; the version name is derived from the dataset
# and release with dots replaced by underscores, e.g. 'genomes_v2_1_1'.
annot_store_name = 'gnomad_grch37'
annot_store_version_name = f"{dataset}_v{str(semver).replace('.', '_')}"
# S3 prefix under which the per-chromosome VCFs for this release are read.
base_s3_uri = f's3://gnomad-public-us-east-1/release/{semver}/vcf/{dataset}'
# Autosomes 1-22 plus the sex chromosomes.
chroms_to_import = [str(c) for c in range(1, 23)] + ['X', 'Y']
# Naming format for 4.0 VCFs
# vcfs = [f'gnomad.{dataset}.v{semver}.sites.chr{chrom}.vcf.bgz' for chrom in chroms_to_import]
# Naming format for 2.1.1 VCFs
vcfs = [f'gnomad.{dataset}.r{semver}.sites.{chrom}.vcf.bgz' for chrom in chroms_to_import]
# One full S3 URI per chromosome VCF.
uris = [f'{base_s3_uri}/{v}' for v in vcfs]
# Create annotation store version
omics = boto3.client('omics', region_name='us-east-1')
create_version_response = omics.create_annotation_store_version(
    name=annot_store_name,
    versionName=annot_store_version_name,
    description=f'{annot_store_name}_{annot_store_version_name}',
)
# Block until the new version has been created before starting any import
# jobs against it: poll with a delay of 30 between attempts, up to 250 attempts.
waiter = omics.get_waiter('annotation_store_version_created')
waiter.wait(
    name=annot_store_name,
    versionName=annot_store_version_name,
    WaiterConfig={
        'Delay': 30,
        'MaxAttempts': 250,
    },
)
# Get version info -- don't need this, providing the version name during the import job is sufficient
# get_annot_store_response = omics.get_annotation_store_version(
#     name=annot_store_name,
#     versionName=annot_store_version_name
# )
# This schema is what I inferred from looking at the schema definition--see actual solution below
# vep_schema = {
#     "vep": [
#         {
#             "allele": "string",
#             "consequence": "string[]",
#             "impact": "string",
#             "symbol": "string",
#             "gene": "string",
#             "feature_type": "string",
#             "feature": "string",
#             "biotype": "string",
#             "exon": {
#                 "rank": "string",
#                 "total": "string"
#             },
#             "intron": {
#                 "rank": "string",
#                 "total": "string"
#             },
#             "hgvsc": "string",
#             "hgvsp": "string",
#             "cdna_position": "string",
#             "cds_position": "string",
#             "protein_position": "string",
#             "amino_acids": {
#                 "reference": "string",
#                 "variant": "string"
#             },
#             "codons": {
#                 "reference": "string",
#                 "variant": "string"
#             },
#             "existing_variation": "string[]",
#             "distance": "string",
#             "strand": "string",
#             "flags": "string[]",
#             "symbol_source": "string",
#             "hgnc_id": "string",
#             "extras": {
#                 "key": "string",
#                 "value": "string"
#             }
#         }
#     ]
# }
# Omics actually just needs the name of the INFO key that holds the VEP annotation, it will do the rest
vep_schema = {'VEP': 'vep'}
# Start one annotation import job per VCF URI, all targeting the store version
# named above. After the loop, start_annotation_import_response holds only the
# response from the last job started.
for v in uris:
    start_annotation_import_response = omics.start_annotation_import_job(
        destinationName=annot_store_name,
        versionName=annot_store_version_name,
        roleArn=import_arn,
        items=[{'source': v}],
        # Alternative: submit every VCF in a single job instead of looping.
        # items=[{'source': v} for v in uris],
        formatOptions={
            'vcfOptions': {
                'ignoreQualField': False,
                'ignoreFilterField': False,
            }
        },
        runLeftNormalization=True,
        # Maps 'VEP' to the INFO key that holds the VEP annotation.
        annotationFields=vep_schema,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment