Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save budsonjelmont/08db8ded1c59f1a384828b204ef93648 to your computer and use it in GitHub Desktop.
Save budsonjelmont/08db8ded1c59f1a384828b204ef93648 to your computer and use it in GitHub Desktop.
Batch import gnomAD VCFs into an Omics annotation store
import boto3
# gnomAD data set and release version to import.
dataset = 'genomes'
semver = '2.1.1'

# IAM role handed to the import jobs as roleArn; fill in the bracketed placeholders.
import_arn = 'arn:aws:iam::[account_id]:role/service-role/[role_name]'

# Target annotation store and the version to create in it.
# The version name swaps the dots of the release for underscores, e.g. 'genomes_v2_1_1'.
annot_store_name = 'gnomad_grch37'
annot_store_version_name = f"{dataset}_v{str(semver).replace('.', '_')}"

# Public bucket prefix that holds the per-chromosome VCFs of this release.
base_s3_uri = f"s3://gnomad-public-us-east-1/release/{semver}/vcf/{dataset}"

# Autosomes 1-22 followed by the sex chromosomes.
# NOTE(review): confirm a VCF is actually published for every entry (e.g. 'Y')
# in the chosen dataset/release before running.
chroms_to_import = [*map(str, range(1, 23)), 'X', 'Y']

# File naming differs between releases.
# 4.0 VCFs:
# vcfs = [f"gnomad.{dataset}.v{semver}.sites.chr{chrom}.vcf.bgz" for chrom in chroms_to_import]
# 2.1.1 VCFs:
vcfs = [
    f"gnomad.{dataset}.r{semver}.sites.{chrom}.vcf.bgz"
    for chrom in chroms_to_import
]

# Full S3 URI of every VCF to import.
uris = [f"{base_s3_uri}/{vcf_name}" for vcf_name in vcfs]
# Create the new version inside the annotation store named by annot_store_name.
omics = boto3.client('omics', region_name='us-east-1')

# The create call and the waiter both identify the version by the same pair of arguments.
version_identity = {
    'name': annot_store_name,
    'versionName': annot_store_version_name,
}
create_version_response = omics.create_annotation_store_version(
    description=f"{annot_store_name}_{annot_store_version_name}",
    **version_identity,
)

# Block until the waiter reports the version as created
# (polls every 30 seconds, for at most 250 attempts).
waiter = omics.get_waiter('annotation_store_version_created')
waiter.wait(
    WaiterConfig={'Delay': 30, 'MaxAttempts': 250},
    **version_identity,
)
# Get version info -- not needed: providing the version name to the import job is sufficient
# get_annot_store_response = omics.get_annotation_store_version(
# name=annot_store_name,
# versionName=annot_store_version_name
# )
# This schema is what I inferred from looking at the schema definition -- see the actual solution below
# vep_schema = {
# "vep": [
# {
# "allele": "string",
# "consequence": "string[]",
# "impact": "string",
# "symbol": "string",
# "gene": "string",
# "feature_type": "string",
# "feature": "string",
# "biotype": "string",
# "exon": {
# "rank": "string",
# "total": "string"
# },
# "intron": {
# "rank": "string",
# "total": "string"
# },
# "hgvsc": "string",
# "hgvsp": "string",
# "cdna_position": "string",
# "cds_position": "string",
# "protein_position": "string",
# "amino_acids": {
# "reference": "string",
# "variant": "string"
# },
# "codons": {
# "reference": "string",
# "variant": "string"
# },
# "existing_variation": "string[]",
# "distance": "string",
# "strand": "string",
# "flags": "string[]",
# "symbol_source": "string",
# "hgnc_id": "string",
# "extras": {
# "key": "string",
# "value": "string"
# }
# }
# ]
# }
# Omics only has to be told which INFO key carries the VEP annotation;
# it takes care of the rest on its own.
vep_schema = dict(VEP='vep')
# Start one annotation import job per VCF. The call only starts the job and
# returns its ID, so every ID is recorded (keyed by source URI) and printed;
# otherwise the jobs could not be looked up or monitored afterwards.
import_job_ids = {}
for v in uris:
    start_annotation_import_response = omics.start_annotation_import_job(
        destinationName=annot_store_name,
        versionName=annot_store_version_name,
        roleArn=import_arn,
        # One source file per job. Alternative: a single job covering every file:
        # items=[{'source': v} for v in uris],
        items=[{'source': v}],
        formatOptions={
            'vcfOptions': {
                # Keep the QUAL and FILTER columns of the VCF.
                'ignoreQualField': False,
                'ignoreFilterField': False,
            }
        },
        runLeftNormalization=True,
        # Tells Omics which INFO key holds the VEP annotation.
        annotationFields=vep_schema,
    )
    job_id = start_annotation_import_response['jobId']
    import_job_ids[v] = job_id
    print(f'Started annotation import job {job_id} for {v}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment