budsonjelmont · March 29, 2024 20:49
diff --git a/import_gnomad_vcfs_to_aws_omics_annot_store.py b/import_gnomad_vcfs_to_aws_omics_annot_store.py
 import boto3

 # Import settings: which gnomAD release to load and which IAM role the import jobs run as
 dataset = 'genomes'
 semver = '2.1.1'
 import_arn = 'arn:aws:iam::[account_id]:role/service-role/[role_name]'

 # Target annotation store. The version name is the dataset plus the release number
 # with dots swapped for underscores (genomes_v2_1_1 for the values above).
 annot_store_name = 'gnomad_grch37'
 semver_text = str(semver)
 annot_store_version_name = f"{dataset}_v{semver_text.replace('.', '_')}"

 base_s3_uri = f's3://gnomad-public-us-east-1/release/{semver_text}/vcf/{dataset}'

 # Chromosomes 1-22 followed by X and Y
 chroms_to_import = [*map(str, range(1, 23)), 'X', 'Y']

 # Naming format for 4.0 VCFs
 # vcfs=['gnomad.' + dataset + '.v' + str(semver) + '.sites.chr' + chrom + '.vcf.bgz' for chrom in chroms_to_import]
 # Naming format for 2.1.1 VCFs
 vcfs = [f'gnomad.{dataset}.r{semver_text}.sites.{chrom}.vcf.bgz' for chrom in chroms_to_import]
 uris = [f'{base_s3_uri}/{vcf_name}' for vcf_name in vcfs]

 # Create annotation store version
 omics = boto3.client('omics', region_name='us-east-1')

 # The same name/versionName pair identifies the version in the create call and in the waiter
 version_ref = {'name': annot_store_name, 'versionName': annot_store_version_name}

 create_version_response = omics.create_annotation_store_version(
     description=f'{annot_store_name}_{annot_store_version_name}',
     **version_ref,
 )

 # Wait til version is created (Delay of 30 between polls, at most 250 attempts)
 waiter = omics.get_waiter('annotation_store_version_created')
 waiter.wait(WaiterConfig={'Delay': 30, 'MaxAttempts': 250}, **version_ref)

 # Get version info -- don't need this, providing the version name during the import job is sufficient
 # get_annot_store_response = omics.get_annotation_store_version(
 #     name=annot_store_name,
 #     versionName=annot_store_version_name
 # )

 # This schema is what I inferred from looking at the schema definition--see actual solution below
 # vep_schema = {
 #     "vep": [
 #       {
 #         "allele": "string",
 #         "consequence": "string[]",
 #         "impact": "string",
 #         "symbol": "string",
 #         "gene": "string",
 #         "feature_type": "string",
 #         "feature": "string",
 #         "biotype": "string",
 #         "exon": {
 #           "rank": "string",
 #           "total": "string"
 #         },
 #         "intron": {
 #           "rank": "string",
 #           "total": "string"
 #         },
 #         "hgvsc": "string",
 #         "hgvsp": "string",
 #         "cdna_position": "string",
 #         "cds_position": "string",
 #         "protein_position": "string",
 #         "amino_acids": {
 #           "reference": "string",
 #           "variant": "string"
 #         },
 #         "codons": {
 #           "reference": "string",
 #           "variant": "string"
 #         },
 #         "existing_variation": "string[]",
 #         "distance": "string",
 #         "strand": "string",
 #         "flags": "string[]",
 #         "symbol_source": "string",
 #         "hgnc_id": "string",
 #         "extras": {
 #           "key": "string",
 #           "value": "string"
 #         }
 #       }
 #     ]
 # }

 # Omics actually just needs the name of the INFO key that holds the VEP annotation, it will do the rest
 vep_schema = {'VEP': 'vep'}

 # Arguments shared by every import job; only the source URI differs from job to job
 import_job_settings = {
     'destinationName': annot_store_name,
     'versionName': annot_store_version_name,
     'roleArn': import_arn,
     'formatOptions': {
         'vcfOptions': {
             'ignoreQualField': False,
             'ignoreFilterField': False,
         },
     },
     'runLeftNormalization': True,
     'annotationFields': vep_schema,
 }

 # Start one import job per VCF (the single-job alternative is kept below for reference)
 for v in uris:
     start_annotation_import_response = omics.start_annotation_import_job(
         items=[{'source': v}],
         # items=[{'source': v} for v in uris],
         **import_job_settings,
     )
	import boto3

	# Script body: create a new version of an existing AWS HealthOmics annotation store
	# and start one VCF import job per chromosome from the public gnomAD bucket.

	# Release to import and the IAM role the import jobs run as
	dataset = 'genomes'
	semver = '2.1.1'
	import_arn = 'arn:aws:iam::[account_id]:role/service-role/[role_name]'

	# Version name is the dataset plus the release with dots replaced by underscores
	annot_store_name = 'gnomad_grch37'
	annot_store_version_name = dataset + '_v' + str(semver).replace('.', '_')

	base_s3_uri = 's3://gnomad-public-us-east-1/release/' + str(semver) + '/vcf/' + dataset

	# Chromosomes 1-22 followed by X and Y
	chroms_to_import = [str(c) for c in range(1, 23)] + ['X', 'Y']

	# Naming format for 4.0 VCFs
	# vcfs=['gnomad.' + dataset + '.v' + str(semver) + '.sites.chr' + chrom + '.vcf.bgz' for chrom in chroms_to_import]
	# Naming format for 2.1.1 VCFs
	vcfs = ['gnomad.' + dataset + '.r' + str(semver) + '.sites.' + chrom + '.vcf.bgz' for chrom in chroms_to_import]
	uris = [base_s3_uri + '/' + v for v in vcfs]

	# Create annotation store version
	omics = boto3.client('omics', region_name='us-east-1')

	create_version_response = omics.create_annotation_store_version(
		name=annot_store_name,
		versionName=annot_store_version_name,
		description=annot_store_name + '_' + annot_store_version_name
	)

	# Wait til version is created
	waiter = omics.get_waiter('annotation_store_version_created')
	waiter.wait(
		name=annot_store_name,
		versionName=annot_store_version_name,
		WaiterConfig={
			'Delay': 30,
			'MaxAttempts': 250
		}
	)

	# Get version info -- don't need this, providing the version name during the import job is sufficient
	# get_annot_store_response = omics.get_annotation_store_version(
	#     name=annot_store_name,
	#     versionName=annot_store_version_name
	# )

	# This schema is what I inferred from looking at the schema definition--see actual solution below
	# vep_schema = {
	#     "vep": [
	#       {
	#         "allele": "string",
	#         "consequence": "string[]",
	#         "impact": "string",
	#         "symbol": "string",
	#         "gene": "string",
	#         "feature_type": "string",
	#         "feature": "string",
	#         "biotype": "string",
	#         "exon": {
	#           "rank": "string",
	#           "total": "string"
	#         },
	#         "intron": {
	#           "rank": "string",
	#           "total": "string"
	#         },
	#         "hgvsc": "string",
	#         "hgvsp": "string",
	#         "cdna_position": "string",
	#         "cds_position": "string",
	#         "protein_position": "string",
	#         "amino_acids": {
	#           "reference": "string",
	#           "variant": "string"
	#         },
	#         "codons": {
	#           "reference": "string",
	#           "variant": "string"
	#         },
	#         "existing_variation": "string[]",
	#         "distance": "string",
	#         "strand": "string",
	#         "flags": "string[]",
	#         "symbol_source": "string",
	#         "hgnc_id": "string",
	#         "extras": {
	#           "key": "string",
	#           "value": "string"
	#         }
	#       }
	#     ]
	# }

	# Omics actually just needs the name of the INFO key that holds the VEP annotation, it will do the rest
	vep_schema = {'VEP': 'vep'}

	# Start one import job per VCF; the call must be nested under the loop so that
	# each URI gets its own job.
	for v in uris:
		start_annotation_import_response = omics.start_annotation_import_job(
			destinationName=annot_store_name,
			versionName=annot_store_version_name,
			roleArn=import_arn,
			items=[{'source': v}],
			# items=[{'source': v} for v in uris],
			formatOptions={
				'vcfOptions': {
					'ignoreQualField': False,
					'ignoreFilterField': False
				}
			},
			runLeftNormalization=True,
			annotationFields=vep_schema
		)