Last active
December 7, 2023 22:57
-
-
Save ecmonsen/e1e3e6906def081a5fbf43ad3a653171 to your computer and use it in GitHub Desktop.
Make an Athena-created view available to Glue Jobs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Making an Athena view queryable in a Glue job | |
# | |
# When a view is created by running an Athena SQL query, it appears in the Glue catalog but errors are raised when | |
# querying the view in a Glue job or a Glue Spark context. | |
# | |
# Use this gist to programmatically update the view's metadata in the Glue catalog. | |
# | |
# After this you should be able to run `spark.sql("select * from mydb.myview")` without errors. | |
# | |
# Assumes you have created the view `mydb.myview` in Athena. | |
# For testing I used `CREATE VIEW mydb.myview AS SELECT 1 as Index;` | |
# | |
import boto3 | |
import json | |
import base64 | |
glue = boto3.client("glue") | |
# Change these | |
database_name="mydb" | |
table_name="myview" | |
athena_table = glue.get_table(DatabaseName=database_name, Name=table_name)["Table"] | |
# We can only pass these keys in a boto3 call to update_table(). | |
# Docs: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/client/update_table.html | |
GLUE_UPDATE_TABLE_INPUT_KEYS={ | |
"ViewOriginalText", | |
"LastAccessTime", | |
"StorageDescriptor", | |
"LastAnalyzedTime", | |
"Name", | |
"PartitionKeys", | |
"Parameters", | |
"Description", | |
"Retention", | |
"ViewExpandedText", | |
"Owner", | |
"TableType", | |
"TargetTable" | |
} | |
for att in set(athena_table.keys()) - GLUE_UPDATE_TABLE_INPUT_KEYS: | |
del (athena_table[att]) | |
# It appears Glue gets confused by a blank string for StorageDescriptor.Location, so remove it. | |
if "StorageDescriptor" in athena_table and "Location" in athena_table["StorageDescriptor"]: | |
if athena_table["StorageDescriptor"]["Location"] != "": | |
raise Exception("Athena table has non-blank value for StorageDescriptor.Location. Don't know what to do.") | |
del(athena_table["StorageDescriptor"]["Location"]) | |
# Glue expects the view SQL to be in ViewExpandedText, while Athena uses a Hive-encoded string from "ViewOriginalText". | |
# Decode the Hive-encoded string to update ViewExpandedText: | |
athena_table["ViewExpandedText"] = json.loads(base64.b64decode(athena_table["ViewOriginalText"][16:-3]).decode("utf-8"))["originalSql"] | |
# update the table! | |
glue.update_table(DatabaseName=database_name, TableInput=athena_table) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment