Created
August 24, 2023 20:35
-
-
Save arabold/343cd2970d39f264bfc9f6506da137b5 to your computer and use it in GitHub Desktop.
Fix Glue Table Schema
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
# Configure boto3's default session for the eu-central-1 region. Clients
# created later without an explicit session (such as the Glue client below)
# presumably pick this region up -- NOTE(review): confirm this is the region
# that hosts the target Glue catalog.
boto3.setup_default_session(region_name="eu-central-1")
# Fields of a Glue ``Partition`` (as returned by ``get_partitions``) that the
# ``PartitionInput`` structure of ``update_partition`` also accepts. Everything
# else in the response (DatabaseName, TableName, CatalogId, CreationTime) is
# server-side metadata and must not be sent back.
_PARTITION_INPUT_KEYS = (
    "Values",
    "LastAccessTime",
    "StorageDescriptor",
    "Parameters",
    "LastAnalyzedTime",
)


def _list_partitions(glue_client, database_name, table_name):
    """Return every partition of the table, following ``NextToken`` pages."""
    request = {"DatabaseName": database_name, "TableName": table_name}
    partitions = []
    while True:
        response = glue_client.get_partitions(**request)
        partitions.extend(response["Partitions"])
        next_token = response.get("NextToken")
        if not next_token:
            return partitions
        request["NextToken"] = next_token


def _align_column_types(partition, column_to_datatype):
    """Overwrite mismatching column types in place; return True if any changed."""
    # StorageDescriptor and its Columns list are optional in a partition, so a
    # partition without them simply has nothing to align.
    columns = partition.get("StorageDescriptor", {}).get("Columns", [])
    changed = False
    for column in columns:
        expected = column_to_datatype.get(column["Name"])
        # Columns unknown to the table are left untouched.
        if expected is not None and column.get("Type") != expected:
            column["Type"] = expected
            changed = True
    return changed


def make_partitions_inherit_datatypes_of_table(database_name, table_name, *, glue_client=None):
    """Make every partition of a Glue table use the table's column types.

    Reads the column types from the table's storage descriptor, lists all
    partitions of the table, and calls ``update_partition`` for each partition
    whose column types differ from the table's. Columns that exist only in the
    partition are kept as they are. Progress is reported via ``print``.

    Args:
        database_name: Name of the Glue database containing the table.
        table_name: Name of the Glue table whose partitions are fixed.
        glue_client: Optional Glue client to use. When omitted, a client is
            created with ``boto3.client("glue")``.

    Returns:
        None.
    """
    if glue_client is None:
        glue_client = boto3.client("glue")

    # Get the data types of the base table
    table_response = glue_client.get_table(
        DatabaseName=database_name,
        Name=table_name,
    )
    column_to_datatype = {
        column["Name"]: column["Type"]
        for column in table_response["Table"]["StorageDescriptor"]["Columns"]
    }

    # List partitions and find the ones whose datatypes deviate
    partitions = _list_partitions(glue_client, database_name, table_name)
    print("Got", len(partitions), "partitions")

    partitions_to_update = []
    for partition in partitions:
        if _align_column_types(partition, column_to_datatype):
            partitions_to_update.append(partition)
    print(f"{len(partitions_to_update)} partitions of table {table_name} will be updated.")

    # Update partitions if necessary
    for partition in partitions_to_update:
        print(f"Updating {', '.join(partition['Values'])}")
        # Build the input from an explicit whitelist rather than popping
        # known read-only keys, so a missing key cannot raise KeyError.
        partition_input = {
            key: partition[key] for key in _PARTITION_INPUT_KEYS if key in partition
        }
        glue_client.update_partition(
            DatabaseName=database_name,
            TableName=table_name,
            PartitionValueList=partition["Values"],
            PartitionInput=partition_input,
        )
def main():
    """Run the partition schema fix for the hard-coded database and table."""
    target = {
        "database_name": "my_db",
        "table_name": "my_table",
    }
    make_partitions_inherit_datatypes_of_table(**target)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment