@JnsFerreira
Created April 6, 2022 23:09
# References:
# https://aws.amazon.com/blogs/big-data/load-data-incrementally-and-optimized-parquet-writer-with-aws-glue/
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
# Parquet writer tuning: 128 MB row-group (block) size, 1 MB page size
BLOCK_SIZE = 128 * 1024 * 1024
PAGE_SIZE = 1024 * 1024
# Job parameters supplied at run time
args = getResolvedOptions(
    sys.argv, [
        'JOB_NAME',
        'source_database',
        'source_table_name',
        's3_output_dir',
        's3_output_format'
    ]
)
# Initialize the Glue job; init()/commit() bracket the bookmark state
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Read the source table from the Data Catalog. transformation_ctx enables
# job bookmarks, so each run processes only data not seen in earlier runs.
source_table = glueContext.create_dynamic_frame.from_catalog(
    database=args['source_database'],
    table_name=args['source_table_name'],
    transformation_ctx="source_table"
)

# Write to S3 with Snappy compression and the tuned Parquet block/page sizes
glueContext.write_dynamic_frame.from_options(
    frame=source_table,
    connection_type="s3",
    connection_options={"path": args['s3_output_dir']},
    format=args['s3_output_format'],
    format_options={
        "compression": "snappy",
        "blockSize": BLOCK_SIZE,
        "pageSize": PAGE_SIZE
    }
)
job.commit()
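
A minimal sketch of how a run of this job might be started from Python with boto3, assuming the script above has been deployed as a Glue job. The job name and argument values below are placeholders, not part of the original gist; note that job bookmarks must be enabled (via the --job-bookmark-option argument) for transformation_ctx to actually skip already-processed data.

import boto3

glue = boto3.client("glue")

glue.start_job_run(
    JobName="incremental-parquet-load",  # hypothetical job name
    Arguments={
        "--source_database": "my_database",           # placeholder
        "--source_table_name": "my_table",            # placeholder
        "--s3_output_dir": "s3://my-bucket/output/",  # placeholder
        "--s3_output_format": "glueparquet",
        # Bookmarks make each run pick up only new data
        "--job-bookmark-option": "job-bookmark-enable",
    },
)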