@JnsFerreira
Created April 6, 2022 23:09
# References:
# https://aws.amazon.com/blogs/big-data/load-data-incrementally-and-optimized-parquet-writer-with-aws-glue/
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
# Parquet writer tuning: 128 MB row-group (block) size, 1 MB page size
BLOCK_SIZE = 128 * 1024 * 1024
PAGE_SIZE = 1024 * 1024
# Job parameters supplied at run time
args = getResolvedOptions(
    sys.argv, [
        'JOB_NAME',
        'source_database',
        'source_table_name',
        's3_output_dir',
        's3_output_format'
    ]
)
# Initialize the Glue job; init()/commit() bracket the bookmark state
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Read the source table from the Data Catalog. transformation_ctx enables
# job bookmarks, so each run processes only data not seen in earlier runs.
source_table = glueContext.create_dynamic_frame.from_catalog(
    database=args['source_database'],
    table_name=args['source_table_name'],
    transformation_ctx="source_table"
)

# Write to S3 with Snappy compression and the tuned Parquet block/page sizes
glueContext.write_dynamic_frame.from_options(
    frame=source_table,
    connection_type="s3",
    connection_options={"path": args['s3_output_dir']},
    format=args['s3_output_format'],
    format_options={
        "compression": "snappy",
        "blockSize": BLOCK_SIZE,
        "pageSize": PAGE_SIZE
    }
)
job.commit()
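
A minimal sketch of how a run of this job might be started from Python with boto3, assuming the script above has been deployed as a Glue job. The job name and argument values below are placeholders, not part of the original gist; note that job bookmarks must be enabled (via the --job-bookmark-option argument) for transformation_ctx to actually skip already-processed data.

import boto3

glue = boto3.client("glue")

glue.start_job_run(
    JobName="incremental-parquet-load",  # hypothetical job name
    Arguments={
        "--source_database": "my_database",           # placeholder
        "--source_table_name": "my_table",            # placeholder
        "--s3_output_dir": "s3://my-bucket/output/",  # placeholder
        "--s3_output_format": "glueparquet",
        # Bookmarks make each run pick up only new data
        "--job-bookmark-option": "job-bookmark-enable",
    },
)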