@harshavardhana
Last active March 8, 2026 06:20
PySpark parquet overwrite pattern — tests partition prefix visibility after overwrite on S3/MinIO

PySpark Parquet Overwrite — Partition Prefix Visibility Test

Tests that, after Spark overwrites partitioned Parquet files on S3/MinIO, the date-level partition prefixes remain visible in a delimited ListObjectsV2 listing, so that Spark's partition discovery continues to work correctly.

What the test does

  1. Generates two sample CSV files (batch1.csv, batch2.csv) with the same schema and same date partitions but different values.
  2. Writes batch1 as partitioned Parquet (append mode), creating:
    s3a://your-bucket/data/transactions/date=2024-01-01/part-00000.parquet
    s3a://your-bucket/data/transactions/date=2024-01-02/part-00000.parquet
    
  3. Writes batch2 to the same path in overwrite mode. With the default static partitionOverwriteMode, Spark removes the existing data under the base path and writes new .parquet files under the same date prefixes; dynamic mode would replace only the matching partitions.
  4. Reads back the Parquet and asserts the partition folders are still discoverable. If the object store's delimited listing is broken, Spark returns an empty dataset silently.
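Under the hood, Spark's S3A directory listing issues delimited ListObjectsV2 requests, and each date=.../ partition directory surfaces as a CommonPrefixes entry. A minimal sketch of that listing with boto3 (the endpoint, credentials, and bucket name are placeholder assumptions matching the MinIO defaults used below; the `partition_dates` helper is illustrative, not part of the script):

```python
def partition_dates(common_prefixes, base_prefix):
    """Extract partition values like '2024-01-01' from CommonPrefixes entries."""
    dates = []
    for cp in common_prefixes:
        # Each entry looks like {"Prefix": "data/transactions/date=2024-01-01/"}
        leaf = cp["Prefix"][len(base_prefix):].strip("/")
        if leaf.startswith("date="):
            dates.append(leaf.split("=", 1)[1])
    return sorted(dates)

if __name__ == "__main__":
    import boto3  # placeholder endpoint/credentials -- adjust to your setup

    s3 = boto3.client(
        "s3",
        endpoint_url="http://localhost:9000",
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin",
    )
    resp = s3.list_objects_v2(
        Bucket="your-bucket",
        Prefix="data/transactions/",
        Delimiter="/",  # the delimiter is what makes partition dirs appear as CommonPrefixes
    )
    print(partition_dates(resp.get("CommonPrefixes", []), "data/transactions/"))
```

If the overwrite breaks delimited listing, the CommonPrefixes list comes back empty even though objects still exist under the prefixes, and Spark's discovery fails the same way.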

Prerequisites

Requirement   Version
Python        3.8+
PySpark       3.3+

Install PySpark with pip install pyspark.

The S3A jars are resolved automatically via --packages, so no manual jar downloads are needed. hadoop-aws pulls in aws-java-sdk-bundle as a transitive dependency from Maven Central. Pick the hadoop-aws version that matches the Hadoop version bundled with your PySpark build.

Running against MinIO

Edit the SparkSession block at the top of the script to point at your MinIO instance:

spark = SparkSession.builder \
    .appName("parquet-overwrite-test") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()

Also update BASE_PATH to a bucket that already exists on your MinIO:

BASE_PATH = "s3a://your-bucket/data/transactions"
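Since the bucket must already exist, you can create it up front. A small sketch that splits the bucket name out of an s3a:// URL and creates it via boto3 if it is missing (the `bucket_and_prefix` helper and the MinIO endpoint/credentials are illustrative assumptions, not part of the script):

```python
from urllib.parse import urlparse

def bucket_and_prefix(s3a_url):
    """Split an s3a:// URL into (bucket, key prefix)."""
    parsed = urlparse(s3a_url)
    return parsed.netloc, parsed.path.lstrip("/")

if __name__ == "__main__":
    import boto3  # placeholder endpoint/credentials -- adjust to your MinIO

    bucket, _prefix = bucket_and_prefix("s3a://your-bucket/data/transactions")
    s3 = boto3.client(
        "s3",
        endpoint_url="http://localhost:9000",
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin",
    )
    try:
        s3.head_bucket(Bucket=bucket)  # raises ClientError if the bucket is absent
    except s3.exceptions.ClientError:
        s3.create_bucket(Bucket=bucket)
```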

Then run:

spark-submit --packages org.apache.hadoop:hadoop-aws:3.3.4 spark_parquet_overwrite_test.py

Running against AWS S3

spark = SparkSession.builder \
    .appName("parquet-overwrite-test") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .getOrCreate()

Credentials are picked up from ~/.aws/credentials or environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY).
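The environment-variable route can also be driven from Python before the SparkSession is created, since the launched JVM inherits the driver's environment. A hedged sketch (the `set_aws_env` helper and the placeholder values are illustrative, not part of the script):

```python
import os

def set_aws_env(access_key, secret_key):
    """Export AWS credentials so DefaultAWSCredentialsProviderChain can find them."""
    os.environ["AWS_ACCESS_KEY_ID"] = access_key
    os.environ["AWS_SECRET_ACCESS_KEY"] = secret_key

if __name__ == "__main__":
    # Placeholder values -- replace with real credentials, or skip this entirely
    # and rely on ~/.aws/credentials.
    set_aws_env("YOUR_ACCESS_KEY", "YOUR_SECRET_KEY")
```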

spark-submit --packages org.apache.hadoop:hadoop-aws:3.3.4 spark_parquet_overwrite_test.py

Expected output

After the overwrite, both reads should return data — not empty results:

+---+-----+------+----------+
| id| name|amount|      date|
+---+-----+------+----------+
|  1|Alice|   110|2024-01-01|
|  2|  Bob|   220|2024-01-01|
...
+----------+
|      date|
+----------+
|2024-01-01|
|2024-01-02|
+----------+

Empty output from the partition listing means the object store is not returning the date=XXXX/ common prefixes correctly after the overwrite.

import csv
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# ---------------------------------------------------------------------------
# Generate sample CSV files
# ---------------------------------------------------------------------------
def write_csv(path, rows):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)

# batch1: initial load — two dates, three records each
BATCH1 = [
    [1, "Alice", 100, "2024-01-01"],
    [2, "Bob", 200, "2024-01-01"],
    [3, "Carol", 300, "2024-01-01"],
    [4, "Dave", 400, "2024-01-02"],
    [5, "Eve", 500, "2024-01-02"],
    [6, "Frank", 600, "2024-01-02"],
]

# batch2: updated data for the same two dates (overwrites batch1 partition files)
BATCH2 = [
    [1, "Alice", 110, "2024-01-01"],
    [2, "Bob", 220, "2024-01-01"],
    [3, "Carol", 330, "2024-01-01"],
    [4, "Dave", 440, "2024-01-02"],
    [5, "Eve", 550, "2024-01-02"],
    [6, "Frank", 660, "2024-01-02"],
]

write_csv("/tmp/input/batch1.csv", BATCH1)
write_csv("/tmp/input/batch2.csv", BATCH2)

# ---------------------------------------------------------------------------
# Spark session
# ---------------------------------------------------------------------------
spark = SparkSession.builder \
    .appName("parquet-overwrite-test") \
    .getOrCreate()

schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), True),
    StructField("amount", IntegerType(), True),
    StructField("date", StringType(), False),
])

BASE_PATH = "s3a://your-bucket/data/transactions"

# ---------------------------------------------------------------------------
# Step 1: Read batch1 CSV and write partitioned parquet (append)
# ---------------------------------------------------------------------------
df1 = spark.read.schema(schema).csv("/tmp/input/batch1.csv")
df1.write \
    .partitionBy("date") \
    .mode("append") \
    .parquet(BASE_PATH)
# At this point:
#   .../transactions/date=2024-01-01/part-00000.parquet
#   .../transactions/date=2024-01-02/part-00000.parquet

# ---------------------------------------------------------------------------
# Step 2: Read batch2 CSV (same schema, same dates, updated amounts)
# ---------------------------------------------------------------------------
df2 = spark.read.schema(schema).csv("/tmp/input/batch2.csv")

# ---------------------------------------------------------------------------
# Step 3: Overwrite. With the default static partitionOverwriteMode, Spark
#         removes the existing data under BASE_PATH and writes new files.
#         The date=XXXX/ prefixes MUST still appear in a delimited
#         ListObjectsV2 on the parent path afterward.
# ---------------------------------------------------------------------------
df2.write \
    .partitionBy("date") \
    .mode("overwrite") \
    .parquet(BASE_PATH)

# ---------------------------------------------------------------------------
# Step 4: Verify partition prefixes are still visible after overwrite.
#         If the object store's delimited listing is broken the partition
#         folders won't be discovered and both reads below return nothing.
# ---------------------------------------------------------------------------
result = spark.read.parquet(BASE_PATH)
result.show()

spark.catalog.refreshByPath(BASE_PATH)
partitions = spark.read.parquet(BASE_PATH).select("date").distinct()
partitions.show()

spark.stop()