Last active
March 27, 2022 17:22
-
-
Save thanoojgithub/c6872b67635e60900029b9281ed3858e to your computer and use it in GitHub Desktop.
Create an empty Parquet file from an existing Parquet file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Create an empty Parquet dataset that reuses the schema of an existing one.

The script:
  1. builds a small sample DataFrame and writes it to SOURCE_PATH as Parquet,
  2. reads it back and creates an empty DataFrame with the same schema,
  3. writes that empty DataFrame to EMPTY_PATH and reads it back to show that
     the schema is preserved even though there are no rows.
"""
from pyspark.sql import SparkSession
# NOTE: the wildcard import is what brings `lit` and `to_timestamp` into scope.
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Location of the populated sample dataset.
SOURCE_PATH = "/home/thanooj/dataParquet/data1/data"
# Location of the schema-only (zero-row) copy.
EMPTY_PATH = "/home/thanooj/dataParquet/data2/data"
# Pattern used to parse the literal StartDate / EndDate strings.
TIMESTAMP_FORMAT = 'yyyy-MM-dd HH:mm:ss'


def _show(label, df):
    """Print *label*, then the schema and the untruncated rows of *df*."""
    print(label)
    df.printSchema()
    df.show(truncate=False)


def main():
    """Run the demo: write sample data, derive an empty copy, and display both."""
    spark = SparkSession \
        .builder \
        .master('local') \
        .appName('pyspark-test-run') \
        .getOrCreate()
    try:
        spark.sparkContext.setLogLevel("ERROR")

        ccData1 = [('A', 'Due to heavy rainfall'), ('B', 'Due to heavy snowfall'),
                   ('C', 'Due to some technical issue'), ('D', 'Due to bad weather'),
                   ('D', None), ('A', 'Due to heavy rainfall')]

        # Schema of the raw sample rows; the two timestamp columns are added below.
        ccColumns = StructType([
            StructField('CancellationCode', StringType(), True),
            StructField('CancellationDesc', StringType(), True)
        ])

        ccDF1 = spark.createDataFrame(data=ccData1, schema=ccColumns) \
            .withColumn("StartDate", to_timestamp(lit("2021-12-31 23:59:59"), TIMESTAMP_FORMAT)) \
            .withColumn("EndDate", to_timestamp(lit("9999-12-31 23:59:59"), TIMESTAMP_FORMAT))

        # Overwrite so the script can be re-run; the default save mode raises
        # an error when the target path already exists.
        ccDF1.write.mode("overwrite").parquet(SOURCE_PATH)

        data1 = spark.read.parquet(SOURCE_PATH)
        _show("data1 : ", data1)

        # An empty RDD combined with the schema read from the existing file
        # gives a zero-row DataFrame with identical columns and types.
        rDD = spark.sparkContext.emptyRDD()
        data2 = spark.createDataFrame(data=rDD, schema=data1.schema)
        _show("data2 : ", data2)

        data2.write.mode("overwrite").parquet(EMPTY_PATH)

        data3 = spark.read.parquet(EMPTY_PATH)
        _show("data3 : ", data3)
    finally:
        # Release the local Spark resources even if a step above fails.
        spark.stop()


if __name__ == "__main__":
    main()
data1 : 
root
 |-- CancellationCode: string (nullable = true)
 |-- CancellationDesc: string (nullable = true)
 |-- StartDate: timestamp (nullable = true)
 |-- EndDate: timestamp (nullable = true)

+----------------+---------------------------+-------------------+-------------------+
|CancellationCode|CancellationDesc           |StartDate          |EndDate            |
+----------------+---------------------------+-------------------+-------------------+
|A               |Due to heavy rainfall      |2021-12-31 23:59:59|9999-12-31 23:59:59|
|B               |Due to heavy snowfall      |2021-12-31 23:59:59|9999-12-31 23:59:59|
|C               |Due to some technical issue|2021-12-31 23:59:59|9999-12-31 23:59:59|
|D               |Due to bad weather         |2021-12-31 23:59:59|9999-12-31 23:59:59|
|D               |null                       |2021-12-31 23:59:59|9999-12-31 23:59:59|
|A               |Due to heavy rainfall      |2021-12-31 23:59:59|9999-12-31 23:59:59|
+----------------+---------------------------+-------------------+-------------------+
data2 : 
root
 |-- CancellationCode: string (nullable = true)
 |-- CancellationDesc: string (nullable = true)
 |-- StartDate: timestamp (nullable = true)
 |-- EndDate: timestamp (nullable = true)

+----------------+----------------+---------+-------+
|CancellationCode|CancellationDesc|StartDate|EndDate|
+----------------+----------------+---------+-------+
+----------------+----------------+---------+-------+
data3 : 
root
 |-- CancellationCode: string (nullable = true)
 |-- CancellationDesc: string (nullable = true)
 |-- StartDate: timestamp (nullable = true)
 |-- EndDate: timestamp (nullable = true)

+----------------+----------------+---------+-------+
|CancellationCode|CancellationDesc|StartDate|EndDate|
+----------------+----------------+---------+-------+
+----------------+----------------+---------+-------+
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment