Last active
January 23, 2025 21:44
-
-
Save abij/1c117560386e601bdba947a6cbb0a552 to your computer and use it in GitHub Desktop.
Compare impact of file formats using Spark and DuckDB
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": "%pip install duckdb pyspark pandas", | |
"id": "d1b68dcfa6a6c89", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"cell_type": "code", | |
"id": "initial_id", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"import os\n", | |
"from pyspark.sql import SparkSession\n", | |
"\n", | |
"# Spark session built with default settings\n", | |
"session_builder = SparkSession.builder.appName(\"Spark\")\n", | |
"spark = session_builder.getOrCreate()\n", | |
"print(spark.version)\n", | |
"\n", | |
"# CSV input, resolved against the current user's home directory:\n", | |
"raw_csv_file = os.path.expanduser(\"~/Downloads/Open_Data_RDW__Gekentekende_voertuigen_20250117.csv\")" | |
], | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"import duckdb\n", | |
"print(duckdb.__version__)\n", | |
"\n", | |
"# Write one Parquet copy of the CSV per codec; the first entry passes no compression\n", | |
"# option, so that file gets whatever codec DuckDB applies by default.\n", | |
"parquet_copies = {\n", | |
"    'raw_snappy.parquet': \"FORMAT 'parquet'\",\n", | |
"    'raw_zstd.parquet': \"FORMAT 'parquet', compression 'zstd'\",\n", | |
"}\n", | |
"for target_file, copy_options in parquet_copies.items():\n", | |
"    duckdb.sql(f\"\"\"COPY (FROM '{raw_csv_file}') TO '{target_file}' ({copy_options})\"\"\")\n", | |
"\n", | |
"# Database file persisted in the current working directory\n", | |
"db = duckdb.connect(\"duckdb_rdw_cars.db\")\n", | |
"create_table_sql = f\"\"\"\n", | |
" create table if not exists open_data_rdw__gekentekende_voertuigen_20250117 as\n", | |
" select * FROM read_csv('{raw_csv_file}',header = true)\n", | |
"\"\"\"\n", | |
"db.sql(create_table_sql)" | |
], | |
"id": "9fdc1e941f112e10", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"# Register every copy of the dataset as a queryable temp view\n", | |
"spark.read.csv(raw_csv_file, header=True).createOrReplaceTempView(\"raw_csv\")\n", | |
"for view_name in (\"raw_snappy\", \"raw_zstd\"):\n", | |
"    spark.read.parquet(f\"{view_name}.parquet\").createOrReplaceTempView(view_name)" | |
], | |
"id": "384ce8cf991cc7b6", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"%%sh\n", | |
"# Print the first CSV field of the first and the last data row.\n", | |
"# The exact file is named (instead of a 202* glob) so that a second download in\n", | |
"# ~/Downloads cannot make head/tail read a different file than the Python cells use.\n", | |
"csv_file=~/Downloads/Open_Data_RDW__Gekentekende_voertuigen_20250117.csv\n", | |
"echo \"first: $(head -n2 \"$csv_file\" | tail -n1 | cut -d, -f1)\"\n", | |
"echo \"last: $(tail -n1 \"$csv_file\" | cut -d, -f1)\"" | |
], | |
"id": "45616ae5c0438c4", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"# Kenteken values of the first and last CSV row, used by the lookup queries:\n", | |
"first, last = 'MR56LN', 'MR56LG'" | |
], | |
"id": "e06d2e58dfab051", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2025-01-23T21:44:35.501012Z", | |
"start_time": "2025-01-23T21:44:35.497961Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"def get_analytical_query(input_table, engine='spark', year=2024):\n", | |
"    \"\"\"Build the benchmark SQL ranking the passenger-car models first registered in `year`.\n", | |
"\n", | |
"    Args:\n", | |
"        input_table: text pasted verbatim after FROM; a view/table name, or an\n", | |
"            already-quoted file path when DuckDB reads a file directly.\n", | |
"        engine: 'spark' quotes identifiers with backticks, any other value with\n", | |
"            double quotes (the DuckDB cells pass 'duckdb').\n", | |
"        year: registration year to rank; the preceding year is scanned as well,\n", | |
"            but only to compute diff_last_jaar.\n", | |
"\n", | |
"    Returns:\n", | |
"        SQL text producing car, marketshare, total and diff_last_jaar for at most\n", | |
"        500 rows of `year`, largest total first.\n", | |
"    \"\"\"\n", | |
"    quote = \"`\" if engine == 'spark' else '\"'\n", | |
"    date_col = f\"{quote}Datum eerste toelating{quote}\"\n", | |
"    # substring() is 1-based in both Spark SQL and DuckDB, so (1, 4) yields the yyyy\n", | |
"    # prefix on either engine without an engine-specific offset.\n", | |
"    jaar_expr = f\"substring(cast({date_col} as string), 1, 4)\"\n", | |
"\n", | |
"    # The window runs in its own CTE so the final SELECT can keep only `year`:\n", | |
"    # filtering in the same SELECT would drop the previous-year rows before lag() sees\n", | |
"    # them, and returning them would report a share computed against the wrong year's total.\n", | |
"    return f\"\"\"\n", | |
"    with cars_brand_per_year as (\n", | |
"        select Merk, Handelsbenaming, {jaar_expr} as jaar, count(*) total\n", | |
"        from {input_table}\n", | |
"        where Voertuigsoort = 'Personenauto'\n", | |
"          and {date_col} >= {year - 1}0101\n", | |
"          and {date_col} < {year + 1}0101\n", | |
"        group by 1, 2, 3\n", | |
"    ),\n", | |
"    cars_with_diff as (\n", | |
"        select *, total - lag(total) over (partition by Merk, Handelsbenaming order by jaar) as diff_last_jaar\n", | |
"        from cars_brand_per_year\n", | |
"    )\n", | |
"    select\n", | |
"        Merk || ' ' || trim(split_part(Handelsbenaming, Merk, -1)) as car,\n", | |
"        round((total / (select count(*) as year_total from {input_table} where Voertuigsoort = 'Personenauto' and {jaar_expr} = '{year}')) * 100, 2) || '%' as marketshare,\n", | |
"        total,\n", | |
"        diff_last_jaar\n", | |
"    from cars_with_diff\n", | |
"    where jaar = '{year}'\n", | |
"    order by total desc\n", | |
"    limit 500\n", | |
"    \"\"\"" | |
], | |
"id": "9fe9728cbb9aa0df", | |
"outputs": [], | |
"execution_count": 128 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"# Spark timings (every query is materialised with .collect() so all formats are timed alike):\n", | |
"# CSV\n", | |
"%timeit spark.sql(f\"select * from raw_csv where Kenteken = '{first}' limit 1\").collect()\n", | |
"%timeit spark.sql(f\"select * from raw_csv where Kenteken = '{last}' limit 1\").collect()\n", | |
"%timeit spark.sql(get_analytical_query('raw_csv')).collect()\n", | |
"\n", | |
"# Parquet (snappy)\n", | |
"%timeit spark.sql(f\"select * from raw_snappy where Kenteken = '{first}' limit 1\").collect()\n", | |
"%timeit spark.sql(f\"select * from raw_snappy where Kenteken = '{last}' limit 1\").collect()\n", | |
"%timeit spark.sql(get_analytical_query('raw_snappy')).collect()\n", | |
"\n", | |
"# Parquet (zstd)\n", | |
"%timeit spark.sql(f\"select * from raw_zstd where Kenteken = '{first}' limit 1\").collect()\n", | |
"%timeit spark.sql(f\"select * from raw_zstd where Kenteken = '{last}' limit 1\").collect()\n", | |
"%timeit spark.sql(get_analytical_query('raw_zstd')).collect()" | |
], | |
"id": "8956249145cd1a2f", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2025-01-23T21:44:39.839771Z", | |
"start_time": "2025-01-23T21:44:38.336395Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": "spark.sql(get_analytical_query('raw_zstd', year=2024)).toPandas().head(5)", | |
"id": "6a94d4a841579940", | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
" \r" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
" car marketshare total diff_last_jaar\n", | |
"0 TESLA MODEL Y 4.79% 19078 5124.0\n", | |
"1 KIA NIRO 2.76% 10989 3301.0\n", | |
"2 VOLVO EX30 2.74% 10895 10812.0\n", | |
"3 TESLA MODEL 3 2.69% 10711 5569.0\n", | |
"4 KIA PICANTO 2.1% 8374 -2141.0" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>car</th>\n", | |
" <th>marketshare</th>\n", | |
" <th>total</th>\n", | |
" <th>diff_last_jaar</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>TESLA MODEL Y</td>\n", | |
" <td>4.79%</td>\n", | |
" <td>19078</td>\n", | |
" <td>5124.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>KIA NIRO</td>\n", | |
" <td>2.76%</td>\n", | |
" <td>10989</td>\n", | |
" <td>3301.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>VOLVO EX30</td>\n", | |
" <td>2.74%</td>\n", | |
" <td>10895</td>\n", | |
" <td>10812.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>TESLA MODEL 3</td>\n", | |
" <td>2.69%</td>\n", | |
" <td>10711</td>\n", | |
" <td>5569.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>KIA PICANTO</td>\n", | |
" <td>2.1%</td>\n", | |
" <td>8374</td>\n", | |
" <td>-2141.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 129, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 129 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"# DuckDB timings:\n", | |
"# get_analytical_query() pastes its first argument verbatim after FROM, so file paths\n", | |
"# are handed over already wrapped in single quotes.\n", | |
"# CSV\n", | |
"%timeit duckdb.sql(f\"select * FROM '{raw_csv_file}' where Kenteken='{first}' limit 1\").fetchall()\n", | |
"%timeit duckdb.sql(f\"select * FROM '{raw_csv_file}' where Kenteken='{last}' limit 1\").fetchall()\n", | |
"%timeit duckdb.sql(get_analytical_query(f\"'{raw_csv_file}'\", engine='duckdb')).fetchall()\n", | |
"\n", | |
"# Parquet (snappy)\n", | |
"%timeit duckdb.sql(f\"select * FROM 'raw_snappy.parquet' where Kenteken='{first}' limit 1\").fetchall()\n", | |
"%timeit duckdb.sql(f\"select * FROM 'raw_snappy.parquet' where Kenteken='{last}' limit 1\").fetchall()\n", | |
"%timeit duckdb.sql(get_analytical_query(\"'raw_snappy.parquet'\", engine='duckdb')).fetchall()\n", | |
"\n", | |
"# Parquet (zstd)\n", | |
"%timeit duckdb.sql(f\"select * FROM 'raw_zstd.parquet' where Kenteken='{first}' limit 1\").fetchall()\n", | |
"%timeit duckdb.sql(f\"select * FROM 'raw_zstd.parquet' where Kenteken='{last}' limit 1\").fetchall()\n", | |
"%timeit duckdb.sql(get_analytical_query(\"'raw_zstd.parquet'\", engine='duckdb')).fetchall()\n", | |
"\n", | |
"# DuckDB internal format\n", | |
"%timeit db.sql(f\"select * FROM 'open_data_rdw__gekentekende_voertuigen_20250117' where Kenteken='{first}' limit 1\").fetchall()\n", | |
"%timeit db.sql(f\"select * FROM 'open_data_rdw__gekentekende_voertuigen_20250117' where Kenteken='{last}' limit 1\").fetchall()\n", | |
"%timeit db.sql(get_analytical_query('open_data_rdw__gekentekende_voertuigen_20250117', engine='duckdb')).fetchall()" | |
], | |
"id": "3e52eee1e057c57e", | |
"outputs": [], | |
"execution_count": null | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2025-01-23T21:44:41.461185Z", | |
"start_time": "2025-01-23T21:44:41.359264Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": "db.sql(get_analytical_query('open_data_rdw__gekentekende_voertuigen_20250117', engine='duckdb')).to_df().head(137)", | |
"id": "e8fbd291902a008f", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
" car marketshare total diff_last_jaar\n", | |
"0 TESLA MODEL Y 4.79% 19078 5124.0\n", | |
"1 KIA NIRO 2.76% 10989 3301.0\n", | |
"2 VOLVO EX30 2.74% 10895 10812.0\n", | |
"3 TESLA MODEL 3 2.69% 10711 5569.0\n", | |
"4 KIA PICANTO 2.1% 8374 -2141.0\n", | |
".. ... ... ... ...\n", | |
"132 VOLKSWAGEN ID.3 PRO S 150 KW 0.16% 618 386.0\n", | |
"133 AUDI A3 LIMOUSINE 0.15% 605 -499.0\n", | |
"134 LAND ROVER DEFENDER 0.15% 603 41.0\n", | |
"135 VOLKSWAGEN UP! 0.15% 594 -3918.0\n", | |
"136 MAZDA 3 0.15% 590 -99.0\n", | |
"\n", | |
"[137 rows x 4 columns]" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>car</th>\n", | |
" <th>marketshare</th>\n", | |
" <th>total</th>\n", | |
" <th>diff_last_jaar</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>TESLA MODEL Y</td>\n", | |
" <td>4.79%</td>\n", | |
" <td>19078</td>\n", | |
" <td>5124.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>KIA NIRO</td>\n", | |
" <td>2.76%</td>\n", | |
" <td>10989</td>\n", | |
" <td>3301.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>VOLVO EX30</td>\n", | |
" <td>2.74%</td>\n", | |
" <td>10895</td>\n", | |
" <td>10812.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>TESLA MODEL 3</td>\n", | |
" <td>2.69%</td>\n", | |
" <td>10711</td>\n", | |
" <td>5569.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>KIA PICANTO</td>\n", | |
" <td>2.1%</td>\n", | |
" <td>8374</td>\n", | |
" <td>-2141.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>132</th>\n", | |
" <td>VOLKSWAGEN ID.3 PRO S 150 KW</td>\n", | |
" <td>0.16%</td>\n", | |
" <td>618</td>\n", | |
" <td>386.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>133</th>\n", | |
" <td>AUDI A3 LIMOUSINE</td>\n", | |
" <td>0.15%</td>\n", | |
" <td>605</td>\n", | |
" <td>-499.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>134</th>\n", | |
" <td>LAND ROVER DEFENDER</td>\n", | |
" <td>0.15%</td>\n", | |
" <td>603</td>\n", | |
" <td>41.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>135</th>\n", | |
" <td>VOLKSWAGEN UP!</td>\n", | |
" <td>0.15%</td>\n", | |
" <td>594</td>\n", | |
" <td>-3918.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>136</th>\n", | |
" <td>MAZDA 3</td>\n", | |
" <td>0.15%</td>\n", | |
" <td>590</td>\n", | |
" <td>-99.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>137 rows × 4 columns</p>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 130, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 130 | |
}, | |
{ | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2025-01-23T21:44:43.496878Z", | |
"start_time": "2025-01-23T21:44:43.406861Z" | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Top 5 in 2025?\n", | |
"top_2025_sql = get_analytical_query('open_data_rdw__gekentekende_voertuigen_20250117', engine='duckdb', year=2025)\n", | |
"db.sql(top_2025_sql).to_df().head(5)" | |
], | |
"id": "158715cea5f869d0", | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
" car marketshare total diff_last_jaar\n", | |
"0 KIA EV3 6.45% 946 820.0\n", | |
"1 VOLKSWAGEN POLO 2.56% 375 -6463.0\n", | |
"2 VOLVO EX30 2.46% 361 -10534.0\n", | |
"3 KIA PICANTO 2.35% 345 -8029.0\n", | |
"4 SUZUKI SWIFT 2.24% 328 -2253.0" | |
], | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>car</th>\n", | |
" <th>marketshare</th>\n", | |
" <th>total</th>\n", | |
" <th>diff_last_jaar</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>KIA EV3</td>\n", | |
" <td>6.45%</td>\n", | |
" <td>946</td>\n", | |
" <td>820.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>VOLKSWAGEN POLO</td>\n", | |
" <td>2.56%</td>\n", | |
" <td>375</td>\n", | |
" <td>-6463.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>VOLVO EX30</td>\n", | |
" <td>2.46%</td>\n", | |
" <td>361</td>\n", | |
" <td>-10534.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>KIA PICANTO</td>\n", | |
" <td>2.35%</td>\n", | |
" <td>345</td>\n", | |
" <td>-8029.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>SUZUKI SWIFT</td>\n", | |
" <td>2.24%</td>\n", | |
" <td>328</td>\n", | |
" <td>-2253.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
] | |
}, | |
"execution_count": 131, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"execution_count": 131 | |
}, | |
{ | |
"metadata": {}, | |
"cell_type": "code", | |
"source": [ | |
"# Cleanup: release both engines, then delete the files this notebook generated.\n", | |
"# stop() is called unconditionally: `spark.active` is a method object (always truthy)\n", | |
"# where it exists at all, so it never worked as a guard.\n", | |
"spark.stop()\n", | |
"db.close()\n", | |
"\n", | |
"for generated_file in (\"raw_snappy.parquet\", \"raw_zstd.parquet\", \"duckdb_rdw_cars.db\"):\n", | |
"    if os.path.exists(generated_file):\n", | |
"        os.remove(generated_file)" | |
], | |
"id": "ee9c35a741f82144", | |
"outputs": [], | |
"execution_count": null | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Py3.12 (fileformats)", | |
"language": "python", | |
"name": "fileformats" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment