Skip to content

Instantly share code, notes, and snippets.

@abij
Last active January 23, 2025 21:44
Show Gist options
  • Save abij/1c117560386e601bdba947a6cbb0a552 to your computer and use it in GitHub Desktop.
Save abij/1c117560386e601bdba947a6cbb0a552 to your computer and use it in GitHub Desktop.
Compare impact of file formats using Spark and DuckDB
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"source": "%pip install duckdb pyspark pandas",
"id": "d1b68dcfa6a6c89",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true
},
"source": [
"import os\n",
"from pyspark.sql import SparkSession\n",
"\n",
"# Default spark session\n",
"spark = SparkSession.builder.appName(\"Spark\").getOrCreate()\n",
"print(spark.version)\n",
"\n",
"# Input file:\n",
"raw_csv_file = os.path.expanduser(\"~/Downloads/Open_Data_RDW__Gekentekende_voertuigen_20250117.csv\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"import duckdb\n",
"print(duckdb.__version__)\n",
"\n",
"# Create copies of the CSV in various formats:\n",
"duckdb.sql(f\"\"\"COPY (FROM '{raw_csv_file}') TO 'raw_snappy.parquet' (FORMAT 'parquet')\"\"\")\n",
"duckdb.sql(f\"\"\"COPY (FROM '{raw_csv_file}') TO 'raw_zstd.parquet' (FORMAT 'parquet', compression 'zstd')\"\"\")\n",
"\n",
"# File persisted in the current directory\n",
"db = duckdb.connect(\"duckdb_rdw_cars.db\")\n",
"db.sql(f\"\"\"\n",
" create table if not exists open_data_rdw__gekentekende_voertuigen_20250117 as\n",
" select * FROM read_csv('{raw_csv_file}',header = true)\n",
"\"\"\")"
],
"id": "9fdc1e941f112e10",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Load dataset as a queryable tables\n",
"spark.read.csv(raw_csv_file, header=True).createOrReplaceTempView(\"raw_csv\")\n",
"spark.read.parquet('raw_snappy.parquet').createOrReplaceTempView(\"raw_snappy\")\n",
"spark.read.parquet('raw_zstd.parquet').createOrReplaceTempView(\"raw_zstd\")"
],
"id": "384ce8cf991cc7b6",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"%%sh\n",
"echo \"first: $(head -n2 ~/Downloads/Open_Data_RDW__Gekentekende_voertuigen_202* | tail -n1 | cut -d, -f1)\"\n",
"echo \"last: $(tail -n1 ~/Downloads/Open_Data_RDW__Gekentekende_voertuigen_202* | cut -d, -f1)\""
],
"id": "45616ae5c0438c4",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Save first and last row for the queries:\n",
"first = 'MR56LN'\n",
"last = 'MR56LG'"
],
"id": "e06d2e58dfab051",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-23T21:44:35.501012Z",
"start_time": "2025-01-23T21:44:35.497961Z"
}
},
"cell_type": "code",
"source": [
"def get_analytical_query(input_table, engine='spark', year=2024):\n",
" quote = \"`\" if engine == 'spark' else '\"'\n",
" substring_offset = 0 if engine == 'spark' else 1\n",
"\n",
" return f\"\"\"\n",
" with cars_brand_per_year as (\n",
" select Merk, Handelsbenaming, substring(cast({quote}Datum eerste toelating{quote} as string), 0, {4 + substring_offset}) as jaar, count(*) total\n",
" from {input_table}\n",
" where Voertuigsoort = 'Personenauto'\n",
" and {quote}Datum eerste toelating{quote} >= {year - 1}0101\n",
" and {quote}Datum eerste toelating{quote} < {year + 1}0101\n",
" group by 1, 2, 3\n",
" )\n",
" select\n",
" Merk || ' ' || trim(split_part(Handelsbenaming, Merk, -1)) as car,\n",
" round((total / (select count(*) as year_total from {input_table} where Voertuigsoort = 'Personenauto' and substring(cast({quote}Datum eerste toelating{quote} as string), 0, {4 + substring_offset}) = {year})) * 100, 2) || '%' as marketshare,\n",
" total,\n",
" total - lag(total) over (partition by Merk, Handelsbenaming order by jaar) as diff_last_jaar\n",
" from cars_brand_per_year\n",
" where jaar = {year} or jaar = {year - 1}\n",
" order by jaar desc, total desc\n",
" limit 500\n",
" \"\"\""
],
"id": "9fe9728cbb9aa0df",
"outputs": [],
"execution_count": 128
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Spark timings:\n",
"# CSV\n",
"%timeit spark.sql(f\"select * from raw_csv where Kenteken = '{first}' limit 1\").collect()\n",
"%timeit spark.sql(f\"select * from raw_csv where Kenteken = '{last}' limit 1\").collect()\n",
"%timeit spark.sql(get_analytical_query('raw_csv')).collect()\n",
"#\n",
"# Parquet (snappy)\n",
"%timeit spark.sql(f\"select * from raw_snappy where Kenteken = '{first}' limit 1\").collect()\n",
"%timeit spark.sql(f\"select * from raw_snappy where Kenteken = '{last}' limit 1\").collect()\n",
"%timeit spark.sql(get_analytical_query('raw_snappy')).collect()\n",
"\n",
"# Parquet (zstd)\n",
"%timeit spark.sql(f\"select * from raw_zstd where Kenteken = '{first}' limit 1\").collect()\n",
"%timeit spark.sql(f\"select * from raw_zstd where Kenteken = '{last}' limit 1\").collect()\n",
"%timeit spark.sql(get_analytical_query('raw_zstd')).toPandas()"
],
"id": "8956249145cd1a2f",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-23T21:44:39.839771Z",
"start_time": "2025-01-23T21:44:38.336395Z"
}
},
"cell_type": "code",
"source": "spark.sql(get_analytical_query('raw_zstd', year=2024)).toPandas()[:5]",
"id": "6a94d4a841579940",
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" \r"
]
},
{
"data": {
"text/plain": [
" car marketshare total diff_last_jaar\n",
"0 TESLA MODEL Y 4.79% 19078 5124.0\n",
"1 KIA NIRO 2.76% 10989 3301.0\n",
"2 VOLVO EX30 2.74% 10895 10812.0\n",
"3 TESLA MODEL 3 2.69% 10711 5569.0\n",
"4 KIA PICANTO 2.1% 8374 -2141.0"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car</th>\n",
" <th>marketshare</th>\n",
" <th>total</th>\n",
" <th>diff_last_jaar</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TESLA MODEL Y</td>\n",
" <td>4.79%</td>\n",
" <td>19078</td>\n",
" <td>5124.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>KIA NIRO</td>\n",
" <td>2.76%</td>\n",
" <td>10989</td>\n",
" <td>3301.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>VOLVO EX30</td>\n",
" <td>2.74%</td>\n",
" <td>10895</td>\n",
" <td>10812.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TESLA MODEL 3</td>\n",
" <td>2.69%</td>\n",
" <td>10711</td>\n",
" <td>5569.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>KIA PICANTO</td>\n",
" <td>2.1%</td>\n",
" <td>8374</td>\n",
" <td>-2141.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 129
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# DuckDB timings:\n",
"# CSV\n",
"%timeit duckdb.sql(f\"select * FROM '{raw_csv_file}' where Kenteken='{first}' limit 1\").fetchall()\n",
"%timeit duckdb.sql(f\"select * FROM '{raw_csv_file}' where Kenteken='{last}' limit 1\").fetchall()\n",
"%timeit duckdb.sql(get_analytical_query(raw_csv_file, engine='duckdb')).fetchall()\n",
"\n",
"# Parquet (snappy)\n",
"%timeit duckdb.sql(f\"select * FROM 'raw_zstd.parquet' where Kenteken='{first}' limit 1\").fetchall()\n",
"%timeit duckdb.sql(f\"select * FROM 'raw_zstd.parquet' where Kenteken='{last}' limit 1\").fetchall()\n",
"%timeit duckdb.sql(get_analytical_query(raw_csv_file, engine='duckdb')).fetchall()\n",
"\n",
"# Parquet (zstd)\n",
"%timeit duckdb.sql(f\"select * FROM 'raw_zstd.parquet' where Kenteken='{first}' limit 1\").fetchall()\n",
"%timeit duckdb.sql(f\"select * FROM 'raw_zstd.parquet' where Kenteken='{last}' limit 1\").fetchall()\n",
"%timeit duckdb.sql(get_analytical_query(raw_csv_file, engine='duckdb')).fetchall()\n",
"\n",
"# DuckDB internal format\n",
"%timeit db.sql(f\"select * FROM 'open_data_rdw__gekentekende_voertuigen_20250117' where Kenteken='{first}' limit 1\").fetchall()\n",
"%timeit db.sql(f\"select * FROM 'open_data_rdw__gekentekende_voertuigen_20250117' where Kenteken='{last}' limit 1\").fetchall()\n",
"%timeit db.sql(get_analytical_query('open_data_rdw__gekentekende_voertuigen_20250117', engine='duckdb')).fetchall()"
],
"id": "3e52eee1e057c57e",
"outputs": [],
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-23T21:44:41.461185Z",
"start_time": "2025-01-23T21:44:41.359264Z"
}
},
"cell_type": "code",
"source": "db.sql(get_analytical_query('open_data_rdw__gekentekende_voertuigen_20250117', engine='duckdb')).to_df()[:137]",
"id": "e8fbd291902a008f",
"outputs": [
{
"data": {
"text/plain": [
" car marketshare total diff_last_jaar\n",
"0 TESLA MODEL Y 4.79% 19078 5124.0\n",
"1 KIA NIRO 2.76% 10989 3301.0\n",
"2 VOLVO EX30 2.74% 10895 10812.0\n",
"3 TESLA MODEL 3 2.69% 10711 5569.0\n",
"4 KIA PICANTO 2.1% 8374 -2141.0\n",
".. ... ... ... ...\n",
"132 VOLKSWAGEN ID.3 PRO S 150 KW 0.16% 618 386.0\n",
"133 AUDI A3 LIMOUSINE 0.15% 605 -499.0\n",
"134 LAND ROVER DEFENDER 0.15% 603 41.0\n",
"135 VOLKSWAGEN UP! 0.15% 594 -3918.0\n",
"136 MAZDA 3 0.15% 590 -99.0\n",
"\n",
"[137 rows x 4 columns]"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car</th>\n",
" <th>marketshare</th>\n",
" <th>total</th>\n",
" <th>diff_last_jaar</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TESLA MODEL Y</td>\n",
" <td>4.79%</td>\n",
" <td>19078</td>\n",
" <td>5124.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>KIA NIRO</td>\n",
" <td>2.76%</td>\n",
" <td>10989</td>\n",
" <td>3301.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>VOLVO EX30</td>\n",
" <td>2.74%</td>\n",
" <td>10895</td>\n",
" <td>10812.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>TESLA MODEL 3</td>\n",
" <td>2.69%</td>\n",
" <td>10711</td>\n",
" <td>5569.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>KIA PICANTO</td>\n",
" <td>2.1%</td>\n",
" <td>8374</td>\n",
" <td>-2141.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>132</th>\n",
" <td>VOLKSWAGEN ID.3 PRO S 150 KW</td>\n",
" <td>0.16%</td>\n",
" <td>618</td>\n",
" <td>386.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>133</th>\n",
" <td>AUDI A3 LIMOUSINE</td>\n",
" <td>0.15%</td>\n",
" <td>605</td>\n",
" <td>-499.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>LAND ROVER DEFENDER</td>\n",
" <td>0.15%</td>\n",
" <td>603</td>\n",
" <td>41.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>135</th>\n",
" <td>VOLKSWAGEN UP!</td>\n",
" <td>0.15%</td>\n",
" <td>594</td>\n",
" <td>-3918.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>MAZDA 3</td>\n",
" <td>0.15%</td>\n",
" <td>590</td>\n",
" <td>-99.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>137 rows × 4 columns</p>\n",
"</div>"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 130
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-01-23T21:44:43.496878Z",
"start_time": "2025-01-23T21:44:43.406861Z"
}
},
"cell_type": "code",
"source": [
"# Top 5 in 2025?\n",
"db.sql(get_analytical_query('open_data_rdw__gekentekende_voertuigen_20250117', engine='duckdb', year=2025)).to_df()[:5]"
],
"id": "158715cea5f869d0",
"outputs": [
{
"data": {
"text/plain": [
" car marketshare total diff_last_jaar\n",
"0 KIA EV3 6.45% 946 820.0\n",
"1 VOLKSWAGEN POLO 2.56% 375 -6463.0\n",
"2 VOLVO EX30 2.46% 361 -10534.0\n",
"3 KIA PICANTO 2.35% 345 -8029.0\n",
"4 SUZUKI SWIFT 2.24% 328 -2253.0"
],
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>car</th>\n",
" <th>marketshare</th>\n",
" <th>total</th>\n",
" <th>diff_last_jaar</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>KIA EV3</td>\n",
" <td>6.45%</td>\n",
" <td>946</td>\n",
" <td>820.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>VOLKSWAGEN POLO</td>\n",
" <td>2.56%</td>\n",
" <td>375</td>\n",
" <td>-6463.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>VOLVO EX30</td>\n",
" <td>2.46%</td>\n",
" <td>361</td>\n",
" <td>-10534.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>KIA PICANTO</td>\n",
" <td>2.35%</td>\n",
" <td>345</td>\n",
" <td>-8029.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>SUZUKI SWIFT</td>\n",
" <td>2.24%</td>\n",
" <td>328</td>\n",
" <td>-2253.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"execution_count": 131,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 131
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# cleanup\n",
"spark.active and spark.stop()\n",
"db.close()\n",
"\n",
"os.path.exists(\"raw_snappy.parquet\") and os.remove(\"raw_snappy.parquet\")\n",
"os.path.exists(\"raw_zstd.parquet\") and os.remove(\"raw_zstd.parquet\")\n",
"os.path.exists(\"duckdb_rdw_cars.db\") and os.remove(\"duckdb_rdw_cars.db\")"
],
"id": "ee9c35a741f82144",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Py3.12 (fileformats)",
"language": "python",
"name": "fileformats"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment