Skip to content

Instantly share code, notes, and snippets.

@cicorias
Last active January 31, 2025 21:46
Show Gist options
  • Save cicorias/98e51d9f1da6d74202c390a8ade5e60d to your computer and use it in GitHub Desktop.
Save cicorias/98e51d9f1da6d74202c390a8ade5e60d to your computer and use it in GitHub Desktop.
Delta Tables in Python with delta-io delta-rs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"id": "c75ab3ab-ab78-4849-a020-b2ca67f85a56",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3.9.2 (default, Feb 28 2021, 17:03:44) \n",
"[GCC 10.2.1 20210110]\n"
]
},
{
"data": {
"text/plain": [
"'3.3.2'"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"\n",
"# Print the interpreter build, then echo the Spark version as the cell result.\n",
"# NOTE(review): `spark` is not defined anywhere in this notebook; presumably the\n",
"# kernel injects a SparkSession under that name -- confirm before running elsewhere.\n",
"print(sys.version)\n",
"spark.version"
]
},
{
"cell_type": "markdown",
"id": "2f0e8d9f-0461-43ba-a19b-a6de5632e739",
"metadata": {},
"source": [
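"# Delta Tables in Python with delta-rs\n",
"\n",
"Reference links:\n",
"\n",
"- delta-rs documentation: https://delta-io.github.io/delta-rs/\n",
"- `object_store` Azure configuration keys: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants\n",
"- Azurite command-line options: https://github.com/Azure/Azurite?tab=readme-ov-file#supported-command-line-options\n",
"- Delta Lake documentation: https://docs.delta.io/latest/index.html\n",
"- `deltalake` Rust crate documentation: https://docs.rs/deltalake/latest/deltalake/\n",
"- delta-rs installation guide: https://delta-io.github.io/delta-rs/usage/installation/\n"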
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "39f3d160-db5f-4ae5-a116-d52c18ada3e9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from deltalake import write_deltalake, DeltaTable\n",
"\n",
"# Seed the generator so Restart & Run All writes the same sample rows every time.\n",
"SEED = 42\n",
"rng = np.random.default_rng(SEED)\n",
"\n",
"# Five rows of random integers in [1, 10) for each of the two columns.\n",
"df = pd.DataFrame({\"foo\": rng.integers(1, 10, 5), \"bar\": rng.integers(1, 10, 5)})\n",
"\n",
"# Container that holds the Delta table.\n",
"container = \"data\"\n",
"\n",
"# Storage options: only the emulator switch is set here.\n",
"# NOTE(review): presumably this targets a local Azurite instance and its built-in\n",
"# development account -- confirm.\n",
"# For a real storage account, pass `azure_storage_account_name` and\n",
"# `azure_storage_account_key` instead, read from environment variables rather than\n",
"# hardcoded in the notebook.\n",
"storage_options = {\n",
"    \"azure_storage_use_emulator\": \"true\",\n",
"}\n",
"\n",
"# Overwrite both data and schema so re-running this cell replaces the table\n",
"# contents instead of failing or appending.\n",
"write_deltalake(\n",
"    f\"abfs://{container}/delta_table_pandas\",\n",
"    df,\n",
"    storage_options=storage_options,\n",
"    mode=\"overwrite\",\n",
"    overwrite_schema=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "ad12ac57-43a9-4de5-ac6c-4bc257c39a99",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----version--------\n",
"6\n",
"----metadata-----\n",
"Metadata(id: 26b16d13-2cb8-4c47-9544-6cd309f03cb1, name: None, description: None, partition_columns: [], created_time: 1738356229969, configuration: {})\n",
"---schema---\n",
"Schema([Field(foo, PrimitiveType(\"long\"), nullable=True), Field(bar, PrimitiveType(\"long\"), nullable=True)])\n",
"-----history-----\n"
]
},
{
"data": {
"application/json": [
{
"clientVersion": "delta-rs.0.10.0",
"operation": "CREATE TABLE",
"operationParameters": {
"location": "abfs://data/delta_table_pandas",
"metadata": "{\"configuration\":{},\"created_time\":1738356229969,\"description\":null,\"format\":{\"options\":{},\"provider\":\"parquet\"},\"id\":\"26b16d13-2cb8-4c47-9544-6cd309f03cb1\",\"name\":null,\"partition_columns\":[],\"schema\":{\"fields\":[{\"metadata\":{},\"name\":\"foo\",\"nullable\":true,\"type\":\"long\"}],\"type\":\"struct\"}}",
"mode": "ErrorIfExists",
"protocol": "{\"minReaderVersion\":1,\"minWriterVersion\":1}"
},
"timestamp": 1738356229980
},
{
"clientVersion": "delta-rs.0.10.0",
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite",
"partitionBy": "[]"
},
"timestamp": 1738356363664
},
{
"clientVersion": "delta-rs.0.10.0",
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite",
"partitionBy": "[]"
},
"timestamp": 1738356686275
},
{
"clientVersion": "delta-rs.0.10.0",
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite",
"partitionBy": "[]"
},
"timestamp": 1738357075080
},
{
"clientVersion": "delta-rs.0.10.0",
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite",
"partitionBy": "[]"
},
"timestamp": 1738357220986
},
{
"clientVersion": "delta-rs.0.10.0",
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite",
"partitionBy": "[]"
},
"timestamp": 1738359571117
},
{
"clientVersion": "delta-rs.0.10.0",
"operation": "WRITE",
"operationParameters": {
"mode": "Overwrite",
"partitionBy": "[]"
},
"timestamp": 1738359696984
}
],
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"metadata": {
"application/json": {
"expanded": false,
"root": "root"
}
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[\n",
" {\n",
" \"timestamp\": 1738356229980,\n",
" \"operation\": \"CREATE TABLE\",\n",
" \"operationParameters\": {\n",
" \"mode\": \"ErrorIfExists\",\n",
" \"protocol\": \"{\\\"minReaderVersion\\\":1,\\\"minWriterVersion\\\":1}\",\n",
" \"metadata\": \"{\\\"configuration\\\":{},\\\"created_time\\\":1738356229969,\\\"description\\\":null,\\\"format\\\":{\\\"options\\\":{},\\\"provider\\\":\\\"parquet\\\"},\\\"id\\\":\\\"26b16d13-2cb8-4c47-9544-6cd309f03cb1\\\",\\\"name\\\":null,\\\"partition_columns\\\":[],\\\"schema\\\":{\\\"fields\\\":[{\\\"metadata\\\":{},\\\"name\\\":\\\"foo\\\",\\\"nullable\\\":true,\\\"type\\\":\\\"long\\\"}],\\\"type\\\":\\\"struct\\\"}}\",\n",
" \"location\": \"abfs://data/delta_table_pandas\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" },\n",
" {\n",
" \"timestamp\": 1738356363664,\n",
" \"operation\": \"WRITE\",\n",
" \"operationParameters\": {\n",
" \"partitionBy\": \"[]\",\n",
" \"mode\": \"Overwrite\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" },\n",
" {\n",
" \"timestamp\": 1738356686275,\n",
" \"operation\": \"WRITE\",\n",
" \"operationParameters\": {\n",
" \"mode\": \"Overwrite\",\n",
" \"partitionBy\": \"[]\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" },\n",
" {\n",
" \"timestamp\": 1738357075080,\n",
" \"operation\": \"WRITE\",\n",
" \"operationParameters\": {\n",
" \"mode\": \"Overwrite\",\n",
" \"partitionBy\": \"[]\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" },\n",
" {\n",
" \"timestamp\": 1738357220986,\n",
" \"operation\": \"WRITE\",\n",
" \"operationParameters\": {\n",
" \"partitionBy\": \"[]\",\n",
" \"mode\": \"Overwrite\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" },\n",
" {\n",
" \"timestamp\": 1738359571117,\n",
" \"operation\": \"WRITE\",\n",
" \"operationParameters\": {\n",
" \"partitionBy\": \"[]\",\n",
" \"mode\": \"Overwrite\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" },\n",
" {\n",
" \"timestamp\": 1738359696984,\n",
" \"operation\": \"WRITE\",\n",
" \"operationParameters\": {\n",
" \"partitionBy\": \"[]\",\n",
" \"mode\": \"Overwrite\"\n",
" },\n",
" \"clientVersion\": \"delta-rs.0.10.0\"\n",
" }\n",
"]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>foo</th>\n",
" <th>bar</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" foo bar\n",
"0 4 8\n",
"1 6 2\n",
"2 6 2\n",
"3 1 5\n",
"4 9 6"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"from IPython.display import display, JSON\n",
"from deltalake import DeltaTable\n",
"\n",
"# Open the table written earlier, reusing the same container and storage options.\n",
"dt = DeltaTable(\n",
"    f\"abfs://{container}/delta_table_pandas\",\n",
"    storage_options=storage_options,\n",
")\n",
"\n",
"# Keep the rows under their own name. Binding them to `pd` would shadow the pandas\n",
"# alias and break any later (or re-run) cell that calls pd.DataFrame(...).\n",
"table_df = dt.to_pandas()\n",
"\n",
"table_schema = dt.schema()\n",
"\n",
"print(\"-----version--------\")\n",
"print(dt.version())\n",
"print(\"----metadata-----\")\n",
"print(dt.metadata())\n",
"print(\"---schema---\")\n",
"print(table_schema)\n",
"print(\"-----history-----\")\n",
"history_list = dt.history()  # list of commit-info dicts\n",
"display(JSON(history_list))  # collapsible rich view\n",
"pretty_json = json.dumps(history_list, indent=4)  # same data as indented text\n",
"print(pretty_json)\n",
"\n",
"# Last expression: show the table contents as the cell result.\n",
"table_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7695ba21-92fa-4bff-a7e8-54e31a951e91",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Delete with no predicate, then run vacuum with its default arguments.\n",
"# NOTE(review): presumably a None predicate removes every row, and vacuum()'s\n",
"# defaults only list candidate files (dry run) -- confirm for the installed deltalake.\n",
"dt.delete(predicate=None)\n",
"dt.vacuum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "370ba660-9204-46cb-b4ff-9b62a7f2ed91",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from deltalake.writer import write_deltalake\n",
"\n",
"# Emulate a row-level delete: read the table, drop rows where foo == 3, and\n",
"# overwrite the table with what is left.\n",
"df = dt.to_pandas()\n",
"df_filtered = df[df[\"foo\"] != 3]\n",
"write_deltalake(\n",
"    f\"abfs://{container}/delta_table_pandas\",\n",
"    df_filtered,\n",
"    mode=\"overwrite\",\n",
"    storage_options=storage_options,\n",
")\n",
"\n",
"# `dt` was loaded before the overwrite above, so reload it; otherwise vacuum()\n",
"# would work from a snapshot that does not include the commit just written.\n",
"dt = DeltaTable(\n",
"    f\"abfs://{container}/delta_table_pandas\",\n",
"    storage_options=storage_options,\n",
")\n",
"# NOTE(review): vacuum() is called with library defaults; presumably that is a dry\n",
"# run that only lists removable files -- confirm for the installed deltalake.\n",
"dt.vacuum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "426a3b49-d2f7-473c-9676-5a1d8f92da4d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment