Last active
March 14, 2024 08:09
-
-
Save t20100/13179e38d7d8b3058f0174435d674466 to your computer and use it in GitHub Desktop.
[WIP] Convert a HDF5 compressed dataset to a stack of sparse representations - Binder: https://mybinder.org/v2/gist/t20100/13179e38d7d8b3058f0174435d674466/HEAD?labpath=hdf5-dataset-to-sparse.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "3141ebd8-380c-4fac-9a51-27fbf6292fc0", | |
"metadata": {}, | |
"source": [ | |
"# Convert a HDF5 compressed dataset to sparse matrices\n", | |
"\n", | |
"This notebook presents different ways to convert a stack of images stored as a HDF5 dataset to a stack of sparse matrices.\n", | |
"\n", | |
"Notebook license: [CC-0](https://creativecommons.org/public-domain/cc0/)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "a6020b15-3854-4d2d-bb14-9ec6704dcff2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Disable multithreading\n", | |
"import os\n", | |
"\n", | |
"os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", | |
"os.environ[\"BLOSC_NTHREADS\"] = \"1\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "d9d6510a-efcd-4dea-856a-c8599548c141", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import b2h5py\n", | |
"import blosc2\n", | |
"import bslz4_to_sparse\n", | |
"import h5py\n", | |
"from hdf5plugin import Bitshuffle, Blosc2\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "abfabbd1-3e8c-421c-9cc5-9b928a6469b1", | |
"metadata": {}, | |
"source": [ | |
"## Prepare the datasets\n", | |
"\n", | |
"Download data (credits ID11@ESRF) and create datasets with different compressions." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "23f7c441-351f-43d4-8df3-5dc601ac6896", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"if not os.path.exists(\"sparse_image_stack.h5\"):\n", | |
" !wget http://www.silx.org/pub/leaps-innov/sparse_image_stack.h5" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "34010312-41e1-4b5c-8cc2-53bf3a6f674d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"if not os.path.exists(\"sparse_images.h5\"):\n", | |
" with h5py.File(\"sparse_image_stack.h5\", \"r\") as h5f:\n", | |
" data = h5f[\"entry_0000/measurement/data\"][:2]\n", | |
"\n", | |
" chunk_shape = (1,) + data.shape[1:]\n", | |
"\n", | |
" with h5py.File(\"sparse_images.h5\", \"w\") as h5f:\n", | |
" h5f.create_dataset(\n", | |
" \"bslz4\",\n", | |
" data=data,\n", | |
" chunks=chunk_shape,\n", | |
" compression=Bitshuffle(),\n", | |
" )\n", | |
" h5f.create_dataset(\n", | |
" \"bszstd\",\n", | |
" data=data,\n", | |
" chunks=chunk_shape,\n", | |
" compression=Bitshuffle(cname=\"zstd\"),\n", | |
" )\n", | |
" h5f.create_dataset(\n", | |
" \"blosc2_bslz4\",\n", | |
" data=data,\n", | |
" chunks=chunk_shape,\n", | |
" compression=Blosc2(cname=\"lz4\", filters=Blosc2.BITSHUFFLE),\n", | |
" )\n", | |
" h5f.create_dataset(\n", | |
" \"blosc2_bszstd\",\n", | |
" data=data,\n", | |
" chunks=chunk_shape,\n", | |
" compression=Blosc2(cname=\"zstd\", filters=Blosc2.BITSHUFFLE),\n", | |
" )\n", | |
" \n", | |
" del data" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "dd9b1e00-2074-4b1e-a3e5-e06b7144a948", | |
"metadata": {}, | |
"source": [ | |
"## Benchmark reading data\n", | |
"\n", | |
"Compare different ways to read data from a HDf5 compressed dataset.\n", | |
"\n", | |
"The test is done on the first frame of a 3D dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "5f0bccb6-a34c-42c6-8260-bb6a8b959b9a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"h5f = h5py.File(\"sparse_images.h5\", \"r\")\n", | |
"bslz4_dataset = h5f[\"bslz4\"]\n", | |
"bszstd_dataset = h5f[\"bszstd\"]\n", | |
"blosc2_bslz4_dataset = h5f[\"blosc2_bslz4\"]\n", | |
"blosc2_bszstd_dataset = h5f[\"blosc2_bszstd\"]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c13d72ec-3036-4731-94ae-e0996891099e", | |
"metadata": {}, | |
"source": [ | |
"Compressed size of first frame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "5514f701-4bf4-4b69-83ff-a090734ee76f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"bslz4_nbytes=183882\n", | |
"bszstd_nbytes=111675\n", | |
"blosc2_bslz4_nbytes=102058\n", | |
"blosc2_bszstd_nbytes=12757\n" | |
] | |
} | |
], | |
"source": [ | |
"bslz4_nbytes = len(bslz4_dataset.id.read_direct_chunk((0, 0, 0))[1])\n", | |
"bszstd_nbytes = len(bszstd_dataset.id.read_direct_chunk((0, 0, 0))[1])\n", | |
"blosc2_bslz4_nbytes = len(blosc2_bslz4_dataset.id.read_direct_chunk((0, 0, 0))[1])\n", | |
"blosc2_bszstd_nbytes = len(blosc2_bszstd_dataset.id.read_direct_chunk((0, 0, 0))[1])\n", | |
"print(f\"{bslz4_nbytes=}\\n{bszstd_nbytes=}\\n{blosc2_bslz4_nbytes=}\\n{blosc2_bszstd_nbytes=}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "bc51b130-a721-480d-85c9-7c20985c7242", | |
"metadata": {}, | |
"source": [ | |
"* **With hdf5plugin**: Decompression is performed by the HDF5 filters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "7af7fa01-4e37-42b0-a043-15c1ef8541c9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"19.6 ms ± 1.46 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit bslz4_dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "b1d8e2f5-2134-47b4-a913-5feac6a08146", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"20.2 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit bszstd_dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "b212d81e-c48e-48a6-8147-face04e78a9b", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"28.9 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit blosc2_bslz4_dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "0f854045-2fbb-4c5c-923e-4137c337fcab", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"27.9 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit blosc2_bszstd_dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "54d8577f-f4bb-4701-99df-6f5c759412b3", | |
"metadata": {}, | |
"source": [ | |
"* **With b2h5py**: Datasets are accessed with direct chunk read and decompressed with blosc2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "1dbd7bbc-b2a5-411f-9058-c78888de68ed", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"24.9 ms ± 2.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bslz4_b2dataset = b2h5py.B2Dataset(blosc2_bslz4_dataset)\n", | |
"\n", | |
"%timeit blosc2_bslz4_b2dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "82927a9b-e08b-408c-b7fc-d29a2a260b93", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"23.3 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bszstd_b2dataset = b2h5py.B2Dataset(blosc2_bszstd_dataset)\n", | |
"\n", | |
"%timeit blosc2_bszstd_b2dataset[0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4b9722d7-29e2-4e5c-becd-212b2eab9a28", | |
"metadata": {}, | |
"source": [ | |
"* **Direct chunk read**: For reference read compressed data from file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "2fbc206d-4b42-4aca-aa67-c9067bc7ed16", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"21.1 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit bslz4_dataset.id.read_direct_chunk((0, 0, 0))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"id": "3553196e-af12-4106-875a-0bc340ee1bd4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"13.2 µs ± 45.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit bszstd_dataset.id.read_direct_chunk((0, 0, 0))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "5cabd3d8-932b-4448-bc48-55eba0c40f67", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"12.4 µs ± 37.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit blosc2_bslz4_dataset.id.read_direct_chunk((0, 0, 0))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "8a28b0f7-42ae-4e66-9ef0-647ab73f3c04", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4.77 µs ± 54.9 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit blosc2_bszstd_dataset.id.read_direct_chunk((0, 0, 0))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d194932f-d79b-43db-9ac5-ea016646776f", | |
"metadata": {}, | |
"source": [ | |
"## Convert data to sparse representation\n", | |
"\n", | |
"Compares different ways to convert a 3D dataset to sparse matrices frame by frame.\n", | |
"\n", | |
"The test is done on the first frame." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "d85f32b5-d4f2-43bd-950d-bd6fc5bd3d49", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def array_to_sparse(array: np.ndarray) -> tuple[np.ndarray, np.ndarray]:\n", | |
" \"\"\"Convert an array to sparse representation\"\"\"\n", | |
" array = np.ravel(array)\n", | |
" indices = np.nonzero(array)[0]\n", | |
" values = array[indices]\n", | |
" return values, indices" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "3906e644-1f3a-4001-9988-d57818bec72a", | |
"metadata": {}, | |
"source": [ | |
"* **Reference**: Read data through h5py and convert it with numpy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"id": "e4dcd695-d2d6-4108-be72-7f18998416a0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"29.2 ms ± 1.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit array_to_sparse(bslz4_dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"id": "8a3c5c85-c4a2-405f-a44e-d83bc942b534", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"30 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit array_to_sparse(bszstd_dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"id": "7f7da4db-30cc-4616-b956-9454942683c6", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"42.8 ms ± 3.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit array_to_sparse(blosc2_bslz4_dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"id": "1475bb80-61fb-47f9-83bd-dbb7c2187c4a", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"39.5 ms ± 1.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit array_to_sparse(blosc2_bszstd_dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a627421c-11e2-4ac0-aab9-d4e023ecce39", | |
"metadata": {}, | |
"source": [ | |
"With b2h5py:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "f5d98b34-44d4-4f13-a44b-ef82afabfac0", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"36.1 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bslz4_b2dataset = b2h5py.B2Dataset(blosc2_bslz4_dataset)\n", | |
"\n", | |
"%timeit array_to_sparse(blosc2_bslz4_b2dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 80, | |
"id": "e99e195c-bbad-4617-b192-ad72d0e2872c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"33 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bszstd_b2dataset = b2h5py.B2Dataset(blosc2_bszstd_dataset)\n", | |
"\n", | |
"%timeit array_to_sparse(blosc2_bszstd_b2dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f93cbb4d-56f6-4808-a597-94adc77f2455", | |
"metadata": {}, | |
"source": [ | |
"* **With bslz4_to_sparse** for bitshuffle+LZ4 compressed dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "789dec23-978f-4c9c-a866-ec95d629a421", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10.9 ms ± 176 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit bslz4_to_sparse.bslz4_to_sparse(bslz4_dataset, 0, 0)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "bae224e2-8ec0-4ecc-8d28-b6bd127490af", | |
"metadata": {}, | |
"source": [ | |
"* **With b2h5py** for blosc2 compressed datasets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "e68c778e-8310-4b2a-abd6-ff5bce1421ce", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def blosc2_to_sparse(\n", | |
" b2dataset: b2h5py.B2Dataset,\n", | |
" frame: int\n", | |
") -> tuple[int, tuple[np.ndarray, np.ndarray]]:\n", | |
" \"\"\"Reads one frame of a 3D blsoc2 compressed dataset and converts to sparse representation\n", | |
" \"\"\"\n", | |
" assert b2dataset.ndim == 3\n", | |
" ndata = np.prod(b2dataset.shape[1:])\n", | |
" values = np.empty((ndata,), dtype=b2dataset.dtype)\n", | |
" indices = np.empty((ndata,), dtype=np.uint32)\n", | |
" nvalues = 0\n", | |
"\n", | |
" # TODO align reading with blosc2 blocs which are 256x256\n", | |
" nrows = 256\n", | |
" row_length = b2dataset.shape[2]\n", | |
" for row in range(0, b2dataset.shape[1], nrows):\n", | |
" data = np.ravel(b2dataset[frame, row:row+nrows])\n", | |
" partial_indices = np.nonzero(data)[0]\n", | |
" previous_nvalues = nvalues\n", | |
" nvalues += len(partial_indices)\n", | |
" values[previous_nvalues:nvalues] = data[partial_indices]\n", | |
" indices[previous_nvalues:nvalues] = partial_indices + (row * row_length)\n", | |
"\n", | |
" return nvalues, (values, indices)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "ad5e1548-a96b-4fd0-bd86-d00961ac4843", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"51.1 ms ± 4.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bslz4_b2dataset = b2h5py.B2Dataset(blosc2_bslz4_dataset)\n", | |
"\n", | |
"%timeit blosc2_to_sparse(blosc2_bslz4_b2dataset, 0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "e33b031e-168b-4aad-986d-b474883a363c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"46.2 ms ± 1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bszstd_b2dataset = b2h5py.B2Dataset(blosc2_bszstd_dataset)\n", | |
"\n", | |
"%timeit blosc2_to_sparse(blosc2_bszstd_b2dataset, 0)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "58c30aad-43ba-4459-92bb-c43fd3869387", | |
"metadata": {}, | |
"source": [ | |
"### Tests\n", | |
"\n", | |
"Check that all methods returns the same results" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "0ea5fd46-78a0-4e02-96a8-823b095b128b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ref_values, ref_indices = array_to_sparse(bslz4_dataset[0])\n", | |
"ref_nvalues = len(ref_values)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "adc4d43e-d337-47a4-9f4c-676e9d260555", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"blosc2_bslz4_b2dataset = b2h5py.B2Dataset(blosc2_bslz4_dataset)\n", | |
"bslz4_to_sparse_results = bslz4_to_sparse.bslz4_to_sparse(bslz4_dataset, 0, 0)\n", | |
"\n", | |
"assert bslz4_to_sparse_results[0] == ref_nvalues\n", | |
"assert np.array_equal(bslz4_to_sparse_results[1][0][:ref_nvalues], ref_values)\n", | |
"assert np.array_equal(bslz4_to_sparse_results[1][1][:ref_nvalues], ref_indices)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "77c1eeba-93a0-47a9-a954-bfaebcc2ffc4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"blosc2_bszstd_b2dataset = b2h5py.B2Dataset(blosc2_bszstd_dataset)\n", | |
"blosc2_to_sparse_results = blosc2_to_sparse(blosc2_bszstd_b2dataset, 0)\n", | |
"\n", | |
"assert blosc2_to_sparse_results[0] == ref_nvalues\n", | |
"assert np.array_equal(blosc2_to_sparse_results[1][0][:ref_nvalues], ref_values)\n", | |
"assert np.array_equal(blosc2_to_sparse_results[1][1][:ref_nvalues], ref_indices)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4cdcf600-d57b-4ebd-8659-c6de940f8053", | |
"metadata": {}, | |
"source": [ | |
"### Alternative implementation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 61, | |
"id": "c2c37a29-68ea-413f-82de-d28bca9e050b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def read_blosc2_h5_chunk(dataset: h5py.Dataset, chunk_index: int):\n", | |
" h5_chunk_info = dataset.id.get_chunk_info(chunk_index)\n", | |
" b2_schunk = blosc2.schunk.open(\n", | |
" dataset.file.filename,\n", | |
" mode='r',\n", | |
" offset=h5_chunk_info.byte_offset,\n", | |
" )\n", | |
" return b2_schunk[:].view(dtype=dataset.dtype)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"id": "70eb2409-b1d6-4b9c-94a5-5531ecdc0b59", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"28.9 ms ± 1.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit array_to_sparse(read_blosc2_h5_chunk(blosc2_bszstd_dataset, 0))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"id": "af4938b9-65a4-437c-81f2-487b5fd1c6a9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"33.2 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"blosc2_bszstd_b2dataset = b2h5py.B2Dataset(blosc2_bszstd_dataset)\n", | |
"\n", | |
"%timeit array_to_sparse(blosc2_bszstd_b2dataset[0])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"id": "62037b5a-768b-4ea1-a7ff-02d61a326d7b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Check that result is correct\n", | |
"assert np.array_equal(\n", | |
" blosc2_bszstd_dataset[0],\n", | |
" read_blosc2_h5_chunk(blosc2_bszstd_dataset, 0).reshape(blosc2_bszstd_dataset[0].shape)\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
b2h5py | |
blosc2 | |
bslz4_to_sparse | |
h5py | |
hdf5plugin | |
numpy |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment