Last active
January 15, 2026 14:58
-
-
Save bmorris3/6b110f19cc21f0ec56c4cbf99acdaa40 to your computer and use it in GitHub Desktop.
example for building MAESTRO Zarr arrays
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "8f988550-1925-4851-9f0d-3ecd939257ae", | |
| "metadata": {}, | |
| "source": [ | |
| "# Minimal Example: MAESTRO HDF5 to Zarr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "95a4b1a1-f682-400e-99e4-2ef3e6274327", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import datetime\n", | |
| "import json\n", | |
| "from tqdm.auto import tqdm\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "\n", | |
| "import h5py\n", | |
| "import zarr\n", | |
| "\n", | |
| "from astropy.table import Table" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "1e9f6a5f-e90b-4f60-b08c-f27855f25965", | |
| "metadata": {}, | |
| "source": [ | |
| "The following cell depends on having a the grid definition file in the working directory. That file is stored on `dlsci210` at `/internal/data1/getting-started/grid1460.csv`." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "6c8cb17f-ca8d-4e13-9d5d-9c9cb2655f7c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "grid = Table.read('grid1460.csv')\n", | |
| "grid['index'] = grid['file_number'] - 1\n", | |
| "grid.add_index('index')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "cf2dca8a-f975-4fbc-a9cf-2e2de43b1279", | |
| "metadata": {}, | |
| "source": [ | |
| "Here we set the paths for files to read/write:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d68cac62-2d88-4054-bac6-6ace55100060", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# path to the HDF5 MAESTRO file:\n", | |
| "h5_archive_path = '../12C-H4.h5'\n", | |
| "\n", | |
| "# path to the JSON MAESTRO metadata:\n", | |
| "metadata_json_path = '14N-1H3.json'\n", | |
| "\n", | |
| "# path for writing the new zarr array:\n", | |
| "zarr_path = f'{path}.zarr'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "423508c0-d36c-4b46-aeb9-70ebdc38a87f", | |
| "metadata": {}, | |
| "source": [ | |
| "Open the file, retrieve (p, T) coordinates:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d54ea68a-041e-4e3e-9242-899801a88752", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "h5_file = h5py.File(h5_archive_path, 'r')\n", | |
| "press_coords = h5_file['pressure_coords'][:]\n", | |
| "temp_coords = h5_file['temperature_coords'][:]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "160ab8d1-5cbd-416d-ba45-e34836ad2058", | |
| "metadata": {}, | |
| "source": [ | |
| "Split the molecule string name out of the filename" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "557aa663-760a-452e-8160-596e38513d7b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# extract only the molecule name, the molecule name plus \".zarr\" will be the zarr directory name:\n", | |
| "path = h5_archive_path.split('/')[-1].split('.h5')[0]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "c89364e7-cbeb-4f1a-ba45-d01568c54f1d", | |
| "metadata": {}, | |
| "source": [ | |
| "Here we downsample the wavenumber grid. Later, we'll interpolate the higher resolution H5 archive onto this coarser grid." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "799245d8-2991-45a9-bba3-c62331bf5f6e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Natasha's recommended uniform wavenumber sampling \n", | |
| "# (MAESTRO slack Nov 25 2025)\n", | |
| "row = grid[0]\n", | |
| "start = row['start_wavenumber']\n", | |
| "number_wave_pts = row['number_wave_pts']\n", | |
| "delta_wavenumber = row['delta_wavenumber']\n", | |
| "new_wvno_grid = np.arange(number_wave_pts) * delta_wavenumber + start\n", | |
| "\n", | |
| "wavenumber_sampling = new_wvno_grid[::2]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "702bedbb-4e0f-417f-b89f-29366ba0b5bb", | |
| "metadata": {}, | |
| "source": [ | |
| "Create a local zarr array at `zarr_path`. Store the float array with Blosc compression at the highest level, using the zstd (lossless) compression algorithm." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "081a96ac-976d-4eb5-9cb1-4e2c08ba7bde", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "min_wavenumber_pts = int(grid['number_wave_pts'].min())\n", | |
| "\n", | |
| "temperatures = np.sort(list(set(temp_coords)))\n", | |
| "pressures = np.sort(list(set(press_coords)))\n", | |
| "\n", | |
| "# these compression settings will apply to every array:\n", | |
| "compression = dict(\n", | |
| " compressors=zarr.codecs.BloscCodec(\n", | |
| " cname=\"zstd\",\n", | |
| " # here we use the maximum compression level,\n", | |
| " # which takes longer to compress (one time)\n", | |
| " # but not more time to decompress (many times)\n", | |
| " clevel=9,\n", | |
| " shuffle=zarr.codecs.BloscShuffle.shuffle\n", | |
| " )\n", | |
| ")\n", | |
| "\n", | |
| "store = zarr.storage.LocalStore(zarr_path)\n", | |
| "root = zarr.create_group(store, overwrite='w', zarr_format=3)\n", | |
| "root.create_array(\n", | |
| " 'temperature', \n", | |
| " data=temperatures, \n", | |
| " dimension_names=('temperature',), \n", | |
| " attributes=dict(coordinates='temperature', unit='K'),\n", | |
| " **compression\n", | |
| ")\n", | |
| "root.create_array(\n", | |
| " 'pressure', \n", | |
| " data=pressures, \n", | |
| " dimension_names=('pressure',), \n", | |
| " attributes=dict(coordinates='pressure', unit='bar'),\n", | |
| " **compression\n", | |
| ")\n", | |
| "root.create_array(\n", | |
| " 'wavenumber', \n", | |
| " data=wavenumber_sampling, \n", | |
| " dimension_names=('wavenumber',), \n", | |
| " attributes=dict(coordinates='wavenumber', unit='cm-1'),\n", | |
| " **compression\n", | |
| ")\n", | |
| "\n", | |
| "arr_shape = (wavenumber_sampling.size, temperatures.size, pressures.size)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "743fc510-6957-4890-93b5-2b114c9988cf", | |
| "metadata": {}, | |
| "source": [ | |
| "Create a dictionary with the array attributes, starting with the file-specific metadata:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "7b3b4eeb-11bf-4c44-ad9f-5902a7e02f58", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "arr_attrs = dict(\n", | |
| " # `coordinates` is required for remote indexing with xarray,\n", | |
| " # don't change this entry!\n", | |
| " coordinates='wavenumber temperature pressure',\n", | |
| "\n", | |
| " # recording the compression settings, don't change this!\n", | |
| " compression=dict(\n", | |
| " codec=compression['compressors'].__class__.__name__,\n", | |
| " algorithm=compression['compressors'].cname.name,\n", | |
| " clevel=compression['compressors'].clevel,\n", | |
| " shuffle=compression['compressors'].shuffle.__class__.__name__,\n", | |
| " )\n", | |
| " \n", | |
| " # this top-level molecule name will be a bit easier to access than the \n", | |
| " # maestro metadata, where it would be: attrs['calculation']['iso_mol']\n", | |
| " molecule=path,\n", | |
| "\n", | |
| " # cross section units (astropy.units compatible string):\n", | |
| " unit='cm2',\n", | |
| "\n", | |
| " source=dict(\n", | |
| " database='MAESTRO',\n", | |
| " created=str(datetime.datetime.now()),\n", | |
| " version=0.1 # or pick your number\n", | |
| " ),\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "978fa2d1-53db-4834-9d84-69f9dc370b86", | |
| "metadata": {}, | |
| "source": [ | |
| "Now load the MAESTRO metadata from a JSON file, and add it to the array attributes:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "d1aca817-0831-4ea5-9583-3251787af82e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# open the maestro metadata dictionary:\n", | |
| "maestro_metadata = json.load(open(metadata_json_path))\n", | |
| "\n", | |
| "# combine the array attributes with the maestro metadata:\n", | |
| "arr_attrs.update(maestro_metadata)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7034de88-8924-4145-81dc-4fa4ef14edfc", | |
| "metadata": {}, | |
| "source": [ | |
| "Interolate the higher resolution H5 cross section array onto the lower resolution grid for zarr, store the values in the zarr array." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "5cb699a7-a23f-41e2-a3e7-7d8d44457ba6", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "dimension_names = 'wavenumber temperature pressure'.split()\n", | |
| "\n", | |
| "arr = root.create_array(\n", | |
| " 'cross_section', \n", | |
| " shape=arr_shape, \n", | |
| " dtype=np.float64, \n", | |
| " dimension_names=dimension_names,\n", | |
| " attributes=arr_attrs,\n", | |
| " **compression\n", | |
| ")\n", | |
| "\n", | |
| "# consolidation makes for efficient remote indexing:\n", | |
| "zarr.consolidate_metadata(store)\n", | |
| "\n", | |
| "\n", | |
| "# loop over temperature and pressure to downsample the \n", | |
| "# wavenumber grid and store the results in the zarr array:\n", | |
| "for i, temperature in tqdm(enumerate(temperatures), total=len(temperatures)):\n", | |
| " for j, pressure in enumerate(pressures):\n", | |
| "\n", | |
| " # find the corresponding (p, T) grid point entry in the table:\n", | |
| " nearest_coord = np.argmin(np.hypot(grid['pressure_bar'] - pressure, grid['temperature_K'] - temperature)) \n", | |
| " grid_point = dict(grid.loc[nearest_coord])\n", | |
| "\n", | |
| " # lookup the cross sections at (p, T):\n", | |
| " cross_section = h5_file['cxs'][nearest_coord]\n", | |
| "\n", | |
| " # reconstruct the wavenumber grid in the H5 file:\n", | |
| " wavenumber = np.arange(grid_point['number_wave_pts']) * grid_point['delta_wavenumber'] + grid_point['start_wavenumber']\n", | |
| "\n", | |
| " # interpolate onto the downsampled wavenumber grid, store in zarr\n", | |
| " arr[:, i, j] = np.interp(wavenumber_sampling, wavenumber, cross_section)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "3657e9cc-f2cb-4359-8677-eba8a70e0d28", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.12" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment