bmorris3 · January 15, 2026 14:58
diff --git a/hdf5-to-zarr-minimal.ipynb b/hdf5-to-zarr-minimal.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8f988550-1925-4851-9f0d-3ecd939257ae",
   "metadata": {},
   "source": [
    "# Minimal Example: MAESTRO HDF5 to Zarr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95a4b1a1-f682-400e-99e4-2ef3e6274327",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import json\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import h5py\n",
    "import zarr\n",
    "\n",
    "from astropy.table import Table"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e9f6a5f-e90b-4f60-b08c-f27855f25965",
   "metadata": {},
   "source": [
    "The following cell depends on having a the grid definition file in the working directory. That file is stored on `dlsci210` at `/internal/data1/getting-started/grid1460.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c8cb17f-ca8d-4e13-9d5d-9c9cb2655f7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "grid = Table.read('grid1460.csv')\n",
    "grid['index'] = grid['file_number'] - 1\n",
    "grid.add_index('index')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf2dca8a-f975-4fbc-a9cf-2e2de43b1279",
   "metadata": {},
   "source": [
    "Here we set the paths for files to read/write:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d68cac62-2d88-4054-bac6-6ace55100060",
   "metadata": {},
   "outputs": [],
   "source": [
    "# path to the HDF5 MAESTRO file:\n",
    "h5_archive_path = '../12C-H4.h5'\n",
    "\n",
    "# path to the JSON MAESTRO metadata:\n",
    "metadata_json_path = '14N-1H3.json'\n",
    "\n",
    "# path for writing the new zarr array:\n",
    "zarr_path = f'{path}.zarr'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "423508c0-d36c-4b46-aeb9-70ebdc38a87f",
   "metadata": {},
   "source": [
    "Open the file, retrieve (p, T) coordinates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d54ea68a-041e-4e3e-9242-899801a88752",
   "metadata": {},
   "outputs": [],
   "source": [
    "h5_file = h5py.File(h5_archive_path, 'r')\n",
    "press_coords = h5_file['pressure_coords'][:]\n",
    "temp_coords = h5_file['temperature_coords'][:]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "160ab8d1-5cbd-416d-ba45-e34836ad2058",
   "metadata": {},
   "source": [
    "Split the molecule string name out of the filename"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "557aa663-760a-452e-8160-596e38513d7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract only the molecule name, the molecule name plus \".zarr\" will be the zarr directory name:\n",
    "path = h5_archive_path.split('/')[-1].split('.h5')[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c89364e7-cbeb-4f1a-ba45-d01568c54f1d",
   "metadata": {},
   "source": [
    "Here we downsample the wavenumber grid. Later, we'll interpolate the higher resolution H5 archive onto this coarser grid."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "799245d8-2991-45a9-bba3-c62331bf5f6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Natasha's recommended uniform wavenumber sampling \n",
    "# (MAESTRO slack Nov 25 2025)\n",
    "row = grid[0]\n",
    "start = row['start_wavenumber']\n",
    "number_wave_pts = row['number_wave_pts']\n",
    "delta_wavenumber = row['delta_wavenumber']\n",
    "new_wvno_grid = np.arange(number_wave_pts) * delta_wavenumber + start\n",
    "\n",
    "wavenumber_sampling = new_wvno_grid[::2]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "702bedbb-4e0f-417f-b89f-29366ba0b5bb",
   "metadata": {},
   "source": [
    "Create a local zarr array at `zarr_path`. Store the float array with Blosc compression at the highest level, using the zstd (lossless) compression algorithm."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "081a96ac-976d-4eb5-9cb1-4e2c08ba7bde",
   "metadata": {},
   "outputs": [],
   "source": [
    "min_wavenumber_pts = int(grid['number_wave_pts'].min())\n",
    "\n",
    "temperatures = np.sort(list(set(temp_coords)))\n",
    "pressures = np.sort(list(set(press_coords)))\n",
    "\n",
    "# these compression settings will apply to every array:\n",
    "compression = dict(\n",
    "    compressors=zarr.codecs.BloscCodec(\n",
    "        cname=\"zstd\",\n",
    "        # here we use the maximum compression level,\n",
    "        # which takes longer to compress (one time)\n",
    "        # but not more time to decompress (many times)\n",
    "        clevel=9,\n",
    "        shuffle=zarr.codecs.BloscShuffle.shuffle\n",
    "    )\n",
    ")\n",
    "\n",
    "store = zarr.storage.LocalStore(zarr_path)\n",
    "root = zarr.create_group(store, overwrite='w', zarr_format=3)\n",
    "root.create_array(\n",
    "    'temperature', \n",
    "    data=temperatures, \n",
    "    dimension_names=('temperature',), \n",
    "    attributes=dict(coordinates='temperature', unit='K'),\n",
    "    **compression\n",
    ")\n",
    "root.create_array(\n",
    "    'pressure', \n",
    "    data=pressures, \n",
    "    dimension_names=('pressure',), \n",
    "    attributes=dict(coordinates='pressure', unit='bar'),\n",
    "    **compression\n",
    ")\n",
    "root.create_array(\n",
    "    'wavenumber', \n",
    "    data=wavenumber_sampling, \n",
    "    dimension_names=('wavenumber',), \n",
    "    attributes=dict(coordinates='wavenumber', unit='cm-1'),\n",
    "    **compression\n",
    ")\n",
    "\n",
    "arr_shape = (wavenumber_sampling.size, temperatures.size, pressures.size)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "743fc510-6957-4890-93b5-2b114c9988cf",
   "metadata": {},
   "source": [
    "Create a dictionary with the array attributes, starting with the file-specific metadata:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b3b4eeb-11bf-4c44-ad9f-5902a7e02f58",
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_attrs = dict(\n",
    "    # `coordinates` is required for remote indexing with xarray,\n",
    "    # don't change this entry!\n",
    "    coordinates='wavenumber temperature pressure',\n",
    "\n",
    "    # recording the compression settings, don't change this!\n",
    "    compression=dict(\n",
    "        codec=compression['compressors'].__class__.__name__,\n",
    "        algorithm=compression['compressors'].cname.name,\n",
    "        clevel=compression['compressors'].clevel,\n",
    "        shuffle=compression['compressors'].shuffle.__class__.__name__,\n",
    "    )\n",
    "    \n",
    "    # this top-level molecule name will be a bit easier to access than the \n",
    "    # maestro metadata, where it would be: attrs['calculation']['iso_mol']\n",
    "    molecule=path,\n",
    "\n",
    "    # cross section units (astropy.units compatible string):\n",
    "    unit='cm2',\n",
    "\n",
    "    source=dict(\n",
    "        database='MAESTRO',\n",
    "        created=str(datetime.datetime.now()),\n",
    "        version=0.1  # or pick your number\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "978fa2d1-53db-4834-9d84-69f9dc370b86",
   "metadata": {},
   "source": [
    "Now load the MAESTRO metadata from a JSON file, and add it to the array attributes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1aca817-0831-4ea5-9583-3251787af82e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# open the maestro metadata dictionary:\n",
    "maestro_metadata = json.load(open(metadata_json_path))\n",
    "\n",
    "# combine the array attributes with the maestro metadata:\n",
    "arr_attrs.update(maestro_metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7034de88-8924-4145-81dc-4fa4ef14edfc",
   "metadata": {},
   "source": [
    "Interolate the higher resolution H5 cross section array onto the lower resolution grid for zarr, store the values in the zarr array."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5cb699a7-a23f-41e2-a3e7-7d8d44457ba6",
   "metadata": {},
   "outputs": [],
   "source": [
    "dimension_names = 'wavenumber temperature pressure'.split()\n",
    "\n",
    "arr = root.create_array(\n",
    "    'cross_section', \n",
    "    shape=arr_shape, \n",
    "    dtype=np.float64, \n",
    "    dimension_names=dimension_names,\n",
    "    attributes=arr_attrs,\n",
    "    **compression\n",
    ")\n",
    "\n",
    "# consolidation makes for efficient remote indexing:\n",
    "zarr.consolidate_metadata(store)\n",
    "\n",
    "\n",
    "# loop over temperature and pressure to downsample the \n",
    "# wavenumber grid and store the results in the zarr array:\n",
    "for i, temperature in tqdm(enumerate(temperatures), total=len(temperatures)):\n",
    "    for j, pressure in enumerate(pressures):\n",
    "\n",
    "        # find the corresponding (p, T) grid point entry in the table:\n",
    "        nearest_coord = np.argmin(np.hypot(grid['pressure_bar'] - pressure, grid['temperature_K'] - temperature))        \n",
    "        grid_point = dict(grid.loc[nearest_coord])\n",
    "\n",
    "        # lookup the cross sections at (p, T):\n",
    "        cross_section = h5_file['cxs'][nearest_coord]\n",
    "\n",
    "        # reconstruct the wavenumber grid in the H5 file:\n",
    "        wavenumber = np.arange(grid_point['number_wave_pts']) * grid_point['delta_wavenumber'] + grid_point['start_wavenumber']\n",
    "\n",
    "        # interpolate onto the downsampled wavenumber grid, store in zarr\n",
    "        arr[:, i, j] = np.interp(wavenumber_sampling, wavenumber, cross_section)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3657e9cc-f2cb-4359-8677-eba8a70e0d28",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "8f988550-1925-4851-9f0d-3ecd939257ae",
	"metadata": {},
	"source": [
	"# Minimal Example: MAESTRO HDF5 to Zarr"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "95a4b1a1-f682-400e-99e4-2ef3e6274327",
	"metadata": {},
	"outputs": [],
	"source": [
	"import datetime\n",
	"import json\n",
	"from tqdm.auto import tqdm\n",
	"\n",
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"\n",
	"import h5py\n",
	"import zarr\n",
	"\n",
	"from astropy.table import Table"
	]
	},
	{
	"cell_type": "markdown",
	"id": "1e9f6a5f-e90b-4f60-b08c-f27855f25965",
	"metadata": {},
	"source": [
	"The following cell depends on having a the grid definition file in the working directory. That file is stored on `dlsci210` at `/internal/data1/getting-started/grid1460.csv`."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "6c8cb17f-ca8d-4e13-9d5d-9c9cb2655f7c",
	"metadata": {},
	"outputs": [],
	"source": [
	"grid = Table.read('grid1460.csv')\n",
	"grid['index'] = grid['file_number'] - 1\n",
	"grid.add_index('index')"
	]
	},
	{
	"cell_type": "markdown",
	"id": "cf2dca8a-f975-4fbc-a9cf-2e2de43b1279",
	"metadata": {},
	"source": [
	"Here we set the paths for files to read/write:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d68cac62-2d88-4054-bac6-6ace55100060",
	"metadata": {},
	"outputs": [],
	"source": [
	"# path to the HDF5 MAESTRO file:\n",
	"h5_archive_path = '../12C-H4.h5'\n",
	"\n",
	"# path to the JSON MAESTRO metadata:\n",
	"metadata_json_path = '14N-1H3.json'\n",
	"\n",
	"# path for writing the new zarr array:\n",
	"zarr_path = f'{path}.zarr'"
	]
	},
	{
	"cell_type": "markdown",
	"id": "423508c0-d36c-4b46-aeb9-70ebdc38a87f",
	"metadata": {},
	"source": [
	"Open the file, retrieve (p, T) coordinates:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d54ea68a-041e-4e3e-9242-899801a88752",
	"metadata": {},
	"outputs": [],
	"source": [
	"h5_file = h5py.File(h5_archive_path, 'r')\n",
	"press_coords = h5_file['pressure_coords'][:]\n",
	"temp_coords = h5_file['temperature_coords'][:]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "160ab8d1-5cbd-416d-ba45-e34836ad2058",
	"metadata": {},
	"source": [
	"Split the molecule string name out of the filename"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "557aa663-760a-452e-8160-596e38513d7b",
	"metadata": {},
	"outputs": [],
	"source": [
	"# extract only the molecule name, the molecule name plus \".zarr\" will be the zarr directory name:\n",
	"path = h5_archive_path.split('/')[-1].split('.h5')[0]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "c89364e7-cbeb-4f1a-ba45-d01568c54f1d",
	"metadata": {},
	"source": [
	"Here we downsample the wavenumber grid. Later, we'll interpolate the higher resolution H5 archive onto this coarser grid."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "799245d8-2991-45a9-bba3-c62331bf5f6e",
	"metadata": {},
	"outputs": [],
	"source": [
	"# Natasha's recommended uniform wavenumber sampling \n",
	"# (MAESTRO slack Nov 25 2025)\n",
	"row = grid[0]\n",
	"start = row['start_wavenumber']\n",
	"number_wave_pts = row['number_wave_pts']\n",
	"delta_wavenumber = row['delta_wavenumber']\n",
	"new_wvno_grid = np.arange(number_wave_pts) * delta_wavenumber + start\n",
	"\n",
	"wavenumber_sampling = new_wvno_grid[::2]"
	]
	},
	{
	"cell_type": "markdown",
	"id": "702bedbb-4e0f-417f-b89f-29366ba0b5bb",
	"metadata": {},
	"source": [
	"Create a local zarr array at `zarr_path`. Store the float array with Blosc compression at the highest level, using the zstd (lossless) compression algorithm."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "081a96ac-976d-4eb5-9cb1-4e2c08ba7bde",
	"metadata": {},
	"outputs": [],
	"source": [
	"min_wavenumber_pts = int(grid['number_wave_pts'].min())\n",
	"\n",
	"temperatures = np.sort(list(set(temp_coords)))\n",
	"pressures = np.sort(list(set(press_coords)))\n",
	"\n",
	"# these compression settings will apply to every array:\n",
	"compression = dict(\n",
	" compressors=zarr.codecs.BloscCodec(\n",
	" cname=\"zstd\",\n",
	" # here we use the maximum compression level,\n",
	" # which takes longer to compress (one time)\n",
	" # but not more time to decompress (many times)\n",
	" clevel=9,\n",
	" shuffle=zarr.codecs.BloscShuffle.shuffle\n",
	" )\n",
	")\n",
	"\n",
	"store = zarr.storage.LocalStore(zarr_path)\n",
	"root = zarr.create_group(store, overwrite='w', zarr_format=3)\n",
	"root.create_array(\n",
	" 'temperature', \n",
	" data=temperatures, \n",
	" dimension_names=('temperature',), \n",
	" attributes=dict(coordinates='temperature', unit='K'),\n",
	" **compression\n",
	")\n",
	"root.create_array(\n",
	" 'pressure', \n",
	" data=pressures, \n",
	" dimension_names=('pressure',), \n",
	" attributes=dict(coordinates='pressure', unit='bar'),\n",
	" **compression\n",
	")\n",
	"root.create_array(\n",
	" 'wavenumber', \n",
	" data=wavenumber_sampling, \n",
	" dimension_names=('wavenumber',), \n",
	" attributes=dict(coordinates='wavenumber', unit='cm-1'),\n",
	" **compression\n",
	")\n",
	"\n",
	"arr_shape = (wavenumber_sampling.size, temperatures.size, pressures.size)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "743fc510-6957-4890-93b5-2b114c9988cf",
	"metadata": {},
	"source": [
	"Create a dictionary with the array attributes, starting with the file-specific metadata:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "7b3b4eeb-11bf-4c44-ad9f-5902a7e02f58",
	"metadata": {},
	"outputs": [],
	"source": [
	"arr_attrs = dict(\n",
	" # `coordinates` is required for remote indexing with xarray,\n",
	" # don't change this entry!\n",
	" coordinates='wavenumber temperature pressure',\n",
	"\n",
	" # recording the compression settings, don't change this!\n",
	" compression=dict(\n",
	" codec=compression['compressors'].__class__.__name__,\n",
	" algorithm=compression['compressors'].cname.name,\n",
	" clevel=compression['compressors'].clevel,\n",
	" shuffle=compression['compressors'].shuffle.__class__.__name__,\n",
	" )\n",
	" \n",
	" # this top-level molecule name will be a bit easier to access than the \n",
	" # maestro metadata, where it would be: attrs['calculation']['iso_mol']\n",
	" molecule=path,\n",
	"\n",
	" # cross section units (astropy.units compatible string):\n",
	" unit='cm2',\n",
	"\n",
	" source=dict(\n",
	" database='MAESTRO',\n",
	" created=str(datetime.datetime.now()),\n",
	" version=0.1 # or pick your number\n",
	" ),\n",
	")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "978fa2d1-53db-4834-9d84-69f9dc370b86",
	"metadata": {},
	"source": [
	"Now load the MAESTRO metadata from a JSON file, and add it to the array attributes:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "d1aca817-0831-4ea5-9583-3251787af82e",
	"metadata": {},
	"outputs": [],
	"source": [
	"# open the maestro metadata dictionary:\n",
	"maestro_metadata = json.load(open(metadata_json_path))\n",
	"\n",
	"# combine the array attributes with the maestro metadata:\n",
	"arr_attrs.update(maestro_metadata)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "7034de88-8924-4145-81dc-4fa4ef14edfc",
	"metadata": {},
	"source": [
	"Interolate the higher resolution H5 cross section array onto the lower resolution grid for zarr, store the values in the zarr array."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "5cb699a7-a23f-41e2-a3e7-7d8d44457ba6",
	"metadata": {},
	"outputs": [],
	"source": [
	"dimension_names = 'wavenumber temperature pressure'.split()\n",
	"\n",
	"arr = root.create_array(\n",
	" 'cross_section', \n",
	" shape=arr_shape, \n",
	" dtype=np.float64, \n",
	" dimension_names=dimension_names,\n",
	" attributes=arr_attrs,\n",
	" **compression\n",
	")\n",
	"\n",
	"# consolidation makes for efficient remote indexing:\n",
	"zarr.consolidate_metadata(store)\n",
	"\n",
	"\n",
	"# loop over temperature and pressure to downsample the \n",
	"# wavenumber grid and store the results in the zarr array:\n",
	"for i, temperature in tqdm(enumerate(temperatures), total=len(temperatures)):\n",
	" for j, pressure in enumerate(pressures):\n",
	"\n",
	" # find the corresponding (p, T) grid point entry in the table:\n",
	" nearest_coord = np.argmin(np.hypot(grid['pressure_bar'] - pressure, grid['temperature_K'] - temperature)) \n",
	" grid_point = dict(grid.loc[nearest_coord])\n",
	"\n",
	" # lookup the cross sections at (p, T):\n",
	" cross_section = h5_file['cxs'][nearest_coord]\n",
	"\n",
	" # reconstruct the wavenumber grid in the H5 file:\n",
	" wavenumber = np.arange(grid_point['number_wave_pts']) * grid_point['delta_wavenumber'] + grid_point['start_wavenumber']\n",
	"\n",
	" # interpolate onto the downsampled wavenumber grid, store in zarr\n",
	" arr[:, i, j] = np.interp(wavenumber_sampling, wavenumber, cross_section)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"id": "3657e9cc-f2cb-4359-8677-eba8a70e0d28",
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3 (ipykernel)",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.12.12"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found