Skip to content

Instantly share code, notes, and snippets.

@drbh
Created July 14, 2025 14:25
Show Gist options
  • Save drbh/3afbc8a6d6b721a9e3ea11ade424a3f9 to your computer and use it in GitHub Desktop.
Save drbh/3afbc8a6d6b721a9e3ea11ade424a3f9 to your computer and use it in GitHub Desktop.
Tool to split an .h5ad (AnnData) file into smaller chunk files of roughly a target size in GB
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "anndata",
# "numpy",
# ]
# ///
import anndata as ad
import numpy as np
import os
import sys
def matrix_nbytes(matrix):
    """Return the number of bytes held by the array buffers of *matrix*.

    Handles the three shapes ``adata.X`` can take after an in-memory read:

    * ``None`` (no data matrix): reported as 0 bytes.
    * A compressed sparse matrix (anything exposing ``indptr``): the values,
      ``indices`` and ``indptr`` arrays are all counted, since counting only
      ``data`` under-reports the real footprint.
    * A dense array: its own ``nbytes``.
    """
    if matrix is None:
        return 0
    if hasattr(matrix, "indptr"):
        return matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes
    if hasattr(matrix, "nbytes"):
        return matrix.nbytes
    # Last resort: same expression the script used originally.
    return matrix.data.nbytes


def plan_chunks(n_obs, file_size, target_size):
    """Compute the observation ranges for each output chunk.

    Args:
        n_obs: Number of observations (rows) in the input.
        file_size: Size of the input file on disk, in bytes.
        target_size: Desired size of each chunk file, in bytes (must be > 0).

    Returns:
        A list of ``(start, end)`` half-open row ranges. Every chunk except the
        last holds ``n_obs // n_chunks`` observations; the last one also takes
        the remainder. At least one range is always returned, and there are
        never more ranges than observations (so no chunk is empty unless the
        input itself has no observations).
    """
    ratio = file_size / target_size
    # Cap at n_obs before converting to int: a chunk cannot hold less than one
    # observation, and this also avoids int() on an infinite ratio.
    n_chunks = n_obs if ratio >= n_obs else int(np.ceil(ratio))
    # An empty file or empty dataset still produces exactly one chunk.
    n_chunks = max(1, n_chunks)

    chunk_size = n_obs // n_chunks
    bounds = [(i * chunk_size, (i + 1) * chunk_size) for i in range(n_chunks - 1)]
    bounds.append(((n_chunks - 1) * chunk_size, n_obs))
    return bounds


def main(argv=None):
    """Split an .h5ad file into ``chunk_NNN.h5ad`` files in the working directory.

    Args:
        argv: Argument vector in ``sys.argv`` layout
            (``[prog, input_file, chunk_size_gb]``). Defaults to ``sys.argv``.

    Exits with status 1 on a wrong argument count or an invalid chunk size.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        print("Usage: python main.py <input_file> <chunk_size_gb>")
        sys.exit(1)

    fpath = argv[1]
    try:
        chunk_size_gb = float(argv[2])
    except ValueError:
        sys.exit(f"chunk_size_gb must be a number, got {argv[2]!r}")
    # `not x > 0` also rejects NaN, which compares false against everything.
    if not chunk_size_gb > 0:
        sys.exit(f"chunk_size_gb must be greater than 0, got {argv[2]!r}")

    print("Input File Profiling")
    file_size = os.path.getsize(fpath)
    print(f"File size: {file_size / (1024**3):.2f} GB ({file_size:,} bytes)")

    # NOTE: this loads the whole dataset into memory before slicing.
    adata = ad.read_h5ad(fpath)
    n_obs = adata.n_obs
    print(f"Shape: {adata.shape} (observations x variables)")
    print(f"Memory usage: {matrix_nbytes(adata.X) / (1024**3):.2f} GB")
    print(f"Data type: {getattr(adata.X, 'dtype', None)}")

    # The plan is driven by the on-disk size, so the resulting chunk files
    # only approximate the requested target size.
    target_size = chunk_size_gb * 1024**3
    bounds = plan_chunks(n_obs, file_size, target_size)
    n_chunks = len(bounds)
    first_start, first_end = bounds[0]
    last_start, last_end = bounds[-1]

    print("\nChunking Plan")
    print(f"Target chunk size: {chunk_size_gb} GB")
    print(f"Number of chunks: {n_chunks}")
    print(f"Observations per chunk: {first_end - first_start}")
    print(f"Last chunk will have: {last_end - last_start} observations")
    print(f"Estimated chunk file size: ~{file_size / n_chunks / (1024**3):.2f} GB each")

    print("\nCreating Chunks")
    for i, (start_idx, end_idx) in enumerate(bounds):
        # .copy() materializes the sliced view as an independent object
        # before it is written out.
        chunk = adata[start_idx:end_idx].copy()
        chunk_filename = f"chunk_{i:03d}.h5ad"
        chunk.write_h5ad(chunk_filename)
        chunk_file_size = os.path.getsize(chunk_filename)
        print(
            f"Chunk {i+1}/{n_chunks}: {start_idx}-{end_idx} "
            f"({end_idx-start_idx} obs) -> {chunk_file_size / (1024**3):.2f} GB"
        )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment