Created
July 14, 2025 14:25
-
-
Save drbh/3afbc8a6d6b721a9e3ea11ade424a3f9 to your computer and use it in GitHub Desktop.
tool to chunk h5ad files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script | |
# requires-python = ">=3.10" | |
# dependencies = [ | |
# "anndata", | |
# "numpy", | |
# ] | |
# /// | |
import anndata as ad | |
import numpy as np | |
import os | |
import sys | |
# Command-line interface: <input_file> is the .h5ad file to split and
# <chunk_size_gb> is the target on-disk size of each chunk, in GB (1024**3
# bytes, matching the conversions used for the size reports below).
if len(sys.argv) != 3:
    print("Usage: python main.py <input_file> <chunk_size_gb>")
    sys.exit(1)

fpath = sys.argv[1]
if not os.path.isfile(fpath):
    print(f"Error: input file not found: {fpath}")
    sys.exit(1)

try:
    chunk_size_gb = float(sys.argv[2])
except ValueError:
    print(f"Error: <chunk_size_gb> must be a number, got {sys.argv[2]!r}")
    sys.exit(1)

# The number of chunks is derived by dividing the file size by this value, so
# zero, negative, NaN and infinite sizes are rejected here rather than being
# allowed to surface later as a ZeroDivisionError or a meaningless chunk count.
# (The chained comparison is False for NaN, so NaN is rejected as well.)
if not 0 < chunk_size_gb < float("inf"):
    print(f"Error: <chunk_size_gb> must be a positive, finite number, got {chunk_size_gb}")
    sys.exit(1)
# Report basic facts about the input before any chunking is planned.
print("Input File Profiling")
file_size = os.path.getsize(fpath)
print(f"File size: {file_size / (1024**3):.2f} GB ({file_size:,} bytes)")

# NOTE(review): read_h5ad is called without `backed`, which presumably loads
# the whole dataset into memory — confirm the machine has enough RAM for
# very large inputs.
adata = ad.read_h5ad(fpath)
print(f"Shape: {adata.shape} (observations x variables)")

matrix = adata.X
if matrix is None:
    # An AnnData object may carry no X matrix at all; report that instead of
    # failing on attribute access.
    print("Memory usage: n/a (no X matrix)")
    print("Data type: n/a")
else:
    if hasattr(matrix, "indptr"):
        # Compressed sparse layout: the stored values are only part of the
        # footprint, so the index arrays are counted too.
        x_bytes = matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes
    else:
        # Dense array: `.data` is the underlying buffer.
        x_bytes = matrix.data.nbytes
    print(f"Memory usage: {x_bytes / (1024**3):.2f} GB")
    print(f"Data type: {matrix.dtype}")
# Plan the split. The chunk count comes from the ratio of the input file size
# to the requested chunk size; observations are then divided as evenly as
# possible between that many chunks.
target_size = chunk_size_gb * 1024**3
if not target_size > 0:
    # Also catches NaN. Guarding here avoids a ZeroDivisionError below.
    sys.exit("Error: chunk size must be a positive number of GB")

n_obs = len(adata)
requested_chunks = int(np.ceil(file_size / target_size))
# Never plan more chunks than there are observations (that would produce empty
# chunk files) and never fewer than one (that would divide by zero).
n_chunks = max(1, min(requested_chunks, n_obs))

# Evenly spaced integer boundaries: chunk i covers bounds[i]:bounds[i + 1].
# Chunk sizes differ by at most one observation, instead of the whole
# remainder being dumped into the final chunk.
bounds = [i * n_obs // n_chunks for i in range(n_chunks + 1)]
chunk_size = n_obs // n_chunks
if n_obs % n_chunks == 0:
    per_chunk = str(chunk_size)
else:
    per_chunk = f"{chunk_size}-{chunk_size + 1}"

print("\nChunking Plan")
print(f"Target chunk size: {chunk_size_gb} GB")
print(f"Number of chunks: {n_chunks}")
if n_chunks != requested_chunks:
    print(f"(adjusted from {requested_chunks} to fit {n_obs} observations)")
print(f"Observations per chunk: {per_chunk}")
print(f"Last chunk will have: {bounds[-1] - bounds[-2]} observations")
print(f"Estimated chunk file size: ~{file_size / n_chunks / (1024**3):.2f} GB each")

print("\nCreating Chunks")
for i, (start_idx, end_idx) in enumerate(zip(bounds, bounds[1:])):
    # .copy() materialises the slice as an independent object before writing.
    chunk = adata[start_idx:end_idx].copy()

    # Files are written to the current working directory; an existing file
    # with the same name is replaced.
    chunk_filename = f'chunk_{i:03d}.h5ad'
    chunk.write_h5ad(chunk_filename)

    chunk_file_size = os.path.getsize(chunk_filename)
    print(f'Chunk {i+1}/{n_chunks}: {start_idx}-{end_idx} ({end_idx-start_idx} obs) -> {chunk_file_size / (1024**3):.2f} GB')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment