Last active
April 16, 2025 13:47
-
-
Save vadimkantorov/883164bd9c88ceaee28daa0b353e6360 to your computer and use it in GitHub Desktop.
Convert Parquet tables to npy (as record array) or npz (as columns)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage:
#   python parquet2npyz.py test.npy data/train-*-of-*.parquet   # one record array, one field per column
#   python parquet2npyz.py test.npz data/train-*-of-*.parquet   # one archive entry per column
import sys

import numpy as np
import pyarrow.parquet as pq
USAGE = "Usage: python parquet2npyz.py OUTPUT.{npy,npz} INPUT.parquet [INPUT.parquet ...]"
SUPPORTED_SUFFIXES = (".npy", ".npz")


def read_column_chunks(input_paths):
    """Read every Parquet file and collect each column's per-file NumPy array.

    Files are visited in sorted path order so the row order of the result is
    deterministic regardless of shell glob ordering.

    Returns a dict mapping column name -> list of 1-D arrays (one per file
    that contains the column), in first-seen column order.
    """
    chunks = {}
    for input_path in sorted(input_paths):
        table = pq.read_table(input_path)
        for col_name in table.column_names:
            # to_numpy() returns dtype=object for string columns
            chunks.setdefault(col_name, []).append(table[col_name].to_numpy())
    return chunks


def concat_column_chunks(chunks):
    """Join each column's list of per-file arrays into one array.

    A single np.concatenate per column keeps the source dtype (the previous
    np.append onto an empty list promoted integer/bool columns to float64)
    and copies the data once instead of once per input file.

    `chunks` maps column name -> non-empty list of 1-D arrays; the result
    maps column name -> concatenated array, preserving key order.
    """
    return {name: np.concatenate(parts) for name, parts in chunks.items()}


def save_columns(output_path, cols):
    """Write `cols` (column name -> array) to `output_path`.

    `.npy` output stores a single record array with one field per column;
    `.npz` output stores one archive entry per column. Raises ValueError for
    any other suffix.
    """
    if output_path.endswith(".npy"):
        names = list(cols)
        record_array = np.rec.fromarrays([cols[name] for name in names], names=names)
        # Object (e.g. string) fields can only be stored via pickle.
        np.save(output_path, record_array, allow_pickle=True)
    elif output_path.endswith(".npz"):
        # allow_pickle is deliberately not passed: pickling object arrays is
        # already np.savez's default, and on NumPy versions whose savez has no
        # such parameter the keyword would be stored as an extra array named
        # "allow_pickle".
        np.savez(output_path, **cols)
    else:
        raise ValueError(
            "unsupported output format (expected .npy or .npz): " + output_path
        )


def main(argv):
    """Convert the Parquet files named in `argv[1:]` into the file `argv[0]`."""
    if not argv:
        sys.exit(USAGE)
    output_path, *input_paths = argv
    # Reject an unsupported output suffix before doing any expensive reading.
    if not output_path.endswith(SUPPORTED_SUFFIXES):
        sys.exit(USAGE)
    cols = concat_column_chunks(read_column_chunks(input_paths))
    save_columns(output_path, cols)


if __name__ == "__main__":
    main(sys.argv[1:])
# NOTE: pickle is required because NumPy does not currently support saving/loading
# variable-length string arrays without it:
# https://github.com/numpy/numpy/issues/25693#issuecomment-2809206226
# To load the result: np.load(output_path, allow_pickle=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment