Created
March 25, 2025 21:05
-
-
Save DeflateAwning/dd19fd9089e7529b6d26322c4aed042d to your computer and use it in GitHub Desktop.
Benchmark the conversion of Numpy to Polars vs. Numpy to Pandas to Polars
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Benchmark converting NumPy data to Polars directly vs. via Pandas.

Related to https://github.com/pydata/xarray/issues/10135

Result (in VS Code debugger):
Shape | NumPy → Polars | NumPy → Pandas → Polars
-----------------------------------------------------------------
10,000 x 10 | 0.003997 s | 0.033097 s
10,000 x 200 | 0.002424 s | 0.050915 s
100,000 x 10 | 0.000278 s | 0.021541 s
100,000 x 200 | 0.002266 s | 0.090521 s
1,000,000 x 10 | 0.000154 s | 0.045166 s
1,000,000 x 200 | 0.002373 s | 0.466811 s
10,000,000 x 10 | 0.000161 s | 0.254206 s

Result (in normal run):
Shape | NumPy → Polars | NumPy → Pandas → Polars
-----------------------------------------------------------------
10,000 x 10 | 0.000153 s | 0.006164 s
10,000 x 200 | 0.000791 s | 0.021460 s
100,000 x 10 | 0.000071 s | 0.007551 s
100,000 x 200 | 0.000794 s | 0.050238 s
1,000,000 x 10 | 0.000062 s | 0.024121 s
1,000,000 x 200 | 0.000781 s | 0.383526 s
10,000,000 x 10 | 0.000063 s | 0.232983 s
""" | |
import numpy as np | |
import pandas as pd | |
import polars as pl | |
import timeit | |
# (row_count, col_count) combinations to benchmark, in execution order.
# Every row count up to one million is paired with both column counts; the
# ten-million-row case is run at 10 columns only (presumably to keep memory
# use bounded -- confirm before adding a 10,000,000 x 200 case).
shapes = [
    (rows, cols)
    for rows in (10_000, 100_000, 1_000_000)
    for cols in (10, 200)
] + [(10_000_000, 10)]

# Number of timed calls per measurement; reported times are the mean over these.
REPEATS = 5
def time_numpy_to_polars(arr: dict) -> float:
    """Return the mean time in seconds to build a Polars DataFrame from ``arr``.

    Args:
        arr: Mapping of column name to column data (the benchmark passes
            equal-length 1-D NumPy arrays).

    Returns:
        Average duration of one ``pl.DataFrame(arr)`` call, measured with
        ``timeit`` over ``REPEATS`` calls.

    Raises:
        AssertionError: If the resulting frame's row or column count does not
            match the input.
    """
    # Derive the expected shape from the input, once and outside the timed
    # closure, so the sanity check keeps working when ``shapes`` is edited
    # (the check used to be hard-coded to 10 or 200 columns and >1000 rows).
    expected_cols = len(arr)
    expected_rows = len(next(iter(arr.values()))) if arr else 0

    def fn():
        df_pl = pl.DataFrame(arr)
        # Cheap sanity check that the conversion produced the full frame.
        assert df_pl.height == expected_rows
        assert len(df_pl.columns) == expected_cols
        return df_pl

    return timeit.timeit(fn, number=REPEATS) / REPEATS
def time_numpy_to_pandas_to_polars(arr: dict) -> float:
    """Return the mean time in seconds to convert ``arr`` to Polars via Pandas.

    Each timed call builds a ``pd.DataFrame`` from ``arr`` and then converts
    it with ``pl.from_pandas(..., rechunk=True)``.

    Args:
        arr: Mapping of column name to column data (the benchmark passes
            equal-length 1-D NumPy arrays).

    Returns:
        Average duration of one full NumPy -> Pandas -> Polars conversion,
        measured with ``timeit`` over ``REPEATS`` calls.

    Raises:
        AssertionError: If the Polars frame's shape differs from the
            intermediate Pandas frame's shape.
    """

    def fn():
        df = pd.DataFrame(arr)
        df_pl = pl.from_pandas(df, rechunk=True)
        # Compare against the intermediate frame instead of hard-coded
        # row/column counts, so the check does not depend on ``shapes``.
        assert df_pl.shape == df.shape
        # Drop the Pandas intermediate explicitly before returning.
        del df
        return df_pl

    return timeit.timeit(fn, number=REPEATS) / REPEATS
def benchmark():
    """Time both conversion paths for every entry in ``shapes`` and print a table.

    For each (rows, columns) pair, a fresh dict of random columns is generated
    for each path; the Pandas path is timed first, then the direct Polars
    path. One table row is printed per shape.
    """

    def random_columns(n_rows, n_cols):
        # One independent random 1-D array per column, keyed "col_<index>".
        return {f"col_{i}": np.random.rand(n_rows) for i in range(n_cols)}

    print(f"{'Shape':>15} | {'NumPy → Polars':>18} | {'NumPy → Pandas → Polars':>23}")
    print("-" * 65)

    for n_rows, n_cols in shapes:
        # Each input is deleted right after its measurement so both datasets
        # are never alive at the same time.
        pandas_path_input = random_columns(n_rows, n_cols)
        secs_via_pandas = time_numpy_to_pandas_to_polars(pandas_path_input)
        del pandas_path_input

        direct_path_input = random_columns(n_rows, n_cols)
        secs_direct = time_numpy_to_polars(direct_path_input)
        del direct_path_input

        shape_label = f"{n_rows:,} x {n_cols}"
        print(f"{shape_label:>15} | {secs_direct:>18.6f} s | {secs_via_pandas:>23.6f} s")
# Print the full benchmark table five times in a row, so run-to-run variation
# is visible in the output.
for _pass in range(5):
    benchmark()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment