Skip to content

Instantly share code, notes, and snippets.

@tlrmchlsmth
Last active November 26, 2024 01:06
Show Gist options
  • Save tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 to your computer and use it in GitHub Desktop.
Save tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 to your computer and use it in GitHub Desktop.
import numpy as np
import msgpack
import pickle
import time
import json
from msgspec import msgpack as msgspec_msgpack
import struct
from io import BytesIO
def _array_from_payload(payload):
    """Rebuild an array from a decoded ``{'shape', 'dtype', 'data'}`` mapping."""
    flat = np.frombuffer(payload['data'], dtype=np.dtype(payload['dtype']))
    return flat.reshape(payload['shape'])


def benchmark_methods(arr, num_iterations=1000):
    """Time several ways of serializing and deserializing a NumPy array.

    Every method is warmed up once, then its serializer and deserializer are
    each called ``num_iterations`` times and the mean wall-clock time per call
    is recorded. One extra round trip is compared against ``arr`` with
    ``np.array_equal`` to confirm the method is lossless.

    Args:
        arr: The array to round-trip through each method.
        num_iterations: Number of timed calls per direction; must be >= 1.

    Returns:
        A dict mapping method name to a dict with the keys
        ``serialize_time`` and ``deserialize_time`` (mean seconds per call),
        ``total_time`` (their sum), ``size`` (length of the serialized
        payload) and ``correct`` (result of the round-trip comparison).

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    methods = {
        'numpy.save': {
            'serialize': numpy_save,
            'deserialize': numpy_load,
        },
        'pickle': {
            'serialize': lambda x: pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL),
            'deserialize': pickle.loads,
        },
        # Raw bytes only: shape and dtype are taken from the caller's array.
        'msgpack_raw': {
            'serialize': lambda x: msgpack.packb(x.tobytes()),
            'deserialize': lambda x: np.frombuffer(msgpack.unpackb(x), dtype=arr.dtype).reshape(arr.shape),
        },
        # Self-describing payloads. The message is decoded exactly once per
        # call; decoding it separately for each field would charge these
        # methods three full decodes per deserialization.
        'msgpack_with_shape': {
            'serialize': lambda x: msgpack.packb({'shape': x.shape, 'dtype': str(x.dtype), 'data': x.tobytes()}),
            'deserialize': lambda x: _array_from_payload(msgpack.unpackb(x)),
        },
        'msgspec': {
            'serialize': lambda x: msgspec_msgpack.encode({'shape': x.shape, 'dtype': str(x.dtype), 'data': x.tobytes()}),
            'deserialize': lambda x: _array_from_payload(msgspec_msgpack.decode(x)),
        },
        # One byte for ndim, then ndim native unsigned ints, then the data.
        'custom_binary': {
            'serialize': lambda x: struct.pack('B', len(x.shape)) + struct.pack(f'{len(x.shape)}I', *x.shape) + x.tobytes(),
            'deserialize': lambda x: deserialize_custom_binary(x, arr.dtype),
        },
    }

    # Warm-up run
    for method in methods.values():
        method['deserialize'](method['serialize'](arr))

    results = {}
    # Benchmark each method
    for name, method in methods.items():
        # Bind the callables once so the timed loops measure only the calls.
        serialize = method['serialize']
        deserialize = method['deserialize']

        # Serialization timing
        start_time = time.perf_counter()
        for _ in range(num_iterations):
            serialized = serialize(arr)
        serialize_time = (time.perf_counter() - start_time) / num_iterations

        # Deserialization timing
        start_time = time.perf_counter()
        for _ in range(num_iterations):
            deserialize(serialized)
        deserialize_time = (time.perf_counter() - start_time) / num_iterations

        # Verify correctness
        is_equal = np.array_equal(arr, deserialize(serialized))

        results[name] = {
            'serialize_time': serialize_time,
            'deserialize_time': deserialize_time,
            'total_time': serialize_time + deserialize_time,
            'size': len(serialized),
            'correct': is_equal,
        }
    return results
def numpy_save(arr):
    """Return *arr* written by ``np.save`` into an in-memory buffer, as bytes."""
    with BytesIO() as buffer:
        np.save(buffer, arr)
        # Take the snapshot before the buffer is closed by the ``with`` block.
        return buffer.getvalue()
def numpy_load(data):
    """Load an array with ``np.load`` from the bytes produced by ``np.save``."""
    return np.load(BytesIO(data))
def deserialize_custom_binary(data, dtype):
    """Decode a ``custom_binary`` payload into an array of the given dtype.

    The payload layout is: one unsigned byte holding the number of
    dimensions, that many native unsigned ints holding the shape, and then
    the raw element bytes.

    Args:
        data: The serialized payload.
        dtype: Element type used to interpret the bytes after the header.

    Returns:
        The decoded array, reshaped to the shape stored in the header.
    """
    # Header byte: how many shape entries follow.
    (rank,) = struct.unpack_from('B', data, 0)

    # Shape entries sit directly after the rank byte.
    dims_format = f'{rank}I'
    dims = struct.unpack_from(dims_format, data, 1)
    payload_start = 1 + struct.calcsize(dims_format)

    # Everything past the header is element data.
    flat = np.frombuffer(data[payload_start:], dtype=dtype)
    return flat.reshape(dims)
def print_results(results):
    """Print a table of benchmark results, fastest total time first.

    Args:
        results: Mapping of method name to a dict with the keys
            ``serialize_time``, ``deserialize_time`` and ``total_time``
            (seconds), ``size`` (an int) and ``correct``.
    """
    # The header columns are 15 wide plus a separating space, while each value
    # is right-aligned in 8 characters; a 7-space gutter after every value
    # keeps the rows lined up under their headers.
    gutter = " " * 7

    print("\nResults:")
    print(f"{'Method':<20} {'Serialize (μs)':<15} {'Deserialize (μs)':<15} {'Total (μs)':<15} {'Size (bytes)':<15} {'Correct':<8}")
    print("-" * 90)

    # Sort by total time
    ranked = sorted(results.items(), key=lambda item: item[1]['total_time'])
    for method, data in ranked:
        # Times are stored in seconds and shown in microseconds.
        print(f"{method:<20} {data['serialize_time']*1e6:>8.2f}{gutter}"
              f"{data['deserialize_time']*1e6:>8.2f}{gutter}"
              f"{data['total_time']*1e6:>8.2f}{gutter}"
              f"{data['size']:>8,d}{gutter}{str(data['correct']):>5}")
def run_benchmarks():
    """Benchmark every method on small and medium int32/float32 arrays.

    Four random arrays are generated up front, then each one is announced,
    passed to ``benchmark_methods`` and its results are printed with
    ``print_results``.
    """
    small, medium = (100, 100), (1000, 1000)

    # Test with different array sizes and types
    cases = {
        'small_int': np.random.randint(0, 100, small, dtype=np.int32),
        'medium_int': np.random.randint(0, 100, medium, dtype=np.int32),
        'small_float': np.random.random(small).astype(np.float32),
        'medium_float': np.random.random(medium).astype(np.float32),
    }

    for label, array in cases.items():
        print(f"\nBenchmarking {label} array:")
        print(f"Shape: {array.shape}, dtype: {array.dtype}")
        print_results(benchmark_methods(array))
# Run the full benchmark suite only when executed as a script, not on import.
if __name__ == "__main__":
    run_benchmarks()
@tlrmchlsmth
Copy link
Author

Benchmarking small_int array:
Shape: (100, 100), dtype: int32

Results:
Method               Serialize (μs)  Deserialize (μs) Total (μs)      Size (bytes)    Correct 
------------------------------------------------------------------------------------------
custom_binary            2.80           2.46           5.26         40,009        True
msgpack_raw              3.43           2.05           5.48         40,003        True
msgspec                  5.96           5.16          11.12         40,030        True
pickle                   6.99           4.67          11.66         40,128        True
msgpack_with_shape       8.20           5.73          13.93         40,030        True
numpy.save              12.63          44.15          56.78         40,128        True

Benchmarking medium_int array:
Shape: (1000, 1000), dtype: int32

Results:
Method               Serialize (μs)  Deserialize (μs) Total (μs)      Size (bytes)    Correct 
------------------------------------------------------------------------------------------
pickle                 308.16         185.87         494.02       4,000,139        True
custom_binary          484.85         181.77         666.63       4,000,009        True
numpy.save             401.49         312.11         713.60       4,000,128        True
msgpack_raw            765.10         180.49         945.59       4,000,005        True
msgpack_with_shape     711.30         606.56        1317.86       4,000,036        True
msgspec                945.90        1258.41        2204.32       4,000,036        True

Benchmarking small_float array:
Shape: (100, 100), dtype: float32

Results:
Method               Serialize (μs)  Deserialize (μs) Total (μs)      Size (bytes)    Correct 
------------------------------------------------------------------------------------------
custom_binary            2.49           2.17           4.65         40,009        True
msgpack_raw              3.29           1.96           5.25         40,003        True
msgspec                  5.33           4.52           9.85         40,032        True
pickle                   6.32           4.55          10.87         40,128        True
msgpack_with_shape       6.90           4.91          11.81         40,032        True
numpy.save              10.52          36.71          47.24         40,128        True

Benchmarking medium_float array:
Shape: (1000, 1000), dtype: float32

Results:
Method               Serialize (μs)  Deserialize (μs) Total (μs)      Size (bytes)    Correct 
------------------------------------------------------------------------------------------
pickle                 300.24         186.46         486.70       4,000,139        True
custom_binary          486.28         181.97         668.25       4,000,009        True
numpy.save             408.33         312.88         721.21       4,000,128        True
msgpack_raw            751.93         179.07         931.00       4,000,005        True
msgpack_with_shape     660.78         615.42        1276.20       4,000,038        True
msgspec                939.32        1275.73        2215.05       4,000,038        True

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment