tlrmchlsmth · November 26, 2024 01:06 · tlrmchlsmth · Nov 25, 2024
diff --git a/benchmark_numpy_serialization.py b/benchmark_numpy_serialization.py
 import numpy as np
 import msgpack
 import pickle
 import time
 import json
 from msgspec import msgpack as msgspec_msgpack
 import struct
 from io import BytesIO

 def benchmark_methods(arr, num_iterations=1000):
    """Time several serialization round-trips for one NumPy array.

    Every method is run once as a warm-up. Then its serializer and its
    deserializer are each called ``num_iterations`` times and the mean
    wall-clock time per call is measured with ``time.perf_counter``.

    Args:
        arr: NumPy array to serialize. Its dtype (and, for ``msgpack_raw``,
            its shape) is also used by the deserializers whose payload does
            not carry that metadata.
        num_iterations: Number of timed calls per direction; must be >= 1.

    Returns:
        dict mapping method name to a dict with the keys ``serialize_time``,
        ``deserialize_time`` and ``total_time`` (seconds per call), ``size``
        (``len`` of the serialized payload) and ``correct`` (whether the
        round-tripped array equals ``arr`` according to ``np.array_equal``).

    Raises:
        ValueError: If ``num_iterations`` is less than 1.
    """
    if num_iterations < 1:
        # Guard the per-call averages below against division by zero and
        # against meaningless negative timings.
        raise ValueError("num_iterations must be at least 1")

    def _from_envelope(envelope):
        # Rebuild the array from an already-decoded {'shape', 'dtype', 'data'}
        # mapping so each timed deserialize call decodes the payload exactly
        # once; decoding it once per field would triple the measured cost.
        return np.frombuffer(
            envelope['data'], dtype=np.dtype(envelope['dtype'])
        ).reshape(envelope['shape'])

    results = {}

    # All callables are wrapped in a lambda so every method pays the same
    # extra call overhead inside the timed loops.
    methods = {
        'numpy.save': {
            'serialize': lambda x: numpy_save(x),
            'deserialize': lambda x: numpy_load(x)
        },
        'pickle': {
            'serialize': lambda x: pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL),
            # Only ever fed bytes produced just above, never external data.
            'deserialize': lambda x: pickle.loads(x)
        },
        'msgpack_raw': {
            # Payload holds only the element bytes; dtype and shape come from arr.
            'serialize': lambda x: msgpack.packb(x.tobytes()),
            'deserialize': lambda x: np.frombuffer(msgpack.unpackb(x), dtype=arr.dtype).reshape(arr.shape)
        },
        'msgpack_with_shape': {
            'serialize': lambda x: msgpack.packb({'shape': x.shape, 'dtype': str(x.dtype), 'data': x.tobytes()}),
            'deserialize': lambda x: _from_envelope(msgpack.unpackb(x))
        },
        'msgspec': {
            'serialize': lambda x: msgspec_msgpack.encode({'shape': x.shape, 'dtype': str(x.dtype), 'data': x.tobytes()}),
            'deserialize': lambda x: _from_envelope(msgspec_msgpack.decode(x))
        },
        'custom_binary': {
            # Header: one byte with the rank, then one unsigned int per dimension.
            'serialize': lambda x: struct.pack('B', len(x.shape)) + struct.pack(f'{len(x.shape)}I', *x.shape) + x.tobytes(),
            'deserialize': lambda x: deserialize_custom_binary(x, arr.dtype)
        }
    }

    # Warm-up run
    for name, method in methods.items():
        serialized = method['serialize'](arr)
        method['deserialize'](serialized)

    # Benchmark each method
    for name, method in methods.items():
        # Serialization timing
        start_time = time.perf_counter()
        for _ in range(num_iterations):
            serialized = method['serialize'](arr)
        serialize_time = (time.perf_counter() - start_time) / num_iterations

        # Deserialization timing (reuses the last payload produced above)
        start_time = time.perf_counter()
        for _ in range(num_iterations):
            method['deserialize'](serialized)
        deserialize_time = (time.perf_counter() - start_time) / num_iterations

        # Verify correctness with one extra, untimed round-trip
        final = method['deserialize'](serialized)
        is_equal = np.array_equal(arr, final)

        results[name] = {
            'serialize_time': serialize_time,
            'deserialize_time': deserialize_time,
            'total_time': serialize_time + deserialize_time,
            'size': len(serialized),
            'correct': is_equal
        }

    return results

 def numpy_save(arr):
    """Serialize ``arr`` with ``np.save`` into an in-memory buffer.

    Args:
        arr: Array to serialize.

    Returns:
        bytes: Everything ``np.save`` wrote to the buffer.
    """
    with BytesIO() as buffer:
        np.save(buffer, arr)
        payload = buffer.getvalue()
    return payload

 def numpy_load(data):
    """Load an array with ``np.load`` from an in-memory payload.

    Args:
        data: Bytes to wrap in a ``BytesIO`` and hand to ``np.load``
            (for example the output of ``np.save``).

    Returns:
        Whatever ``np.load`` reads from those bytes.
    """
    return np.load(BytesIO(data))
 def deserialize_custom_binary(data, dtype):
    """Decode an array from the ``custom_binary`` payload layout.

    Layout: one unsigned byte holding the number of dimensions, then that
    many unsigned ints (``struct`` format ``'I'``) giving the shape, then
    the raw element bytes.

    Args:
        data: Bytes-like payload in the layout above.
        dtype: Element dtype; the payload itself does not store it.

    Returns:
        A NumPy array with the decoded shape that views ``data``'s buffer;
        the element bytes are not copied.

    Raises:
        struct.error: If ``data`` is too short to hold the header.
        ValueError: If the element bytes do not match ``dtype`` and the shape.
    """
    # First byte is number of dimensions
    ndim = struct.unpack_from('B', data, 0)[0]

    # Shape entries start right after the ndim byte; unpack_from reads them
    # in place instead of slicing out a temporary header copy.
    shape = struct.unpack_from(f'{ndim}I', data, 1)
    header_size = 1 + ndim * struct.calcsize('I')

    # Use an offset rather than data[header_size:], which would copy the whole
    # element payload on every call. The header is 1 + 4*ndim bytes where 'I'
    # is 4 bytes, so the resulting view may be unaligned; NumPy handles that.
    return np.frombuffer(data, dtype=dtype, offset=header_size).reshape(shape)

 def print_results(results):
    """Print a results table, ordered by ascending total time.

    Args:
        results: Mapping of method name to a dict holding ``serialize_time``,
            ``deserialize_time`` and ``total_time`` (multiplied by 1e6 for the
            microsecond columns), an integer ``size`` and ``correct``.
    """
    header = f"{'Method':<20} {'Serialize (μs)':<15} {'Deserialize (μs)':<15} {'Total (μs)':<15} {'Size (bytes)':<15} {'Correct':<8}"
    print("\nResults:")
    print(header)
    print("-" * 90)

    # Fastest round-trip first.
    by_total_time = sorted(results.items(), key=lambda item: item[1]['total_time'])

    for method, data in by_total_time:
        row = (
            f"{method:<20} {data['serialize_time']*1e6:>8.2f}       {data['deserialize_time']*1e6:>8.2f}       "
            f"{data['total_time']*1e6:>8.2f}       {data['size']:>8,d}       {str(data['correct']):>5}"
        )
        print(row)

 def run_benchmarks():
    """Benchmark every method on four random 2-D arrays and print each table.

    The arrays are int32 and float32, each at 100x100 ("small") and
    1000x1000 ("medium"). For each one a banner with its shape and dtype is
    printed, ``benchmark_methods`` is run with its default iteration count,
    and the result is handed to ``print_results``.
    """
    small, medium = (100, 100), (1000, 1000)

    # Insertion order is the run order: small_int, medium_int, small_float,
    # medium_float.
    cases = {
        'small_int': np.random.randint(0, 100, small, dtype=np.int32),
        'medium_int': np.random.randint(0, 100, medium, dtype=np.int32),
        'small_float': np.random.random(small).astype(np.float32),
        'medium_float': np.random.random(medium).astype(np.float32),
    }

    for name, arr in cases.items():
        print(f"\nBenchmarking {name} array:")
        print(f"Shape: {arr.shape}, dtype: {arr.dtype}")
        print_results(benchmark_methods(arr))

 # Run the benchmark suite only when this file is executed as a script,
 # not when it is imported.
 if __name__ == "__main__":
    run_benchmarks()
	import numpy as np
	import msgpack
	import pickle
	import time
	import json
	from msgspec import msgpack as msgspec_msgpack
	import struct
	from io import BytesIO

	def benchmark_methods(arr, num_iterations=1000):
	    """Time several serialization round-trips for one NumPy array.

	    Every method is run once as a warm-up. Then its serializer and its
	    deserializer are each called ``num_iterations`` times and the mean
	    wall-clock time per call is measured with ``time.perf_counter``.

	    Args:
	        arr: NumPy array to serialize. Its dtype (and, for ``msgpack_raw``,
	            its shape) is also used by the deserializers whose payload does
	            not carry that metadata.
	        num_iterations: Number of timed calls per direction; must be >= 1.

	    Returns:
	        dict mapping method name to a dict with the keys ``serialize_time``,
	        ``deserialize_time`` and ``total_time`` (seconds per call), ``size``
	        (``len`` of the serialized payload) and ``correct`` (whether the
	        round-tripped array equals ``arr`` according to ``np.array_equal``).

	    Raises:
	        ValueError: If ``num_iterations`` is less than 1.
	    """
	    if num_iterations < 1:
	        # Guard the per-call averages below against division by zero and
	        # against meaningless negative timings.
	        raise ValueError("num_iterations must be at least 1")

	    def _from_envelope(envelope):
	        # Rebuild the array from an already-decoded {'shape', 'dtype', 'data'}
	        # mapping so each timed deserialize call decodes the payload exactly
	        # once; decoding it once per field would triple the measured cost.
	        return np.frombuffer(
	            envelope['data'], dtype=np.dtype(envelope['dtype'])
	        ).reshape(envelope['shape'])

	    results = {}

	    # All callables are wrapped in a lambda so every method pays the same
	    # extra call overhead inside the timed loops.
	    methods = {
	        'numpy.save': {
	            'serialize': lambda x: numpy_save(x),
	            'deserialize': lambda x: numpy_load(x)
	        },
	        'pickle': {
	            'serialize': lambda x: pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL),
	            # Only ever fed bytes produced just above, never external data.
	            'deserialize': lambda x: pickle.loads(x)
	        },
	        'msgpack_raw': {
	            # Payload holds only the element bytes; dtype and shape come from arr.
	            'serialize': lambda x: msgpack.packb(x.tobytes()),
	            'deserialize': lambda x: np.frombuffer(msgpack.unpackb(x), dtype=arr.dtype).reshape(arr.shape)
	        },
	        'msgpack_with_shape': {
	            'serialize': lambda x: msgpack.packb({'shape': x.shape, 'dtype': str(x.dtype), 'data': x.tobytes()}),
	            'deserialize': lambda x: _from_envelope(msgpack.unpackb(x))
	        },
	        'msgspec': {
	            'serialize': lambda x: msgspec_msgpack.encode({'shape': x.shape, 'dtype': str(x.dtype), 'data': x.tobytes()}),
	            'deserialize': lambda x: _from_envelope(msgspec_msgpack.decode(x))
	        },
	        'custom_binary': {
	            # Header: one byte with the rank, then one unsigned int per dimension.
	            'serialize': lambda x: struct.pack('B', len(x.shape)) + struct.pack(f'{len(x.shape)}I', *x.shape) + x.tobytes(),
	            'deserialize': lambda x: deserialize_custom_binary(x, arr.dtype)
	        }
	    }

	    # Warm-up run
	    for name, method in methods.items():
	        serialized = method['serialize'](arr)
	        method['deserialize'](serialized)

	    # Benchmark each method
	    for name, method in methods.items():
	        # Serialization timing
	        start_time = time.perf_counter()
	        for _ in range(num_iterations):
	            serialized = method['serialize'](arr)
	        serialize_time = (time.perf_counter() - start_time) / num_iterations

	        # Deserialization timing (reuses the last payload produced above)
	        start_time = time.perf_counter()
	        for _ in range(num_iterations):
	            method['deserialize'](serialized)
	        deserialize_time = (time.perf_counter() - start_time) / num_iterations

	        # Verify correctness with one extra, untimed round-trip
	        final = method['deserialize'](serialized)
	        is_equal = np.array_equal(arr, final)

	        results[name] = {
	            'serialize_time': serialize_time,
	            'deserialize_time': deserialize_time,
	            'total_time': serialize_time + deserialize_time,
	            'size': len(serialized),
	            'correct': is_equal
	        }

	    return results

	def numpy_save(arr):
	    """Serialize ``arr`` with ``np.save`` into an in-memory buffer.

	    Args:
	        arr: Array to serialize.

	    Returns:
	        bytes: Everything ``np.save`` wrote to the buffer.
	    """
	    with BytesIO() as buffer:
	        np.save(buffer, arr)
	        payload = buffer.getvalue()
	    return payload

	def numpy_load(data):
	    """Load an array with ``np.load`` from an in-memory payload.

	    Args:
	        data: Bytes to wrap in a ``BytesIO`` and hand to ``np.load``
	            (for example the output of ``np.save``).

	    Returns:
	        Whatever ``np.load`` reads from those bytes.
	    """
	    return np.load(BytesIO(data))

	def deserialize_custom_binary(data, dtype):
	    """Decode an array from the ``custom_binary`` payload layout.

	    Layout: one unsigned byte holding the number of dimensions, then that
	    many unsigned ints (``struct`` format ``'I'``) giving the shape, then
	    the raw element bytes.

	    Args:
	        data: Bytes-like payload in the layout above.
	        dtype: Element dtype; the payload itself does not store it.

	    Returns:
	        A NumPy array with the decoded shape that views ``data``'s buffer;
	        the element bytes are not copied.

	    Raises:
	        struct.error: If ``data`` is too short to hold the header.
	        ValueError: If the element bytes do not match ``dtype`` and the shape.
	    """
	    # First byte is number of dimensions
	    ndim = struct.unpack_from('B', data, 0)[0]

	    # Shape entries start right after the ndim byte; unpack_from reads them
	    # in place instead of slicing out a temporary header copy.
	    shape = struct.unpack_from(f'{ndim}I', data, 1)
	    header_size = 1 + ndim * struct.calcsize('I')

	    # Use an offset rather than data[header_size:], which would copy the whole
	    # element payload on every call. The header is 1 + 4*ndim bytes where 'I'
	    # is 4 bytes, so the resulting view may be unaligned; NumPy handles that.
	    return np.frombuffer(data, dtype=dtype, offset=header_size).reshape(shape)

	def print_results(results):
	    """Print a results table, ordered by ascending total time.

	    Args:
	        results: Mapping of method name to a dict holding ``serialize_time``,
	            ``deserialize_time`` and ``total_time`` (multiplied by 1e6 for the
	            microsecond columns), an integer ``size`` and ``correct``.
	    """
	    header = f"{'Method':<20} {'Serialize (μs)':<15} {'Deserialize (μs)':<15} {'Total (μs)':<15} {'Size (bytes)':<15} {'Correct':<8}"
	    print("\nResults:")
	    print(header)
	    print("-" * 90)

	    # Fastest round-trip first.
	    by_total_time = sorted(results.items(), key=lambda item: item[1]['total_time'])

	    for method, data in by_total_time:
	        # Each value is right-aligned in 8 characters and followed by a
	        # 7-space gap so the rows line up under the 15-wide header columns.
	        row = (
	            f"{method:<20} {data['serialize_time']*1e6:>8.2f}       {data['deserialize_time']*1e6:>8.2f}       "
	            f"{data['total_time']*1e6:>8.2f}       {data['size']:>8,d}       {str(data['correct']):>5}"
	        )
	        print(row)

	def run_benchmarks():
	    """Benchmark every method on four random 2-D arrays and print each table.

	    The arrays are int32 and float32, each at 100x100 ("small") and
	    1000x1000 ("medium"). For each one a banner with its shape and dtype is
	    printed, ``benchmark_methods`` is run with its default iteration count,
	    and the result is handed to ``print_results``.
	    """
	    # Test with different array sizes and types
	    test_arrays = [
	        ('small_int', np.random.randint(0, 100, (100, 100), dtype=np.int32)),
	        ('medium_int', np.random.randint(0, 100, (1000, 1000), dtype=np.int32)),
	        ('small_float', np.random.random((100, 100)).astype(np.float32)),
	        ('medium_float', np.random.random((1000, 1000)).astype(np.float32)),
	    ]

	    for name, arr in test_arrays:
	        print(f"\nBenchmarking {name} array:")
	        print(f"Shape: {arr.shape}, dtype: {arr.dtype}")
	        results = benchmark_methods(arr)
	        print_results(results)

	# Run the benchmark suite only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    run_benchmarks()