Skip to content

Instantly share code, notes, and snippets.

@betatim
Created April 13, 2026 14:45
Show Gist options
  • Select an option

  • Save betatim/f4128b961525eda49df9b76c7c339388 to your computer and use it in GitHub Desktop.

Select an option

Save betatim/f4128b961525eda49df9b76c7c339388 to your computer and use it in GitHub Desktop.
Estimate how long it takes to "transfer" a NumPy array to a CuPy array
"""
Benchmark: numpy-to-cupy (CPU-to-GPU) transfer times on this machine.
Target: GPU 1 (NVIDIA RTX A6000, 48 GB, PCIe Gen4 x16 slot)
"""
import time
import statistics
import numpy as np
import cupy as cp
# Benchmark configuration.
GPU_DEVICE = 0 # A6000 (CUDA enumerates fastest-first, differs from nvidia-smi)
GPU_VRAM_BYTES = 48 * 1024**3 # 48 GB
HOST_RAM_BYTES = 50 * 1024**3 # conservative: leave headroom for OS + other processes
LATENCY_US = 10.0  # fixed per-transfer overhead assumed by the linear model, in microseconds
BW_GBS = 12.0e9 # A6000 asymptotic bandwidth (conservative single-value estimate) -- NOTE: value is bytes/second despite the "GBS" name
# Grid of array shapes to benchmark (rows x columns, float64).
N_SAMPLES_LIST = [10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000]
N_FEATURES_LIST = [1, 10, 100, 1_000, 10_000]
def predict_us(nbytes, latency_us=None, bandwidth_bps=None):
    """Predict host-to-device transfer time in microseconds for ``nbytes``.

    Linear model: a fixed per-transfer latency plus size / bandwidth.

    Parameters
    ----------
    nbytes : int
        Payload size in bytes.
    latency_us : float, optional
        Fixed overhead in microseconds; defaults to the module-level
        ``LATENCY_US``.
    bandwidth_bps : float, optional
        Asymptotic bandwidth in bytes per second; defaults to the
        module-level ``BW_GBS`` (which, despite its name, is bytes/s).

    Returns
    -------
    float
        Predicted transfer time in microseconds.
    """
    if latency_us is None:
        latency_us = LATENCY_US
    if bandwidth_bps is None:
        bandwidth_bps = BW_GBS
    # bytes / (bytes/s) = seconds; * 1e6 converts to microseconds.
    return latency_us + nbytes / bandwidth_bps * 1e6
def fmt_time(us):
    """Render a duration given in microseconds as a human-readable string.

    Picks the coarsest unit (s, ms, us) that keeps the number small.
    """
    if us >= 1_000_000:
        return f"{us / 1e6:.2f} s"
    if us >= 1000:
        return f"{us / 1000:.1f} ms"
    return f"{us:.0f} us"
def measure_transfer(arr, n_reps=20):
    """Transfer arr to GPU n_reps times, return median time in seconds.

    Returns None when the device runs out of memory during warmup.
    """
    sync = cp.cuda.Stream.null.synchronize
    # Warmup: multiple reps to prime the pool and stabilize after
    # free_all_blocks. OOM here means the array simply does not fit.
    for _ in range(5):
        try:
            device_copy = cp.asarray(arr)
            sync()
            del device_copy
        except (cp.cuda.memory.OutOfMemoryError, MemoryError):
            return None
    samples = []
    for _ in range(n_reps):
        sync()  # drain pending work so the timer sees only this transfer
        start = time.perf_counter()
        device_copy = cp.asarray(arr)
        sync()  # the copy is async; wait for it to actually finish
        samples.append(time.perf_counter() - start)
        del device_copy
    # Median is robust against occasional scheduling hiccups.
    return statistics.median(samples)
def bench():
    """Run the n_samples x n_features transfer grid and print a table.

    For each shape: predicts the transfer time with the linear model,
    measures the actual host-to-device copy, and prints size, predicted
    vs. measured time, effective bandwidth, and relative model error.

    Returns
    -------
    dict
        Maps (n_samples, n_features) to the measured time in
        microseconds, or to the strings "HOST OOM" / "GPU OOM".
    """
    print()
    print("=" * 72)
    print("Benchmark: n_samples x n_features grid (float64)")
    print("=" * 72)
    header = f"{'n_samples':>12} x {'n_feat':>6} | {'Size':>10} | {'Predicted':>12} | {'Measured':>12} | {'BW GB/s':>8} | {'Error':>7}"
    print(header)
    print("-" * len(header))
    results = {}
    for ns in N_SAMPLES_LIST:
        for nf in N_FEATURES_LIST:
            nbytes = ns * nf * 8  # float64 => 8 bytes per element
            pred_us = predict_us(nbytes)
            # Skip shapes that cannot fit in host RAM at all.
            if nbytes > HOST_RAM_BYTES:
                print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {'HOST OOM':>12} |")
                results[(ns, nf)] = "HOST OOM"
                continue
            try:
                # np.random.rand already returns float64; the previous
                # .astype(np.float64) made a redundant full copy, doubling
                # peak host memory right where the script probes its limit.
                arr = np.random.rand(ns, nf)
            except MemoryError:
                print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {'HOST OOM':>12} |")
                results[(ns, nf)] = "HOST OOM"
                continue
            t = measure_transfer(arr)
            if t is None:
                print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {'GPU OOM':>12} |")
                results[(ns, nf)] = "GPU OOM"
                del arr
                cp.get_default_memory_pool().free_all_blocks()
                continue
            t_us = t * 1e6
            bw = nbytes / (t_us * 1e-6) / 1e9  # effective GB/s
            err = (t_us - pred_us) / pred_us * 100  # model error in percent
            print(f"{ns:>12} x {nf:>6} | {nbytes / 1e6:>9.1f}M | {fmt_time(pred_us):>12} | {fmt_time(t_us):>12} | {bw:>8.2f} | {err:>+6.0f}%")
            results[(ns, nf)] = t_us
            # Release host and device memory before the next (larger) shape.
            del arr
            cp.get_default_memory_pool().free_all_blocks()
    return results
def main():
    """Entry point: select the target GPU and run the benchmark grid."""
    # f-prefixes removed from constant strings (no placeholders, F541).
    print("GPU Transfer Benchmark")
    print(f"Target: GPU {GPU_DEVICE}")
    print()
    # All allocations and transfers below happen on GPU_DEVICE.
    with cp.cuda.Device(GPU_DEVICE):
        grid_results = bench()
    print("\nDone.")


if __name__ == "__main__":
    main()
========================================================================
Benchmark: n_samples x n_features grid (float64)
========================================================================
n_samples x n_feat | Size | Predicted | Measured | BW GB/s | Error
-------------------------------------------------------------------------------------
10 x 1 | 0.0M | 10 us | 12 us | 0.01 | +20%
10 x 10 | 0.0M | 10 us | 12 us | 0.07 | +17%
10 x 100 | 0.0M | 11 us | 13 us | 0.63 | +18%
10 x 1000 | 0.1M | 17 us | 17 us | 4.70 | +2%
10 x 10000 | 0.8M | 77 us | 64 us | 12.41 | -16%
100 x 1 | 0.0M | 10 us | 12 us | 0.07 | +15%
100 x 10 | 0.0M | 11 us | 13 us | 0.63 | +18%
100 x 100 | 0.1M | 17 us | 17 us | 4.74 | +1%
100 x 1000 | 0.8M | 77 us | 62 us | 12.99 | -20%
100 x 10000 | 8.0M | 677 us | 471 us | 17.00 | -30%
1000 x 1 | 0.0M | 11 us | 13 us | 0.63 | +19%
1000 x 10 | 0.1M | 17 us | 17 us | 4.73 | +1%
1000 x 100 | 0.8M | 77 us | 61 us | 13.03 | -20%
1000 x 1000 | 8.0M | 677 us | 469 us | 17.05 | -31%
1000 x 10000 | 80.0M | 6.7 ms | 6.6 ms | 12.12 | -1%
10000 x 1 | 0.1M | 17 us | 16 us | 5.14 | -7%
10000 x 10 | 0.8M | 77 us | 58 us | 13.91 | -25%
10000 x 100 | 8.0M | 677 us | 442 us | 18.10 | -35%
10000 x 1000 | 80.0M | 6.7 ms | 6.5 ms | 12.24 | -2%
10000 x 10000 | 800.0M | 66.7 ms | 68.3 ms | 11.71 | +2%
100000 x 1 | 0.8M | 77 us | 946 us | 0.85 | +1134%
100000 x 10 | 8.0M | 677 us | 443 us | 18.05 | -35%
100000 x 100 | 80.0M | 6.7 ms | 6.5 ms | 12.26 | -2%
100000 x 1000 | 800.0M | 66.7 ms | 68.3 ms | 11.72 | +2%
100000 x 10000 | 8000.0M | 666.7 ms | 683.0 ms | 11.71 | +2%
1000000 x 1 | 8.0M | 677 us | 1.4 ms | 5.91 | +100%
1000000 x 10 | 80.0M | 6.7 ms | 9.3 ms | 8.61 | +39%
1000000 x 100 | 800.0M | 66.7 ms | 68.2 ms | 11.72 | +2%
1000000 x 1000 | 8000.0M | 666.7 ms | 684.0 ms | 11.70 | +3%
1000000 x 10000 | 80.0G | 6.67 s | HOST OOM |
10000000 x 1 | 80.0M | 6.7 ms | 9.3 ms | 8.61 | +39%
10000000 x 10 | 800.0M | 66.7 ms | 68.4 ms | 11.70 | +3%
10000000 x 100 | 8000.0M | 666.7 ms | 683.8 ms | 11.70 | +3%
10000000 x 1000 | 80.0G | 6.67 s | HOST OOM |
10000000 x 10000 | 800.0G | 66.67 s | HOST OOM |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment