Skip to content

Instantly share code, notes, and snippets.

@betatim
Created April 13, 2026 14:45
Show Gist options
  • Select an option

  • Save betatim/f4128b961525eda49df9b76c7c339388 to your computer and use it in GitHub Desktop.

Select an option

Save betatim/f4128b961525eda49df9b76c7c339388 to your computer and use it in GitHub Desktop.
Estimate how long it takes to "transfer" a NumPy array to a CuPy array
"""
Benchmark: numpy-to-cupy (CPU-to-GPU) transfer times on this machine.
Target: GPU 1 (NVIDIA RTX A6000, 48 GB, PCIe Gen4 x16 slot)
"""
import time
import statistics
import numpy as np
import cupy as cp
# Benchmark configuration.
GPU_DEVICE = 0 # A6000 (CUDA enumerates fastest-first, differs from nvidia-smi)
GPU_VRAM_BYTES = 48 * 1024**3 # 48 GB
HOST_RAM_BYTES = 50 * 1024**3 # conservative: leave headroom for OS + other processes
LATENCY_US = 10.0  # fixed per-transfer overhead assumed by the linear model, in microseconds
BW_GBS = 12.0e9 # A6000 asymptotic bandwidth (conservative single-value estimate) -- NOTE: value is bytes/second despite the "GBS" name
# Grid of array shapes to benchmark (rows x columns, float64).
N_SAMPLES_LIST = [10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000]
N_FEATURES_LIST = [1, 10, 100, 1_000, 10_000]
def predict_us(nbytes, latency_us=None, bandwidth_bps=None):
    """Predict host-to-device transfer time in microseconds for ``nbytes``.

    Linear model: a fixed per-transfer latency plus size / bandwidth.

    Parameters
    ----------
    nbytes : int
        Payload size in bytes.
    latency_us : float, optional
        Fixed overhead in microseconds; defaults to the module-level
        ``LATENCY_US``.
    bandwidth_bps : float, optional
        Asymptotic bandwidth in bytes per second; defaults to the
        module-level ``BW_GBS`` (which, despite its name, is bytes/s).

    Returns
    -------
    float
        Predicted transfer time in microseconds.
    """
    if latency_us is None:
        latency_us = LATENCY_US
    if bandwidth_bps is None:
        bandwidth_bps = BW_GBS
    # bytes / (bytes/s) = seconds; * 1e6 converts to microseconds.
    return latency_us + nbytes / bandwidth_bps * 1e6
def fmt_time(us):
    """Render a duration given in microseconds as a human-readable string.

    Picks the coarsest unit (s, ms, us) that keeps the number small.
    """
    if us >= 1_000_000:
        return f"{us / 1e6:.2f} s"
    if us >= 1000:
        return f"{us / 1000:.1f} ms"
    return f"{us:.0f} us"
def measure_transfer(arr, n_reps=20):
    """Transfer arr to GPU n_reps times, return median time in seconds.

    Returns None when the device runs out of memory during warmup.
    """
    sync = cp.cuda.Stream.null.synchronize
    # Warmup: multiple reps to prime the pool and stabilize after
    # free_all_blocks. OOM here means the array simply does not fit.
    for _ in range(5):
        try:
            device_copy = cp.asarray(arr)
            sync()
            del device_copy
        except (cp.cuda.memory.OutOfMemoryError, MemoryError):
            return None
    samples = []
    for _ in range(n_reps):
        sync()  # drain pending work so the timer sees only this transfer
        start = time.perf_counter()
        device_copy = cp.asarray(arr)
        sync()  # the copy is async; wait for it to actually finish
        samples.append(time.perf_counter() - start)
        del device_copy
    # Median is robust against occasional scheduling hiccups.
    return statistics.median(samples)
def bench():
    """Run the n_samples x n_features transfer grid and print a table.

    For each shape: predicts the transfer time with the linear model,
    measures the actual host-to-device copy, and prints size, predicted
    vs. measured time, effective bandwidth, and relative model error.

    Returns
    -------
    dict
        Maps (n_samples, n_features) to the measured time in
        microseconds, or to the strings "HOST OOM" / "GPU OOM".
    """
    print()
    print("=" * 72)
    print("Benchmark: n_samples x n_features grid (float64)")
    print("=" * 72)
    header = f"{'n_samples':>12} x {'n_feat':>6} | {'Size':>10} | {'Predicted':>12} | {'Measured':>12} | {'BW GB/s':>8} | {'Error':>7}"
    print(header)
    print("-" * len(header))
    results = {}
    for ns in N_SAMPLES_LIST:
        for nf in N_FEATURES_LIST:
            nbytes = ns * nf * 8  # float64 => 8 bytes per element
            pred_us = predict_us(nbytes)
            # Skip shapes that cannot fit in host RAM at all.
            if nbytes > HOST_RAM_BYTES:
                print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {'HOST OOM':>12} |")
                results[(ns, nf)] = "HOST OOM"
                continue
            try:
                # np.random.rand already returns float64; the previous
                # .astype(np.float64) made a redundant full copy, doubling
                # peak host memory right where the script probes its limit.
                arr = np.random.rand(ns, nf)
            except MemoryError:
                print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {'HOST OOM':>12} |")
                results[(ns, nf)] = "HOST OOM"
                continue
            t = measure_transfer(arr)
            if t is None:
                print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {'GPU OOM':>12} |")
                results[(ns, nf)] = "GPU OOM"
                del arr
                cp.get_default_memory_pool().free_all_blocks()
                continue
            t_us = t * 1e6
            bw = nbytes / (t_us * 1e-6) / 1e9  # effective GB/s
            err = (t_us - pred_us) / pred_us * 100  # model error in percent
            print(f"{ns:>12} x {nf:>6} | {nbytes / 1e6:>9.1f}M | {fmt_time(pred_us):>12} | {fmt_time(t_us):>12} | {bw:>8.2f} | {err:>+6.0f}%")
            results[(ns, nf)] = t_us
            # Release host and device memory before the next (larger) shape.
            del arr
            cp.get_default_memory_pool().free_all_blocks()
    return results
def main():
    """Entry point: select the target GPU and run the benchmark grid."""
    # f-prefixes removed from constant strings (no placeholders, F541).
    print("GPU Transfer Benchmark")
    print(f"Target: GPU {GPU_DEVICE}")
    print()
    # All allocations and transfers below happen on GPU_DEVICE.
    with cp.cuda.Device(GPU_DEVICE):
        grid_results = bench()
    print("\nDone.")


if __name__ == "__main__":
    main()
========================================================================
Benchmark: n_samples x n_features grid (float64)
========================================================================
n_samples x n_feat | Size | Predicted | Measured | BW GB/s | Error
-------------------------------------------------------------------------------------
10 x 1 | 0.0M | 10 us | 12 us | 0.01 | +20%
10 x 10 | 0.0M | 10 us | 12 us | 0.07 | +17%
10 x 100 | 0.0M | 11 us | 13 us | 0.63 | +18%
10 x 1000 | 0.1M | 17 us | 17 us | 4.70 | +2%
10 x 10000 | 0.8M | 77 us | 64 us | 12.41 | -16%
100 x 1 | 0.0M | 10 us | 12 us | 0.07 | +15%
100 x 10 | 0.0M | 11 us | 13 us | 0.63 | +18%
100 x 100 | 0.1M | 17 us | 17 us | 4.74 | +1%
100 x 1000 | 0.8M | 77 us | 62 us | 12.99 | -20%
100 x 10000 | 8.0M | 677 us | 471 us | 17.00 | -30%
1000 x 1 | 0.0M | 11 us | 13 us | 0.63 | +19%
1000 x 10 | 0.1M | 17 us | 17 us | 4.73 | +1%
1000 x 100 | 0.8M | 77 us | 61 us | 13.03 | -20%
1000 x 1000 | 8.0M | 677 us | 469 us | 17.05 | -31%
1000 x 10000 | 80.0M | 6.7 ms | 6.6 ms | 12.12 | -1%
10000 x 1 | 0.1M | 17 us | 16 us | 5.14 | -7%
10000 x 10 | 0.8M | 77 us | 58 us | 13.91 | -25%
10000 x 100 | 8.0M | 677 us | 442 us | 18.10 | -35%
10000 x 1000 | 80.0M | 6.7 ms | 6.5 ms | 12.24 | -2%
10000 x 10000 | 800.0M | 66.7 ms | 68.3 ms | 11.71 | +2%
100000 x 1 | 0.8M | 77 us | 946 us | 0.85 | +1134%
100000 x 10 | 8.0M | 677 us | 443 us | 18.05 | -35%
100000 x 100 | 80.0M | 6.7 ms | 6.5 ms | 12.26 | -2%
100000 x 1000 | 800.0M | 66.7 ms | 68.3 ms | 11.72 | +2%
100000 x 10000 | 8000.0M | 666.7 ms | 683.0 ms | 11.71 | +2%
1000000 x 1 | 8.0M | 677 us | 1.4 ms | 5.91 | +100%
1000000 x 10 | 80.0M | 6.7 ms | 9.3 ms | 8.61 | +39%
1000000 x 100 | 800.0M | 66.7 ms | 68.2 ms | 11.72 | +2%
1000000 x 1000 | 8000.0M | 666.7 ms | 684.0 ms | 11.70 | +3%
1000000 x 10000 | 80.0G | 6.67 s | HOST OOM |
10000000 x 1 | 80.0M | 6.7 ms | 9.3 ms | 8.61 | +39%
10000000 x 10 | 800.0M | 66.7 ms | 68.4 ms | 11.70 | +3%
10000000 x 100 | 8000.0M | 666.7 ms | 683.8 ms | 11.70 | +3%
10000000 x 1000 | 80.0G | 6.67 s | HOST OOM |
10000000 x 10000 | 800.0G | 66.67 s | HOST OOM |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment