Created
April 13, 2026 14:45
-
-
Save betatim/f4128b961525eda49df9b76c7c339388 to your computer and use it in GitHub Desktop.
Estimate how long it takes to "transfer" a NumPy array to a CuPy array
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Benchmark: numpy-to-cupy (CPU-to-GPU) transfer times on this machine. | |
| Target: GPU 1 (NVIDIA RTX A6000, 48 GB, PCIe Gen4 x16 slot) | |
| """ | |
| import time | |
| import statistics | |
| import numpy as np | |
| import cupy as cp | |
# CUDA device index that main() selects before running the benchmark.
GPU_DEVICE = 0  # A6000 (CUDA enumerates fastest-first, differs from nvidia-smi)
# 48 * 1024**3 bytes, i.e. 48 GiB. Not referenced by the code visible in this file.
GPU_VRAM_BYTES = 48 * 1024**3  # 48 GB
# Grid cells whose array would exceed this many bytes are skipped as "HOST OOM".
HOST_RAM_BYTES = 50 * 1024**3  # conservative: leave headroom for OS + other processes
# Fixed per-transfer overhead, in microseconds, used by the prediction model.
LATENCY_US = 10.0
# NOTE: despite the "_GBS" suffix this value is in bytes per second
# (12.0e9 B/s == 12 GB/s); predict_us() divides a byte count by it directly.
BW_GBS = 12.0e9  # A6000 asymptotic bandwidth (conservative single-value estimate)
# The benchmark grid is the cross product of these row and column counts.
N_SAMPLES_LIST = [10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000]
N_FEATURES_LIST = [1, 10, 100, 1_000, 10_000]
def predict_us(nbytes):
    """Predict the transfer time of ``nbytes`` bytes, in microseconds.

    Uses a latency-plus-bandwidth model: the fixed ``LATENCY_US`` overhead
    plus the time needed to move ``nbytes`` at ``BW_GBS`` (which is stored
    in bytes per second).
    """
    transfer_seconds = nbytes / BW_GBS
    return LATENCY_US + transfer_seconds * 1e6
def fmt_time(us):
    """Format a duration given in microseconds as a short readable string.

    Values below 1000 are shown as whole microseconds, values below
    1_000_000 as milliseconds with one decimal, and everything else as
    seconds with two decimals.
    """
    # Guard clauses, smallest unit first; the final return handles seconds.
    if us < 1000:
        return f"{us:.0f} us"
    if us < 1_000_000:
        millis = us / 1000
        return f"{millis:.1f} ms"
    seconds = us / 1e6
    return f"{seconds:.2f} s"
def measure_transfer(arr, n_reps=20):
    """Time ``cp.asarray(arr)`` and return the median duration in seconds.

    Five untimed warm-up transfers run first. If any of them raises a CuPy
    out-of-memory error or ``MemoryError``, the function returns ``None``
    instead of a timing. Afterwards ``n_reps`` transfers are timed
    individually, each bracketed by a synchronize on the null stream, and
    the median of those wall-clock durations is returned.
    """
    null_stream = cp.cuda.Stream.null

    # warmup: multiple reps to prime the pool and stabilize after free_all_blocks
    try:
        for _ in range(5):
            warm = cp.asarray(arr)
            null_stream.synchronize()
            del warm
    except (cp.cuda.memory.OutOfMemoryError, MemoryError):
        return None

    durations = []
    for _ in range(n_reps):
        # Drain pending work so the timed region only covers this transfer.
        null_stream.synchronize()
        started = time.perf_counter()
        on_device = cp.asarray(arr)
        null_stream.synchronize()
        finished = time.perf_counter()
        durations.append(finished - started)
        # Drop the device copy before the next repetition allocates another.
        del on_device
    return statistics.median(durations)
def _print_skipped_row(ns, nf, nbytes, pred_us, reason):
    """Print the table row for a grid cell that was not measured (size in GB)."""
    print(f"{ns:>12} x {nf:>6} | {nbytes / 1e9:>9.1f}G | {fmt_time(pred_us):>12} | {reason:>12} |")


def bench():
    """Run the n_samples x n_features transfer benchmark and print a table.

    For every combination of ``N_SAMPLES_LIST`` and ``N_FEATURES_LIST`` a
    float64 host array is created, transferred with ``measure_transfer``,
    and compared against ``predict_us``.

    Returns:
        dict mapping ``(n_samples, n_features)`` to the measured median
        transfer time in microseconds, or to the string ``"HOST OOM"`` /
        ``"GPU OOM"`` when the cell could not be measured.
    """
    print()
    print("=" * 72)
    print("Benchmark: n_samples x n_features grid (float64)")
    print("=" * 72)
    header = f"{'n_samples':>12} x {'n_feat':>6} | {'Size':>10} | {'Predicted':>12} | {'Measured':>12} | {'BW GB/s':>8} | {'Error':>7}"
    print(header)
    print("-" * len(header))

    results = {}
    pool = cp.get_default_memory_pool()
    for ns in N_SAMPLES_LIST:
        for nf in N_FEATURES_LIST:
            nbytes = ns * nf * 8  # float64: 8 bytes per element
            pred_us = predict_us(nbytes)

            # Skip arrays that would not fit in the host RAM budget at all.
            if nbytes > HOST_RAM_BYTES:
                _print_skipped_row(ns, nf, nbytes, pred_us, "HOST OOM")
                results[(ns, nf)] = "HOST OOM"
                continue

            try:
                # np.random.rand already yields float64; copy=False keeps the
                # explicit dtype guarantee without duplicating the array
                # (a plain astype() would double peak host memory).
                arr = np.random.rand(ns, nf).astype(np.float64, copy=False)
            except MemoryError:
                _print_skipped_row(ns, nf, nbytes, pred_us, "HOST OOM")
                results[(ns, nf)] = "HOST OOM"
                continue

            t = measure_transfer(arr)
            if t is None:
                # measure_transfer signals an out-of-memory warm-up with None.
                _print_skipped_row(ns, nf, nbytes, pred_us, "GPU OOM")
                results[(ns, nf)] = "GPU OOM"
                del arr
                pool.free_all_blocks()
                continue

            t_us = t * 1e6
            bw = nbytes / (t_us * 1e-6) / 1e9  # achieved bandwidth in GB/s
            err = (t_us - pred_us) / pred_us * 100  # signed % deviation from the model
            print(f"{ns:>12} x {nf:>6} | {nbytes / 1e6:>9.1f}M | {fmt_time(pred_us):>12} | {fmt_time(t_us):>12} | {bw:>8.2f} | {err:>+6.0f}%")
            results[(ns, nf)] = t_us

            # Release host and pooled device memory before the next cell.
            del arr
            pool.free_all_blocks()
    return results
def main():
    """Print a banner, run the benchmark grid on ``GPU_DEVICE``, then print "Done."."""
    print("GPU Transfer Benchmark")
    print(f"Target: GPU {GPU_DEVICE}")
    print()
    # Make GPU_DEVICE the current CUDA device for the duration of the run.
    with cp.cuda.Device(GPU_DEVICE):
        bench()
    print("\nDone.")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ======================================================================== | |
| Benchmark: n_samples x n_features grid (float64) | |
| ======================================================================== | |
| n_samples x n_feat | Size | Predicted | Measured | BW GB/s | Error | |
| ------------------------------------------------------------------------------------- | |
| 10 x 1 | 0.0M | 10 us | 12 us | 0.01 | +20% | |
| 10 x 10 | 0.0M | 10 us | 12 us | 0.07 | +17% | |
| 10 x 100 | 0.0M | 11 us | 13 us | 0.63 | +18% | |
| 10 x 1000 | 0.1M | 17 us | 17 us | 4.70 | +2% | |
| 10 x 10000 | 0.8M | 77 us | 64 us | 12.41 | -16% | |
| 100 x 1 | 0.0M | 10 us | 12 us | 0.07 | +15% | |
| 100 x 10 | 0.0M | 11 us | 13 us | 0.63 | +18% | |
| 100 x 100 | 0.1M | 17 us | 17 us | 4.74 | +1% | |
| 100 x 1000 | 0.8M | 77 us | 62 us | 12.99 | -20% | |
| 100 x 10000 | 8.0M | 677 us | 471 us | 17.00 | -30% | |
| 1000 x 1 | 0.0M | 11 us | 13 us | 0.63 | +19% | |
| 1000 x 10 | 0.1M | 17 us | 17 us | 4.73 | +1% | |
| 1000 x 100 | 0.8M | 77 us | 61 us | 13.03 | -20% | |
| 1000 x 1000 | 8.0M | 677 us | 469 us | 17.05 | -31% | |
| 1000 x 10000 | 80.0M | 6.7 ms | 6.6 ms | 12.12 | -1% | |
| 10000 x 1 | 0.1M | 17 us | 16 us | 5.14 | -7% | |
| 10000 x 10 | 0.8M | 77 us | 58 us | 13.91 | -25% | |
| 10000 x 100 | 8.0M | 677 us | 442 us | 18.10 | -35% | |
| 10000 x 1000 | 80.0M | 6.7 ms | 6.5 ms | 12.24 | -2% | |
| 10000 x 10000 | 800.0M | 66.7 ms | 68.3 ms | 11.71 | +2% | |
| 100000 x 1 | 0.8M | 77 us | 946 us | 0.85 | +1134% | |
| 100000 x 10 | 8.0M | 677 us | 443 us | 18.05 | -35% | |
| 100000 x 100 | 80.0M | 6.7 ms | 6.5 ms | 12.26 | -2% | |
| 100000 x 1000 | 800.0M | 66.7 ms | 68.3 ms | 11.72 | +2% | |
| 100000 x 10000 | 8000.0M | 666.7 ms | 683.0 ms | 11.71 | +2% | |
| 1000000 x 1 | 8.0M | 677 us | 1.4 ms | 5.91 | +100% | |
| 1000000 x 10 | 80.0M | 6.7 ms | 9.3 ms | 8.61 | +39% | |
| 1000000 x 100 | 800.0M | 66.7 ms | 68.2 ms | 11.72 | +2% | |
| 1000000 x 1000 | 8000.0M | 666.7 ms | 684.0 ms | 11.70 | +3% | |
| 1000000 x 10000 | 80.0G | 6.67 s | HOST OOM | | |
| 10000000 x 1 | 80.0M | 6.7 ms | 9.3 ms | 8.61 | +39% | |
| 10000000 x 10 | 800.0M | 66.7 ms | 68.4 ms | 11.70 | +3% | |
| 10000000 x 100 | 8000.0M | 666.7 ms | 683.8 ms | 11.70 | +3% | |
| 10000000 x 1000 | 80.0G | 6.67 s | HOST OOM | | |
| 10000000 x 10000 | 800.0G | 66.67 s | HOST OOM | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment