Created
March 27, 2026 12:15
-
-
Save aurotripathy/c4c303d52269d31fc0c0bf498a796ad0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Run vLLM ``benchmark_serving`` scenarios against local OpenAI-compatible servers. | |
| Originally derived from ``script_run_jio.sh``. For each scenario triple in ``TRIPLES``, | |
| starts one ``benchmark_serving.py`` subprocess per port in parallel (thread pool). | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import subprocess | |
| import sys | |
| import urllib.error | |
| import urllib.request | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from pathlib import Path | |
# Triples: (input_tokens, output_tokens, parallel_requests); grid inherited from script_run_jio.sh
# The full sweep is kept here commented out so individual scenarios can be
# re-enabled by uncommenting; only the final triple is currently active.
TRIPLES: list[tuple[int, int, int]] = [
    # (500, 50, 25),
    # (500, 50, 50),
    # (500, 50, 75),
    # (500, 100, 25),
    # (500, 100, 50),
    # (500, 100, 75),
    # (500, 1500, 25),
    # (500, 1500, 50),
    # (500, 1500, 75),
    # (500, 5000, 25),
    # (500, 5000, 50),
    # (500, 5000, 75),
    # (2500, 50, 25),
    # (2500, 50, 50),
    # (2500, 50, 75),
    # (2500, 100, 25),
    # (2500, 100, 50),
    # (2500, 100, 75),
    # (2500, 1500, 25),
    # (2500, 1500, 50),
    # (2500, 1500, 75),
    # (2500, 5000, 25),
    # (2500, 5000, 50),
    # (2500, 5000, 75),
    # (5000, 50, 25),
    # (5000, 50, 50),
    # (5000, 50, 75),
    # (5000, 100, 25),
    # (5000, 100, 50),
    # (5000, 100, 75),
    # (5000, 1500, 25),
    # (5000, 1500, 50),
    # (5000, 1500, 75),
    # (5000, 5000, 25),
    # (5000, 5000, 50),
    # (5000, 5000, 75),
    # (10000, 50, 25),
    # (10000, 50, 50),
    # (10000, 50, 75),
    # (10000, 100, 25),
    # (10000, 100, 50),
    # (10000, 100, 75),
    # (10000, 1500, 25),
    # (10000, 1500, 50),
    # (10000, 1500, 75),
    # (10000, 5000, 25),
    # (10000, 5000, 50),
    (10000, 5000, 75),
]
| def _fetch_first_model_id(host: str, port: str) -> str: | |
| url = f"http://{host}:{port}/v1/models" | |
| try: | |
| req = urllib.request.Request(url, method="GET") | |
| with urllib.request.urlopen(req, timeout=60) as resp: | |
| data = json.load(resp) | |
| except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError) as e: | |
| raise RuntimeError(f"request failed for {url}: {e}") from e | |
| for item in data.get("data") or []: | |
| mid = item.get("id") | |
| if mid: | |
| return str(mid) | |
| raise RuntimeError(f"Failed to parse PRETRAINED_ID from {url}") | |
| def _run_benchmark(cmd: list[str], cwd: Path) -> int: | |
| return subprocess.run(cmd, cwd=cwd).returncode | |
def main() -> int:
    """Run every scenario in TRIPLES against all given ports in parallel.

    Expects a ``vllm`` checkout under the current working directory; writes
    results to ``vllm-result/results-<suffix>_<port>/<in>.<out>.<conc>/``.
    Returns 0 on success, 1 on setup failure, or the first failing benchmark's
    exit code — suitable for ``raise SystemExit(main())``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
    )
    parser.add_argument("results_suffix", help="Suffix for results directory name")
    parser.add_argument(
        "ports",
        nargs="+",
        help="Server ports (e.g. 8000 8001 8002 8003); benchmarks run in parallel per triple",
    )
    args = parser.parse_args()
    results_suffix = args.results_suffix
    # Normalize to strings: ports are used both as dict keys and as argv items.
    ports = [str(p) for p in args.ports]
    root = Path.cwd()
    vllm_dir = root / "vllm"
    if not vllm_dir.is_dir():
        print(f"Error: vllm directory not found: {vllm_dir}", file=sys.stderr)
        return 1
    bench = vllm_dir / "benchmarks" / "benchmark_serving.py"
    if not bench.is_file():
        print(f"Error: benchmark script not found: {bench}", file=sys.stderr)
        return 1
    # Subprocesses run with cwd=vllm_dir, so pass the script path relative to it.
    bench_rel = str(bench.relative_to(vllm_dir))
    monitor_scope = os.environ.get("MONITOR_SCOPE", "").strip()
    pretrained_global = os.environ.get("PRETRAINED_ID", "").strip()
    model_by_port: dict[str, str] = {}
    try:
        if pretrained_global:
            # Explicit PRETRAINED_ID override: same model id for every port.
            model_by_port = {p: pretrained_global for p in ports}
        else:
            # Otherwise query each server's /v1/models endpoint concurrently.
            with ThreadPoolExecutor(max_workers=len(ports)) as pool:
                future_map = {
                    pool.submit(_fetch_first_model_id, "127.0.0.1", p): p for p in ports
                }
                for fut in as_completed(future_map):
                    port = future_map[fut]
                    # fut.result() re-raises RuntimeError from the fetch helper.
                    model_by_port[port] = fut.result()
    except RuntimeError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    # Pre-create one results directory per port.
    for port in ports:
        rd = root / "vllm-result" / f"results-{results_suffix}_{port}"
        rd.mkdir(parents=True, exist_ok=True)
    for input_tokens, output_tokens, parallel_requests in TRIPLES:
        # Build one benchmark_serving.py command per port for this scenario.
        workers: list[tuple[str, list[str]]] = []
        for port in ports:
            results_dir = root / "vllm-result" / f"results-{results_suffix}_{port}"
            scenario_dir = results_dir / f"{input_tokens}.{output_tokens}.{parallel_requests}"
            scenario_dir.mkdir(parents=True, exist_ok=True)
            cmd: list[str] = [
                sys.executable,
                bench_rel,
                "--backend",
                "vllm",
                "--model",
                model_by_port[port],
                "--dataset-name",
                "random",
                "--random-input-len",
                str(input_tokens),
                "--ignore-eos",
                "--random-output-len",
                str(output_tokens),
                "--max-concurrency",
                str(parallel_requests),
                "--num-prompts",
                "500",
                "--result-dir",
                str(scenario_dir),
                "--percentile-metrics",
                "ttft,tpot,itl,e2el",
                "--metric-percentiles",
                "25,50,75,90,95,99",
                "--host",
                "127.0.0.1",
                "--port",
                port,
                "--enable-device-monitor",
                "npu",
                "--save-result",
            ]
            if monitor_scope:
                # MONITOR_SCOPE env var is optional; forwarded only when set.
                cmd.extend(["--monitor-scope", monitor_scope])
            workers.append((port, cmd))
        print(
            f"Parallel run (ports {', '.join(ports)}): "
            f"{input_tokens}.{output_tokens}.{parallel_requests}",
            flush=True,
        )
        # Keep the commands around so a failing port's exact argv can be printed.
        cmd_by_port = dict(workers)
        results_by_port: dict[str, int] = {}
        # One thread per port: each thread blocks on its subprocess.
        with ThreadPoolExecutor(max_workers=len(ports)) as pool:
            future_map = {
                pool.submit(_run_benchmark, cmd, vllm_dir): port for port, cmd in workers
            }
            for fut in as_completed(future_map):
                port = future_map[fut]
                rc = fut.result()
                results_by_port[port] = rc
                print(f"  port {port} finished with exit code {rc}", flush=True)
        failed = [(p, r) for p, r in results_by_port.items() if r != 0]
        if failed:
            for p, r in failed:
                print(f"Error: port {p} exit {r}", file=sys.stderr)
                print(" ".join(cmd_by_port[p]), file=sys.stderr)
            # Abort the remaining scenarios; propagate the first failure's code.
            return failed[0][1]
    return 0
if __name__ == "__main__":
    # SystemExit carries main()'s int return value as the process exit code.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment