Skip to content

Instantly share code, notes, and snippets.

@aurotripathy
Created March 27, 2026 12:15
Show Gist options
  • Select an option

  • Save aurotripathy/c4c303d52269d31fc0c0bf498a796ad0 to your computer and use it in GitHub Desktop.

Select an option

Save aurotripathy/c4c303d52269d31fc0c0bf498a796ad0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Run vLLM ``benchmark_serving`` scenarios against local OpenAI-compatible servers.
Originally derived from ``script_run_jio.sh``. For each scenario triple in ``TRIPLES``,
starts one ``benchmark_serving.py`` subprocess per port in parallel (thread pool).
"""
from __future__ import annotations
import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
# Triples: (input_tokens, output_tokens, parallel_requests); grid inherited from script_run_jio.sh
# NOTE: the full sweep (input in {500, 2500, 5000, 10000} x output in
# {50, 100, 1500, 5000} x concurrency in {25, 50, 75}) is kept here commented
# out so individual scenarios can be re-enabled by uncommenting; only the
# final triple is currently active.
TRIPLES: list[tuple[int, int, int]] = [
# (500, 50, 25),
# (500, 50, 50),
# (500, 50, 75),
# (500, 100, 25),
# (500, 100, 50),
# (500, 100, 75),
# (500, 1500, 25),
# (500, 1500, 50),
# (500, 1500, 75),
# (500, 5000, 25),
# (500, 5000, 50),
# (500, 5000, 75),
# (2500, 50, 25),
# (2500, 50, 50),
# (2500, 50, 75),
# (2500, 100, 25),
# (2500, 100, 50),
# (2500, 100, 75),
# (2500, 1500, 25),
# (2500, 1500, 50),
# (2500, 1500, 75),
# (2500, 5000, 25),
# (2500, 5000, 50),
# (2500, 5000, 75),
# (5000, 50, 25),
# (5000, 50, 50),
# (5000, 50, 75),
# (5000, 100, 25),
# (5000, 100, 50),
# (5000, 100, 75),
# (5000, 1500, 25),
# (5000, 1500, 50),
# (5000, 1500, 75),
# (5000, 5000, 25),
# (5000, 5000, 50),
# (5000, 5000, 75),
# (10000, 50, 25),
# (10000, 50, 50),
# (10000, 50, 75),
# (10000, 100, 25),
# (10000, 100, 50),
# (10000, 100, 75),
# (10000, 1500, 25),
# (10000, 1500, 50),
# (10000, 1500, 75),
# (10000, 5000, 25),
# (10000, 5000, 50),
(10000, 5000, 75),
]
def _fetch_first_model_id(host: str, port: str) -> str:
url = f"http://{host}:{port}/v1/models"
try:
req = urllib.request.Request(url, method="GET")
with urllib.request.urlopen(req, timeout=60) as resp:
data = json.load(resp)
except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError) as e:
raise RuntimeError(f"request failed for {url}: {e}") from e
for item in data.get("data") or []:
mid = item.get("id")
if mid:
return str(mid)
raise RuntimeError(f"Failed to parse PRETRAINED_ID from {url}")
def _run_benchmark(cmd: list[str], cwd: Path) -> int:
return subprocess.run(cmd, cwd=cwd).returncode
def _build_command(
    bench_rel: str,
    model: str,
    port: str,
    scenario_dir: Path,
    input_tokens: int,
    output_tokens: int,
    parallel_requests: int,
    monitor_scope: str,
) -> list[str]:
    """Build the ``benchmark_serving.py`` argv for one (port, scenario) pair."""
    cmd: list[str] = [
        sys.executable,
        bench_rel,
        "--backend",
        "vllm",
        "--model",
        model,
        "--dataset-name",
        "random",
        "--random-input-len",
        str(input_tokens),
        "--ignore-eos",
        "--random-output-len",
        str(output_tokens),
        "--max-concurrency",
        str(parallel_requests),
        "--num-prompts",
        "500",
        "--result-dir",
        str(scenario_dir),
        "--percentile-metrics",
        "ttft,tpot,itl,e2el",
        "--metric-percentiles",
        "25,50,75,90,95,99",
        "--host",
        "127.0.0.1",
        "--port",
        port,
        "--enable-device-monitor",
        "npu",
        "--save-result",
    ]
    if monitor_scope:
        cmd.extend(["--monitor-scope", monitor_scope])
    return cmd


def _resolve_models(ports: list[str]) -> dict[str, str]:
    """Map each port to the model id it serves.

    A non-empty ``PRETRAINED_ID`` environment variable overrides discovery
    for every port; otherwise each server's ``/v1/models`` endpoint is
    queried concurrently.

    Raises:
        RuntimeError: if any server query fails (propagated from
            ``_fetch_first_model_id``).
    """
    pretrained_global = os.environ.get("PRETRAINED_ID", "").strip()
    if pretrained_global:
        return {p: pretrained_global for p in ports}
    model_by_port: dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=len(ports)) as pool:
        future_map = {
            pool.submit(_fetch_first_model_id, "127.0.0.1", p): p for p in ports
        }
        for fut in as_completed(future_map):
            model_by_port[future_map[fut]] = fut.result()
    return model_by_port


def main() -> int:
    """Run every scenario in ``TRIPLES`` against all ports, one triple at a time.

    For each triple, one benchmark subprocess per port runs concurrently in a
    thread pool. The sweep aborts on the first triple with any failing port.

    Returns:
        0 on success, 1 for setup errors (missing paths, model discovery
        failure), otherwise the exit code of the first failing benchmark.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
    )
    parser.add_argument("results_suffix", help="Suffix for results directory name")
    parser.add_argument(
        "ports",
        nargs="+",
        help="Server ports (e.g. 8000 8001 8002 8003); benchmarks run in parallel per triple",
    )
    args = parser.parse_args()
    results_suffix = args.results_suffix
    ports = [str(p) for p in args.ports]

    root = Path.cwd()
    vllm_dir = root / "vllm"
    if not vllm_dir.is_dir():
        print(f"Error: vllm directory not found: {vllm_dir}", file=sys.stderr)
        return 1
    bench = vllm_dir / "benchmarks" / "benchmark_serving.py"
    if not bench.is_file():
        print(f"Error: benchmark script not found: {bench}", file=sys.stderr)
        return 1
    # Subprocesses run with cwd=vllm_dir, so the script path must be relative.
    bench_rel = str(bench.relative_to(vllm_dir))

    monitor_scope = os.environ.get("MONITOR_SCOPE", "").strip()
    try:
        model_by_port = _resolve_models(ports)
    except RuntimeError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Pre-create per-port results directories up front.
    for port in ports:
        rd = root / "vllm-result" / f"results-{results_suffix}_{port}"
        rd.mkdir(parents=True, exist_ok=True)

    for input_tokens, output_tokens, parallel_requests in TRIPLES:
        cmd_by_port: dict[str, list[str]] = {}
        for port in ports:
            results_dir = root / "vllm-result" / f"results-{results_suffix}_{port}"
            scenario_dir = results_dir / f"{input_tokens}.{output_tokens}.{parallel_requests}"
            scenario_dir.mkdir(parents=True, exist_ok=True)
            cmd_by_port[port] = _build_command(
                bench_rel,
                model_by_port[port],
                port,
                scenario_dir,
                input_tokens,
                output_tokens,
                parallel_requests,
                monitor_scope,
            )
        print(
            f"Parallel run (ports {', '.join(ports)}): "
            f"{input_tokens}.{output_tokens}.{parallel_requests}",
            flush=True,
        )
        results_by_port: dict[str, int] = {}
        with ThreadPoolExecutor(max_workers=len(ports)) as pool:
            future_map = {
                pool.submit(_run_benchmark, cmd, vllm_dir): port
                for port, cmd in cmd_by_port.items()
            }
            for fut in as_completed(future_map):
                port = future_map[fut]
                rc = fut.result()
                results_by_port[port] = rc
                print(f" port {port} finished with exit code {rc}", flush=True)
        failed = [(p, r) for p, r in results_by_port.items() if r != 0]
        if failed:
            for p, r in failed:
                print(f"Error: port {p} exit {r}", file=sys.stderr)
                # Echo the exact failing command line to ease reproduction.
                print(" ".join(cmd_by_port[p]), file=sys.stderr)
            # Abort the sweep; propagate the first failure's exit code.
            return failed[0][1]
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment