Skip to content

Instantly share code, notes, and snippets.

@aurotripathy
Created March 27, 2026 12:15
Show Gist options
  • Select an option

  • Save aurotripathy/c4c303d52269d31fc0c0bf498a796ad0 to your computer and use it in GitHub Desktop.

Select an option

Save aurotripathy/c4c303d52269d31fc0c0bf498a796ad0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Run vLLM ``benchmark_serving`` scenarios against local OpenAI-compatible servers.
Originally derived from ``script_run_jio.sh``. For each scenario triple in ``TRIPLES``,
starts one ``benchmark_serving.py`` subprocess per port in parallel (thread pool).
"""
from __future__ import annotations
import argparse
import json
import os
import subprocess
import sys
import urllib.error
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
# Triples: (input_tokens, output_tokens, parallel_requests); grid inherited from script_run_jio.sh
# NOTE: the full sweep (input in {500, 2500, 5000, 10000} x output in
# {50, 100, 1500, 5000} x concurrency in {25, 50, 75}) is kept here commented
# out so individual scenarios can be re-enabled by uncommenting; only the
# final triple is currently active.
TRIPLES: list[tuple[int, int, int]] = [
# (500, 50, 25),
# (500, 50, 50),
# (500, 50, 75),
# (500, 100, 25),
# (500, 100, 50),
# (500, 100, 75),
# (500, 1500, 25),
# (500, 1500, 50),
# (500, 1500, 75),
# (500, 5000, 25),
# (500, 5000, 50),
# (500, 5000, 75),
# (2500, 50, 25),
# (2500, 50, 50),
# (2500, 50, 75),
# (2500, 100, 25),
# (2500, 100, 50),
# (2500, 100, 75),
# (2500, 1500, 25),
# (2500, 1500, 50),
# (2500, 1500, 75),
# (2500, 5000, 25),
# (2500, 5000, 50),
# (2500, 5000, 75),
# (5000, 50, 25),
# (5000, 50, 50),
# (5000, 50, 75),
# (5000, 100, 25),
# (5000, 100, 50),
# (5000, 100, 75),
# (5000, 1500, 25),
# (5000, 1500, 50),
# (5000, 1500, 75),
# (5000, 5000, 25),
# (5000, 5000, 50),
# (5000, 5000, 75),
# (10000, 50, 25),
# (10000, 50, 50),
# (10000, 50, 75),
# (10000, 100, 25),
# (10000, 100, 50),
# (10000, 100, 75),
# (10000, 1500, 25),
# (10000, 1500, 50),
# (10000, 1500, 75),
# (10000, 5000, 25),
# (10000, 5000, 50),
(10000, 5000, 75),
]
def _fetch_first_model_id(host: str, port: str) -> str:
url = f"http://{host}:{port}/v1/models"
try:
req = urllib.request.Request(url, method="GET")
with urllib.request.urlopen(req, timeout=60) as resp:
data = json.load(resp)
except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError) as e:
raise RuntimeError(f"request failed for {url}: {e}") from e
for item in data.get("data") or []:
mid = item.get("id")
if mid:
return str(mid)
raise RuntimeError(f"Failed to parse PRETRAINED_ID from {url}")
def _run_benchmark(cmd: list[str], cwd: Path) -> int:
return subprocess.run(cmd, cwd=cwd).returncode
def _build_command(
    bench_rel: str,
    model: str,
    port: str,
    scenario_dir: Path,
    input_tokens: int,
    output_tokens: int,
    parallel_requests: int,
    monitor_scope: str,
) -> list[str]:
    """Build the ``benchmark_serving.py`` argv for one (port, scenario) pair."""
    cmd: list[str] = [
        sys.executable,
        bench_rel,
        "--backend",
        "vllm",
        "--model",
        model,
        "--dataset-name",
        "random",
        "--random-input-len",
        str(input_tokens),
        "--ignore-eos",
        "--random-output-len",
        str(output_tokens),
        "--max-concurrency",
        str(parallel_requests),
        "--num-prompts",
        "500",
        "--result-dir",
        str(scenario_dir),
        "--percentile-metrics",
        "ttft,tpot,itl,e2el",
        "--metric-percentiles",
        "25,50,75,90,95,99",
        "--host",
        "127.0.0.1",
        "--port",
        port,
        "--enable-device-monitor",
        "npu",
        "--save-result",
    ]
    if monitor_scope:
        cmd.extend(["--monitor-scope", monitor_scope])
    return cmd


def _resolve_models(ports: list[str]) -> dict[str, str]:
    """Map each port to the model id it serves.

    A non-empty ``PRETRAINED_ID`` environment variable overrides discovery
    for every port; otherwise each server's ``/v1/models`` endpoint is
    queried concurrently.

    Raises:
        RuntimeError: if any server query fails (propagated from
            ``_fetch_first_model_id``).
    """
    pretrained_global = os.environ.get("PRETRAINED_ID", "").strip()
    if pretrained_global:
        return {p: pretrained_global for p in ports}
    model_by_port: dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=len(ports)) as pool:
        future_map = {
            pool.submit(_fetch_first_model_id, "127.0.0.1", p): p for p in ports
        }
        for fut in as_completed(future_map):
            model_by_port[future_map[fut]] = fut.result()
    return model_by_port


def main() -> int:
    """Run every scenario in ``TRIPLES`` against all ports, one triple at a time.

    For each triple, one benchmark subprocess per port runs concurrently in a
    thread pool. The sweep aborts on the first triple with any failing port.

    Returns:
        0 on success, 1 for setup errors (missing paths, model discovery
        failure), otherwise the exit code of the first failing benchmark.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
    )
    parser.add_argument("results_suffix", help="Suffix for results directory name")
    parser.add_argument(
        "ports",
        nargs="+",
        help="Server ports (e.g. 8000 8001 8002 8003); benchmarks run in parallel per triple",
    )
    args = parser.parse_args()
    results_suffix = args.results_suffix
    ports = [str(p) for p in args.ports]

    root = Path.cwd()
    vllm_dir = root / "vllm"
    if not vllm_dir.is_dir():
        print(f"Error: vllm directory not found: {vllm_dir}", file=sys.stderr)
        return 1
    bench = vllm_dir / "benchmarks" / "benchmark_serving.py"
    if not bench.is_file():
        print(f"Error: benchmark script not found: {bench}", file=sys.stderr)
        return 1
    # Subprocesses run with cwd=vllm_dir, so the script path must be relative.
    bench_rel = str(bench.relative_to(vllm_dir))

    monitor_scope = os.environ.get("MONITOR_SCOPE", "").strip()
    try:
        model_by_port = _resolve_models(ports)
    except RuntimeError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Pre-create per-port results directories up front.
    for port in ports:
        rd = root / "vllm-result" / f"results-{results_suffix}_{port}"
        rd.mkdir(parents=True, exist_ok=True)

    for input_tokens, output_tokens, parallel_requests in TRIPLES:
        cmd_by_port: dict[str, list[str]] = {}
        for port in ports:
            results_dir = root / "vllm-result" / f"results-{results_suffix}_{port}"
            scenario_dir = results_dir / f"{input_tokens}.{output_tokens}.{parallel_requests}"
            scenario_dir.mkdir(parents=True, exist_ok=True)
            cmd_by_port[port] = _build_command(
                bench_rel,
                model_by_port[port],
                port,
                scenario_dir,
                input_tokens,
                output_tokens,
                parallel_requests,
                monitor_scope,
            )
        print(
            f"Parallel run (ports {', '.join(ports)}): "
            f"{input_tokens}.{output_tokens}.{parallel_requests}",
            flush=True,
        )
        results_by_port: dict[str, int] = {}
        with ThreadPoolExecutor(max_workers=len(ports)) as pool:
            future_map = {
                pool.submit(_run_benchmark, cmd, vllm_dir): port
                for port, cmd in cmd_by_port.items()
            }
            for fut in as_completed(future_map):
                port = future_map[fut]
                rc = fut.result()
                results_by_port[port] = rc
                print(f" port {port} finished with exit code {rc}", flush=True)
        failed = [(p, r) for p, r in results_by_port.items() if r != 0]
        if failed:
            for p, r in failed:
                print(f"Error: port {p} exit {r}", file=sys.stderr)
                # Echo the exact failing command line to ease reproduction.
                print(" ".join(cmd_by_port[p]), file=sys.stderr)
            # Abort the sweep; propagate the first failure's exit code.
            return failed[0][1]
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment