aurotripathy · March 26, 2026 20:55
diff --git a/start-stop-furiosa-llm-containers.py b/start-stop-furiosa-llm-containers.py
 #!/usr/bin/env python3
 """Launch four furiosa-llm Docker containers on npu:0..3 with non-conflicting host ports.

 Uses ``docker run -d --rm`` so all four can run at once. Host ports:
  instance i -> 8000+i on host and inside the container.

 Requires HF_TOKEN in the environment if the image needs it (same as your shell).

 After each ``docker run``, the script optionally polls ``GET http://<ready-host>:<port><ready-path>``
 until it receives HTTP 2xx (default path ``/v1/models``), then continues with the next container.

 After startup the process waits; Ctrl-C (or SIGTERM) stops and removes all containers.
 """

 from __future__ import annotations

 import argparse
 import os
 import signal
 import subprocess
 import sys
 import threading
 import time
 import urllib.error
 import urllib.request


 def _container_pid(cid: str) -> int | None:
    """Return the host PID of container ``cid``'s main process, or ``None``.

    Runs ``docker inspect -f '{{.State.Pid}}' <cid>``.  ``None`` is returned
    when the docker CLI cannot be launched, when the command exits non-zero,
    when its output is not an integer, or when the reported PID is not
    positive.
    """
    try:
        proc = subprocess.run(
            ["docker", "inspect", "-f", "{{.State.Pid}}", cid],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        # The PID is informational only, so an unlaunchable docker CLI is
        # reported like every other failure instead of escaping as an exception.
        return None
    if proc.returncode != 0:
        return None
    try:
        pid = int((proc.stdout or "").strip())
    except ValueError:
        return None
    # Non-positive values are treated as "no PID available".
    return pid if pid > 0 else None


 def _wait_container_pid(cid: str, attempts: int = 30, delay: float = 0.1) -> int | None:
    """Poll ``_container_pid`` until it yields a PID or the attempts run out.

    Up to ``attempts`` lookups are made, sleeping ``delay`` seconds after each
    unsuccessful one.  A final lookup is then made and its result (possibly
    ``None``) is returned unchanged.
    """
    remaining = attempts
    while remaining > 0:
        found = _container_pid(cid)
        if found is not None:
            return found
        time.sleep(delay)
        remaining -= 1
    # One last lookup after the final sleep.
    return _container_pid(cid)


 def _wait_http_ready(
    host: str,
    port: int,
    path: str,
    timeout_sec: float,
    interval: float,
 ) -> bool:
    """Poll ``GET http://host:port/path`` until it answers with HTTP 2xx.

    Args:
        host: Host name or address to probe.
        port: TCP port to probe.
        path: Request path (e.g. ``/v1/models``); a missing leading ``/`` is added.
        timeout_sec: Overall time budget in seconds; no probe is sent when it
            is zero or negative.
        interval: Pause in seconds between two probes.

    Returns:
        True as soon as a probe receives a 2xx response, False when the time
        budget runs out first.
    """
    import http.client  # local import: only the exception type is needed here

    # Without the leading slash the path would be glued onto the port number.
    if not path.startswith("/"):
        path = "/" + path
    url = f"http://{host}:{port}{path}"
    open_timeout = min(15.0, max(5.0, interval * 3))
    warned_404 = False

    deadline = time.monotonic() + timeout_sec
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(url, method="GET")
            with urllib.request.urlopen(req, timeout=open_timeout) as resp:
                if 200 <= resp.getcode() < 300:
                    return True
        except urllib.error.HTTPError as e:
            # 503 / 502 while loading — keep polling; 404 might mean wrong path.
            # The hint is printed once instead of on every poll.
            if e.code == 404 and not warned_404:
                warned_404 = True
                print(
                    f"  readiness check got 404 for {url} — wrong --ready-path?",
                    file=sys.stderr,
                )
        except (OSError, http.client.HTTPException):
            # Connection refused/reset, timeouts (URLError and TimeoutError are
            # OSError subclasses) or a malformed reply from a server that is
            # still starting: not ready yet, try again.
            pass
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(max(0.0, min(interval, remaining)))
    return False


 def _docker_kill_all(container_ids: list[str]) -> None:
    """Run ``docker kill`` for every container in ``container_ids``.

    Cleanup is best-effort: a failure for one container is reported on stderr
    and the remaining containers are still attempted, so a container that
    could not be killed does not go unnoticed.
    """
    for cid in container_ids:
        print(f"Killing container {cid}")
        try:
            proc = subprocess.run(
                ["docker", "kill", cid],
                capture_output=True,
                text=True,
                check=False,
            )
        except OSError as exc:
            # The docker CLI could not be launched; keep going with the rest.
            print(f"  could not run docker kill for {cid}: {exc}", file=sys.stderr)
            continue
        if proc.returncode != 0:
            detail = (proc.stderr or proc.stdout or "").strip()
            print(
                f"  docker kill {cid} failed (exit {proc.returncode}): {detail}",
                file=sys.stderr,
            )


 def main() -> int:
    """Start four containers (npu:0..3, ports 8000..8003), then wait for a stop signal.

    Each container is started with ``docker run -d --rm``.  When
    ``--ready-timeout`` is positive, the script waits for the instance to pass
    the HTTP readiness probe before launching the next one.  SIGINT/SIGTERM
    are honoured during startup as well as afterwards, and every container
    that was started is killed on the way out, whatever the exit path.

    Returns:
        0 after a stop signal once startup has completed, 1 when a readiness
        probe times out, 130 when a stop signal arrives during startup, or
        the exit status of ``docker run`` when launching a container fails.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--image", default="furiosaai/furiosa-llm:latest")
    parser.add_argument(
        "--model",
        default="furiosa-ai/Llama-3.1-8B-Instruct",
    )
    parser.add_argument(
        "--ready-timeout",
        type=float,
        default=900.0,
        help="Seconds to wait per container for HTTP readiness (0 = do not wait).",
    )
    parser.add_argument(
        "--ready-interval",
        type=float,
        default=2.0,
        help="Seconds between readiness polls.",
    )
    parser.add_argument(
        "--ready-host",
        default="127.0.0.1",
        help="Host to probe (published container port on this machine).",
    )
    parser.add_argument(
        "--ready-path",
        default="/v1/models",
        help="GET path; OpenAI-compatible stacks usually serve this when the API is up.",
    )
    args = parser.parse_args()

    hf_token = os.environ.get("HF_TOKEN", "")
    hf_cache = os.path.expanduser("~/.cache/huggingface")
    # The token is handed to the docker CLI through its environment and
    # forwarded with a value-less ``--env HF_TOKEN``, so the secret never
    # appears on a command line (and therefore not in the process list).
    docker_env = {**os.environ, "HF_TOKEN": hf_token}

    container_ids: list[str] = []
    stop = threading.Event()
    # True only while the main thread is blocked in the readiness probe.
    probing = False

    def handle_stop(_signum: int, _frame) -> None:
        stop.set()
        if probing:
            # The probe loop knows nothing about ``stop``; break out of it so
            # a stop request is not ignored for the rest of the timeout.
            raise KeyboardInterrupt

    signal.signal(signal.SIGINT, handle_stop)
    signal.signal(signal.SIGTERM, handle_stop)

    try:
        for i in range(4):
            if stop.is_set():
                print("Interrupted during startup.", file=sys.stderr)
                return 130
            port_num = 8000 + i
            cmd: list[str] = [
                "docker",
                "run",
                "-d",
                "--rm",
                "-p",
                f"{port_num}:{port_num}",
                "--device",
                "/dev/rngd:/dev/rngd",
                "--security-opt",
                "seccomp=unconfined",
                "--env",
                "HF_TOKEN",
                "-v",
                f"{hf_cache}:/root/.cache/huggingface",
                args.image,
                "serve",
                args.model,
                "--port",
                str(port_num),
                "--device",
                f"npu:{i}",
            ]
            print(f"Starting instance {i} (npu:{i}) -> http://127.0.0.1:{port_num}/ ...")
            proc = subprocess.run(cmd, capture_output=True, text=True, env=docker_env)
            if proc.returncode != 0:
                print(proc.stderr or proc.stdout, file=sys.stderr)
                return proc.returncode
            # ``docker run -d`` prints the new container's ID on stdout.
            cid = (proc.stdout or "").strip()
            if cid:
                container_ids.append(cid)

            if args.ready_timeout > 0 and cid:
                print(
                    f"  Waiting for HTTP ready on "
                    f"http://{args.ready_host}:{port_num}{args.ready_path} "
                    f"(up to {args.ready_timeout:.0f}s)…",
                    flush=True,
                )
                try:
                    probing = True
                    # A signal that landed just before the flag flipped only
                    # set the event; honour it here.
                    if stop.is_set():
                        raise KeyboardInterrupt
                    ok = _wait_http_ready(
                        args.ready_host,
                        port_num,
                        args.ready_path,
                        timeout_sec=args.ready_timeout,
                        interval=args.ready_interval,
                    )
                except KeyboardInterrupt:
                    probing = False
                    print("Interrupted during startup.", file=sys.stderr)
                    return 130
                finally:
                    probing = False
                if not ok:
                    print(
                        f"  Timed out waiting for instance {i} on port {port_num}.",
                        file=sys.stderr,
                    )
                    return 1

        rows: list[tuple[int, str, int | None]] = []
        for i, cid in enumerate(container_ids):
            pid = _wait_container_pid(cid)
            rows.append((i, cid, pid))

        print("Container IDs:", " ".join(container_ids))
        for i, cid, pid in rows:
            pid_s = str(pid) if pid is not None else "?"
            print(f"  instance {i}: container={cid[:12]}…  host_pid={pid_s}")

        print("Running — Ctrl-C (or SIGTERM) to stop all containers.")
        # Returns immediately if a stop request already arrived.
        stop.wait()
        return 0
    finally:
        # Runs on every exit path, including the early returns above.
        if container_ids:
            print("Stopping containers…")
            _docker_kill_all(container_ids)


 # Script entry point: the process exit status is whatever main() returns.
 if __name__ == "__main__":
    sys.exit(main())
	#!/usr/bin/env python3
	"""Launch four furiosa-llm Docker containers on npu:0..3 with non-conflicting host ports.

	Uses ``docker run -d --rm`` so all four can run at once. Host ports:
	instance i -> 8000+i on host and inside the container.

	Requires HF_TOKEN in the environment if the image needs it (same as your shell).

	After each ``docker run``, the script optionally polls ``GET http://<ready-host>:<port><ready-path>``
	until it receives HTTP 2xx (default path ``/v1/models``), then continues with the next container.

	After startup the process waits; Ctrl-C (or SIGTERM) stops and removes all containers.
	"""

	from __future__ import annotations

	import argparse
	import os
	import signal
	import subprocess
	import sys
	import threading
	import time
	import urllib.error
	import urllib.request


	def _container_pid(cid: str) -> int | None:
	    """Return the host PID of container ``cid``'s main process, or ``None``.

	    Runs ``docker inspect -f '{{.State.Pid}}' <cid>``.  ``None`` is returned
	    when the docker CLI cannot be launched, when the command exits non-zero,
	    when its output is not an integer, or when the reported PID is not
	    positive.
	    """
	    try:
	        proc = subprocess.run(
	            ["docker", "inspect", "-f", "{{.State.Pid}}", cid],
	            capture_output=True,
	            text=True,
	            check=False,
	        )
	    except OSError:
	        # The PID is informational only, so an unlaunchable docker CLI is
	        # reported like every other failure instead of escaping as an exception.
	        return None
	    if proc.returncode != 0:
	        return None
	    try:
	        pid = int((proc.stdout or "").strip())
	    except ValueError:
	        return None
	    # Non-positive values are treated as "no PID available".
	    return pid if pid > 0 else None


	def _wait_container_pid(cid: str, attempts: int = 30, delay: float = 0.1) -> int | None:
	    """Poll ``_container_pid`` until it yields a PID or the attempts run out.

	    Up to ``attempts`` lookups are made, sleeping ``delay`` seconds after each
	    unsuccessful one.  A final lookup is then made and its result (possibly
	    ``None``) is returned unchanged.
	    """
	    for _ in range(attempts):
	        pid = _container_pid(cid)
	        if pid is not None:
	            return pid
	        time.sleep(delay)
	    # One last lookup after the final sleep.
	    return _container_pid(cid)


	def _wait_http_ready(
	    host: str,
	    port: int,
	    path: str,
	    timeout_sec: float,
	    interval: float,
	) -> bool:
	    """Poll ``GET http://host:port/path`` until it answers with HTTP 2xx.

	    Args:
	        host: Host name or address to probe.
	        port: TCP port to probe.
	        path: Request path (e.g. ``/v1/models``); a missing leading ``/`` is added.
	        timeout_sec: Overall time budget in seconds; no probe is sent when it
	            is zero or negative.
	        interval: Pause in seconds between two probes.

	    Returns:
	        True as soon as a probe receives a 2xx response, False when the time
	        budget runs out first.
	    """
	    import http.client  # local import: only the exception type is needed here

	    # Without the leading slash the path would be glued onto the port number.
	    if not path.startswith("/"):
	        path = "/" + path
	    url = f"http://{host}:{port}{path}"
	    open_timeout = min(15.0, max(5.0, interval * 3))
	    warned_404 = False

	    deadline = time.monotonic() + timeout_sec
	    while time.monotonic() < deadline:
	        try:
	            req = urllib.request.Request(url, method="GET")
	            with urllib.request.urlopen(req, timeout=open_timeout) as resp:
	                if 200 <= resp.getcode() < 300:
	                    return True
	        except urllib.error.HTTPError as e:
	            # 503 / 502 while loading — keep polling; 404 might mean wrong path.
	            # The hint is printed once instead of on every poll.
	            if e.code == 404 and not warned_404:
	                warned_404 = True
	                print(
	                    f" readiness check got 404 for {url} — wrong --ready-path?",
	                    file=sys.stderr,
	                )
	        except (OSError, http.client.HTTPException):
	            # Connection refused/reset, timeouts (URLError and TimeoutError are
	            # OSError subclasses) or a malformed reply from a server that is
	            # still starting: not ready yet, try again.
	            pass
	        remaining = deadline - time.monotonic()
	        if remaining <= 0:
	            break
	        # Never sleep past the deadline.
	        time.sleep(max(0.0, min(interval, remaining)))
	    return False


	def _docker_kill_all(container_ids: list[str]) -> None:
	    """Run ``docker kill`` for every container in ``container_ids``.

	    Cleanup is best-effort: a failure for one container is reported on stderr
	    and the remaining containers are still attempted, so a container that
	    could not be killed does not go unnoticed.
	    """
	    for cid in container_ids:
	        print(f"Killing container {cid}")
	        try:
	            proc = subprocess.run(
	                ["docker", "kill", cid],
	                capture_output=True,
	                text=True,
	                check=False,
	            )
	        except OSError as exc:
	            # The docker CLI could not be launched; keep going with the rest.
	            print(f" could not run docker kill for {cid}: {exc}", file=sys.stderr)
	            continue
	        if proc.returncode != 0:
	            detail = (proc.stderr or proc.stdout or "").strip()
	            print(
	                f" docker kill {cid} failed (exit {proc.returncode}): {detail}",
	                file=sys.stderr,
	            )


	def main() -> int:
	    """Start four containers (npu:0..3, ports 8000..8003), then wait for a stop signal.

	    Each container is started with ``docker run -d --rm``.  When
	    ``--ready-timeout`` is positive, the script waits for the instance to pass
	    the HTTP readiness probe before launching the next one.  SIGINT/SIGTERM
	    are honoured during startup as well as afterwards, and every container
	    that was started is killed on the way out, whatever the exit path.

	    Returns:
	        0 after a stop signal once startup has completed, 1 when a readiness
	        probe times out, 130 when a stop signal arrives during startup, or
	        the exit status of ``docker run`` when launching a container fails.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    parser.add_argument("--image", default="furiosaai/furiosa-llm:latest")
	    parser.add_argument(
	        "--model",
	        default="furiosa-ai/Llama-3.1-8B-Instruct",
	    )
	    parser.add_argument(
	        "--ready-timeout",
	        type=float,
	        default=900.0,
	        help="Seconds to wait per container for HTTP readiness (0 = do not wait).",
	    )
	    parser.add_argument(
	        "--ready-interval",
	        type=float,
	        default=2.0,
	        help="Seconds between readiness polls.",
	    )
	    parser.add_argument(
	        "--ready-host",
	        default="127.0.0.1",
	        help="Host to probe (published container port on this machine).",
	    )
	    parser.add_argument(
	        "--ready-path",
	        default="/v1/models",
	        help="GET path; OpenAI-compatible stacks usually serve this when the API is up.",
	    )
	    args = parser.parse_args()

	    hf_token = os.environ.get("HF_TOKEN", "")
	    hf_cache = os.path.expanduser("~/.cache/huggingface")
	    # The token is handed to the docker CLI through its environment and
	    # forwarded with a value-less ``--env HF_TOKEN``, so the secret never
	    # appears on a command line (and therefore not in the process list).
	    docker_env = {**os.environ, "HF_TOKEN": hf_token}

	    container_ids: list[str] = []
	    stop = threading.Event()
	    # True only while the main thread is blocked in the readiness probe.
	    probing = False

	    def handle_stop(_signum: int, _frame) -> None:
	        stop.set()
	        if probing:
	            # The probe loop knows nothing about ``stop``; break out of it so
	            # a stop request is not ignored for the rest of the timeout.
	            raise KeyboardInterrupt

	    signal.signal(signal.SIGINT, handle_stop)
	    signal.signal(signal.SIGTERM, handle_stop)

	    try:
	        for i in range(4):
	            if stop.is_set():
	                print("Interrupted during startup.", file=sys.stderr)
	                return 130
	            port_num = 8000 + i
	            cmd: list[str] = [
	                "docker",
	                "run",
	                "-d",
	                "--rm",
	                "-p",
	                f"{port_num}:{port_num}",
	                "--device",
	                "/dev/rngd:/dev/rngd",
	                "--security-opt",
	                "seccomp=unconfined",
	                "--env",
	                "HF_TOKEN",
	                "-v",
	                f"{hf_cache}:/root/.cache/huggingface",
	                args.image,
	                "serve",
	                args.model,
	                "--port",
	                str(port_num),
	                "--device",
	                f"npu:{i}",
	            ]
	            print(f"Starting instance {i} (npu:{i}) -> http://127.0.0.1:{port_num}/ ...")
	            proc = subprocess.run(cmd, capture_output=True, text=True, env=docker_env)
	            if proc.returncode != 0:
	                print(proc.stderr or proc.stdout, file=sys.stderr)
	                return proc.returncode
	            # ``docker run -d`` prints the new container's ID on stdout.
	            cid = (proc.stdout or "").strip()
	            if cid:
	                container_ids.append(cid)

	            if args.ready_timeout > 0 and cid:
	                print(
	                    f" Waiting for HTTP ready on "
	                    f"http://{args.ready_host}:{port_num}{args.ready_path} "
	                    f"(up to {args.ready_timeout:.0f}s)…",
	                    flush=True,
	                )
	                try:
	                    probing = True
	                    # A signal that landed just before the flag flipped only
	                    # set the event; honour it here.
	                    if stop.is_set():
	                        raise KeyboardInterrupt
	                    ok = _wait_http_ready(
	                        args.ready_host,
	                        port_num,
	                        args.ready_path,
	                        timeout_sec=args.ready_timeout,
	                        interval=args.ready_interval,
	                    )
	                except KeyboardInterrupt:
	                    probing = False
	                    print("Interrupted during startup.", file=sys.stderr)
	                    return 130
	                finally:
	                    probing = False
	                if not ok:
	                    print(
	                        f" Timed out waiting for instance {i} on port {port_num}.",
	                        file=sys.stderr,
	                    )
	                    return 1

	        rows: list[tuple[int, str, int | None]] = []
	        for i, cid in enumerate(container_ids):
	            pid = _wait_container_pid(cid)
	            rows.append((i, cid, pid))

	        print("Container IDs:", " ".join(container_ids))
	        for i, cid, pid in rows:
	            pid_s = str(pid) if pid is not None else "?"
	            print(f" instance {i}: container={cid[:12]}… host_pid={pid_s}")

	        print("Running — Ctrl-C (or SIGTERM) to stop all containers.")
	        # Returns immediately if a stop request already arrived.
	        stop.wait()
	        return 0
	    finally:
	        # Runs on every exit path, including the early returns above.
	        if container_ids:
	            print("Stopping containers…")
	            _docker_kill_all(container_ids)


	# Script entry point: the process exit status is whatever main() returns.
	if __name__ == "__main__":
	    raise SystemExit(main())
No results found