Skip to content

Instantly share code, notes, and snippets.

@aurotripathy
Created March 26, 2026 20:55
Show Gist options
  • Select an option

  • Save aurotripathy/ed5b3b92842ba1f8be000d8942e17f62 to your computer and use it in GitHub Desktop.

Select an option

Save aurotripathy/ed5b3b92842ba1f8be000d8942e17f62 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Launch four furiosa-llm Docker containers on npu:0..3 with non-conflicting host ports.
Uses ``docker run -d --rm`` so all four can run at once. Host ports:
instance i -> 8000+i on host and inside the container.
Requires HF_TOKEN in the environment if the image needs it (same as your shell).
After each ``docker run``, the script optionally polls ``GET http://<ready-host>:<port><ready-path>``
until it receives HTTP 2xx (default path ``/v1/models``), then continues with the next container.
After startup the process waits; Ctrl-C (or SIGTERM) stops and removes all containers.
"""
from __future__ import annotations
import argparse
import os
import signal
import subprocess
import sys
import threading
import time
import urllib.error
import urllib.request
def _container_pid(cid: str) -> int | None:
proc = subprocess.run(
["docker", "inspect", "-f", "{{.State.Pid}}", cid],
capture_output=True,
text=True,
check=False,
)
if proc.returncode != 0:
return None
try:
pid = int((proc.stdout or "").strip())
except ValueError:
return None
return pid if pid > 0 else None
def _wait_container_pid(cid: str, attempts: int = 30, delay: float = 0.1) -> int | None:
    """Retry ``_container_pid`` until it yields a PID or the attempts run out.

    Makes up to ``attempts`` lookups, sleeping ``delay`` seconds after each
    one that returns ``None``. Once those are exhausted, one final lookup is
    made and its result (possibly ``None``) is returned as-is.
    """
    for _attempt in range(attempts):
        found = _container_pid(cid)
        if found is None:
            time.sleep(delay)
            continue
        return found
    # Last chance after the final sleep.
    return _container_pid(cid)
def _wait_http_ready(
host: str,
port: int,
path: str,
timeout_sec: float,
interval: float,
) -> bool:
"""Return True once the server returns HTTP 2xx on GET host:port/path (e.g. /v1/models)."""
deadline = time.monotonic() + timeout_sec
while time.monotonic() < deadline:
url = f"http://{host}:{port}{path}"
try:
req = urllib.request.Request(url, method="GET")
open_timeout = min(15.0, max(5.0, interval * 3))
with urllib.request.urlopen(req, timeout=open_timeout) as resp:
code = resp.getcode()
if 200 <= code < 300:
return True
except urllib.error.HTTPError as e:
# 503 / 502 while loading — keep polling; 404 might mean wrong path
if e.code == 404:
print(
f" readiness check got 404 for {url} — wrong --ready-path?",
file=sys.stderr,
)
except (urllib.error.URLError, TimeoutError, OSError):
pass
time.sleep(interval)
return False
def _docker_kill_all(container_ids: list[str]) -> None:
for cid in container_ids:
print(f"Killing container {cid}")
subprocess.run(
["docker", "kill", cid],
capture_output=True,
text=True,
check=False,
)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the launcher."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--image", default="furiosaai/furiosa-llm:latest")
    parser.add_argument(
        "--model",
        default="furiosa-ai/Llama-3.1-8B-Instruct",
    )
    parser.add_argument(
        "--ready-timeout",
        type=float,
        default=900.0,
        help="Seconds to wait per container for HTTP readiness (0 = do not wait).",
    )
    parser.add_argument(
        "--ready-interval",
        type=float,
        default=2.0,
        help="Seconds between readiness polls.",
    )
    parser.add_argument(
        "--ready-host",
        default="127.0.0.1",
        help="Host to probe (published container port on this machine).",
    )
    parser.add_argument(
        "--ready-path",
        default="/v1/models",
        help="GET path; OpenAI-compatible stacks usually serve this when the API is up.",
    )
    return parser


def _docker_run_command(
    image: str,
    model: str,
    npu_index: int,
    port_num: int,
    hf_token: str,
    hf_cache: str,
) -> list[str]:
    """Return the ``docker run`` argv that serves *model* on ``npu:<npu_index>``."""
    # When a token is set, pass the variable by name only: the docker CLI then
    # reads the value from the environment it inherits from this process, so
    # the secret never appears in argv (which is visible in process listings).
    # When no token is set, keep the explicit empty assignment so the
    # container sees exactly what it saw before.
    token_arg = "HF_TOKEN" if hf_token else "HF_TOKEN="
    return [
        "docker",
        "run",
        "-d",
        "--rm",
        "-p",
        f"{port_num}:{port_num}",
        "--device",
        "/dev/rngd:/dev/rngd",
        "--security-opt",
        "seccomp=unconfined",
        "--env",
        token_arg,
        "-v",
        f"{hf_cache}:/root/.cache/huggingface",
        image,
        "serve",
        model,
        "--port",
        str(port_num),
        "--device",
        f"npu:{npu_index}",
    ]


def main() -> int:
    """Start four containers, wait for a stop signal, then kill them all.

    Containers are started one after another on ``npu:0..3`` with host ports
    ``8000..8003``. When ``--ready-timeout`` is positive, each container must
    pass the HTTP readiness probe before the next one is started.

    SIGINT / SIGTERM received *during* startup aborts startup immediately;
    after startup the signal simply ends the wait. In every case the
    containers started so far are killed before returning.

    Returns:
        ``0`` after a normal stop, the ``docker run`` exit status if a
        container fails to start, ``127`` if the docker executable cannot be
        run, ``1`` on a readiness timeout, or ``128 + signal number`` when
        startup is interrupted by a signal.
    """
    args = _build_arg_parser().parse_args()
    hf_token = os.environ.get("HF_TOKEN", "")
    hf_cache = os.path.expanduser("~/.cache/huggingface")
    container_ids: list[str] = []
    stop = threading.Event()
    startup_done = threading.Event()
    signals_seen: list[int] = []

    def handle_stop(signum: int, _frame) -> None:
        signals_seen.append(signum)
        stop.set()
        if not startup_done.is_set():
            # Startup blocks in `docker run` and in readiness polling, neither
            # of which looks at `stop`. Raising here unwinds that blocking
            # work so the `finally` clause can clean up right away instead of
            # after all remaining containers have come up.
            raise KeyboardInterrupt

    signal.signal(signal.SIGINT, handle_stop)
    signal.signal(signal.SIGTERM, handle_stop)
    try:
        for i in range(4):
            port_num = 8000 + i
            cmd = _docker_run_command(
                args.image, args.model, i, port_num, hf_token, hf_cache
            )
            print(f"Starting instance {i} (npu:{i}) -> http://127.0.0.1:{port_num}/ ...")
            try:
                proc = subprocess.run(cmd, capture_output=True, text=True)
            except OSError as exc:
                # e.g. docker CLI not installed or not executable.
                print(f"Failed to execute docker: {exc}", file=sys.stderr)
                return 127
            if proc.returncode != 0:
                print(proc.stderr or proc.stdout, file=sys.stderr)
                return proc.returncode
            cid = (proc.stdout or "").strip()
            if cid:
                container_ids.append(cid)
            if args.ready_timeout > 0 and cid:
                print(
                    f" Waiting for HTTP ready on "
                    f"http://{args.ready_host}:{port_num}{args.ready_path} "
                    f"(up to {args.ready_timeout:.0f}s)…",
                    flush=True,
                )
                ok = _wait_http_ready(
                    args.ready_host,
                    port_num,
                    args.ready_path,
                    timeout_sec=args.ready_timeout,
                    interval=args.ready_interval,
                )
                if not ok:
                    print(
                        f" Timed out waiting for instance {i} on port {port_num}.",
                        file=sys.stderr,
                    )
                    return 1

        # From here on a signal only needs to wake up `stop.wait()`.
        startup_done.set()

        rows: list[tuple[int, str, int | None]] = [
            (i, cid, _wait_container_pid(cid)) for i, cid in enumerate(container_ids)
        ]
        print("Container IDs:", " ".join(container_ids))
        for i, cid, pid in rows:
            pid_s = str(pid) if pid is not None else "?"
            print(f" instance {i}: container={cid[:12]}… host_pid={pid_s}")
        print("Running — Ctrl-C (or SIGTERM) to stop all containers.")
        stop.wait()
        return 0
    except KeyboardInterrupt:
        # Further signals must not interrupt the cleanup below.
        startup_done.set()
        print("Interrupted during startup.", file=sys.stderr)
        return (128 + signals_seen[-1]) if signals_seen else 130
    finally:
        startup_done.set()
        # NOTE: a container whose `docker run` was interrupted before its ID
        # was recorded is not in this list and cannot be cleaned up here.
        if container_ids:
            print("Stopping containers…")
            _docker_kill_all(container_ids)
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment