Created
March 26, 2026 20:55
-
-
Save aurotripathy/ed5b3b92842ba1f8be000d8942e17f62 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Launch four furiosa-llm Docker containers on npu:0..3 with non-conflicting host ports. | |
| Uses ``docker run -d --rm`` so all four can run at once. Host ports: | |
| instance i -> 8000+i on host and inside the container. | |
| Requires HF_TOKEN in the environment if the image needs it (same as your shell). | |
| After each ``docker run``, the script optionally polls ``GET http://<ready-host>:<port><ready-path>`` | |
| until it receives HTTP 2xx (default path ``/v1/models``), then continues with the next container. | |
| After startup the process waits; Ctrl-C (or SIGTERM) stops and removes all containers. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import signal | |
| import subprocess | |
| import sys | |
| import threading | |
| import time | |
| import urllib.error | |
| import urllib.request | |
| def _container_pid(cid: str) -> int | None: | |
| proc = subprocess.run( | |
| ["docker", "inspect", "-f", "{{.State.Pid}}", cid], | |
| capture_output=True, | |
| text=True, | |
| check=False, | |
| ) | |
| if proc.returncode != 0: | |
| return None | |
| try: | |
| pid = int((proc.stdout or "").strip()) | |
| except ValueError: | |
| return None | |
| return pid if pid > 0 else None | |
def _wait_container_pid(cid: str, attempts: int = 30, delay: float = 0.1) -> int | None:
    """Poll ``_container_pid`` until it yields a PID or the attempts run out.

    Each failed attempt is followed by a ``delay``-second sleep; once all
    attempts are used up, one final lookup is made and its result (possibly
    None) is returned.
    """
    remaining = attempts
    while remaining > 0:
        found = _container_pid(cid)
        if found is not None:
            return found
        time.sleep(delay)
        remaining -= 1
    # Last chance after the final sleep.
    return _container_pid(cid)
| def _wait_http_ready( | |
| host: str, | |
| port: int, | |
| path: str, | |
| timeout_sec: float, | |
| interval: float, | |
| ) -> bool: | |
| """Return True once the server returns HTTP 2xx on GET host:port/path (e.g. /v1/models).""" | |
| deadline = time.monotonic() + timeout_sec | |
| while time.monotonic() < deadline: | |
| url = f"http://{host}:{port}{path}" | |
| try: | |
| req = urllib.request.Request(url, method="GET") | |
| open_timeout = min(15.0, max(5.0, interval * 3)) | |
| with urllib.request.urlopen(req, timeout=open_timeout) as resp: | |
| code = resp.getcode() | |
| if 200 <= code < 300: | |
| return True | |
| except urllib.error.HTTPError as e: | |
| # 503 / 502 while loading — keep polling; 404 might mean wrong path | |
| if e.code == 404: | |
| print( | |
| f" readiness check got 404 for {url} — wrong --ready-path?", | |
| file=sys.stderr, | |
| ) | |
| except (urllib.error.URLError, TimeoutError, OSError): | |
| pass | |
| time.sleep(interval) | |
| return False | |
| def _docker_kill_all(container_ids: list[str]) -> None: | |
| for cid in container_ids: | |
| print(f"Killing container {cid}") | |
| subprocess.run( | |
| ["docker", "kill", cid], | |
| capture_output=True, | |
| text=True, | |
| check=False, | |
| ) | |
| def main() -> int: | |
| parser = argparse.ArgumentParser(description=__doc__) | |
| parser.add_argument("--image", default="furiosaai/furiosa-llm:latest") | |
| parser.add_argument( | |
| "--model", | |
| default="furiosa-ai/Llama-3.1-8B-Instruct", | |
| ) | |
| parser.add_argument( | |
| "--ready-timeout", | |
| type=float, | |
| default=900.0, | |
| help="Seconds to wait per container for HTTP readiness (0 = do not wait).", | |
| ) | |
| parser.add_argument( | |
| "--ready-interval", | |
| type=float, | |
| default=2.0, | |
| help="Seconds between readiness polls.", | |
| ) | |
| parser.add_argument( | |
| "--ready-host", | |
| default="127.0.0.1", | |
| help="Host to probe (published container port on this machine).", | |
| ) | |
| parser.add_argument( | |
| "--ready-path", | |
| default="/v1/models", | |
| help="GET path; OpenAI-compatible stacks usually serve this when the API is up.", | |
| ) | |
| args = parser.parse_args() | |
| hf_token = os.environ.get("HF_TOKEN", "") | |
| hf_cache = os.path.expanduser("~/.cache/huggingface") | |
| container_ids: list[str] = [] | |
| stop = threading.Event() | |
| def handle_stop(_signum: int, _frame) -> None: | |
| stop.set() | |
| signal.signal(signal.SIGINT, handle_stop) | |
| signal.signal(signal.SIGTERM, handle_stop) | |
| try: | |
| for i in range(4): | |
| port_num = 8000 + i | |
| cmd: list[str] = [ | |
| "docker", | |
| "run", | |
| "-d", | |
| "--rm", | |
| "-p", | |
| f"{port_num}:{port_num}", | |
| "--device", | |
| "/dev/rngd:/dev/rngd", | |
| "--security-opt", | |
| "seccomp=unconfined", | |
| "--env", | |
| f"HF_TOKEN={hf_token}", | |
| "-v", | |
| f"{hf_cache}:/root/.cache/huggingface", | |
| args.image, | |
| "serve", | |
| args.model, | |
| "--port", | |
| str(port_num), | |
| "--device", | |
| f"npu:{i}", | |
| ] | |
| print(f"Starting instance {i} (npu:{i}) -> http://127.0.0.1:{port_num}/ ...") | |
| proc = subprocess.run(cmd, capture_output=True, text=True) | |
| if proc.returncode != 0: | |
| print(proc.stderr or proc.stdout, file=sys.stderr) | |
| return proc.returncode | |
| cid = (proc.stdout or "").strip() | |
| if cid: | |
| container_ids.append(cid) | |
| if args.ready_timeout > 0 and cid: | |
| print( | |
| f" Waiting for HTTP ready on " | |
| f"http://{args.ready_host}:{port_num}{args.ready_path} " | |
| f"(up to {args.ready_timeout:.0f}s)…", | |
| flush=True, | |
| ) | |
| ok = _wait_http_ready( | |
| args.ready_host, | |
| port_num, | |
| args.ready_path, | |
| timeout_sec=args.ready_timeout, | |
| interval=args.ready_interval, | |
| ) | |
| if not ok: | |
| print( | |
| f" Timed out waiting for instance {i} on port {port_num}.", | |
| file=sys.stderr, | |
| ) | |
| return 1 | |
| rows: list[tuple[int, str, int | None]] = [] | |
| for i, cid in enumerate(container_ids): | |
| pid = _wait_container_pid(cid) | |
| rows.append((i, cid, pid)) | |
| print("Container IDs:", " ".join(container_ids)) | |
| for i, cid, pid in rows: | |
| pid_s = str(pid) if pid is not None else "?" | |
| print(f" instance {i}: container={cid[:12]}… host_pid={pid_s}") | |
| print("Running — Ctrl-C (or SIGTERM) to stop all containers.") | |
| stop.wait() | |
| return 0 | |
| finally: | |
| if container_ids: | |
| print("Stopping containers…") | |
| _docker_kill_all(container_ids) | |
# Script entry point: exit the interpreter with main()'s return value as status.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment