Created
May 28, 2026 19:16
-
-
Save Aaronontheweb/c91c72362dd431509b726cbc9cd713d9 to your computer and use it in GitHub Desktop.
DGX-Spark vLLM on docker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# DGX Spark -- persistent netplan config for the RDMA direct link.
#
# Install as /etc/netplan/90-dgx-rdma-direct.yaml on spark-362c, then run:
#   sudo chmod 600 /etc/netplan/90-dgx-rdma-direct.yaml
#   sudo netplan generate
#   sudo netplan apply
#
# Brings up the 200GbE ConnectX-7 DAC direct link to spark-acad on an
# isolated /24 (192.168.200.0/24). Without this file the link config is lost
# on reboot, because a plain `ip addr add` does not persist.
#
# GOTCHA: the RDMA device name (rocep1s0f1) is NOT the Linux interface name.
# `ip` and netplan both use the Linux name `enp1s0f1np1`. The cable is on
# lane 1. See services/dgx-spark/README.md for the full reference.
network:
  version: 2
  ethernets:
    # Linux interface name of the direct-link port (not the RDMA device name).
    enp1s0f1np1:
      # 9000-byte MTU (jumbo frames) on the direct link.
      mtu: 9000
      # Single static address on the isolated /24; DHCP is off for both
      # address families.
      addresses: [192.168.200.10/24]
      dhcp4: false
      dhcp6: false
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# DGX Spark `spark` -- vLLM serving Qwen3.6-35B-A3B-FP8 (MoE).
#
# This is a declarative translation of the `docker run` command from the
# Memorizer runbook "DGX Spark: how to launch Qwen3.6-35B-A3B-FP8 (MoE) on
# spark". A single-container compose project is the same spec as
# `docker run` -- it does NOT reintroduce sparkrun or any orchestrator, both
# of which were deliberately rejected for this 2-node,
# single-model-per-node workload.
#
# This is the MoE high-throughput endpoint (3B active params/token). Callers
# that prefer speed over reasoning depth should pass
# `chat_template_kwargs: {"enable_thinking": false}` in the request body.
#
# IMAGE TAG: `dgx-vllm-eugr-nightly:latest` is intentionally a moving nightly
# tag. The whole point of this deployment was a current vLLM nightly, which
# fixes the MTP=2 crashes seen on older pinned images. `latest` will NOT
# auto-refresh on `up -d` or on host reboot -- run `../update.sh` to pull a
# newer nightly deliberately. Pinning to an image digest is the future
# hardening step once a known-good nightly is identified.
name: dgx-spark

services:
  vllm:
    container_name: vllm-qwen36-a3b
    image: ghcr.io/spark-arena/dgx-vllm-eugr-nightly:latest
    restart: unless-stopped
    ipc: host
    shm_size: 32gb
    ports:
      - '8000:8000'
    environment:
      VLLM_MARLIN_USE_ATOMIC_ADD: '1'
    volumes:
      # Bind mounts onto the petabridge user's caches:
      #   - HuggingFace cache: the pre-downloaded FP8 weights (~36 GB).
      #   - vLLM cache: the torch.compile / cudagraph cache that keeps
      #     restarts fast.
      # docker compose does NOT expand `~`, hence the explicit ${HOME}.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      - ${HOME}/.cache/vllm:/root/.cache/vllm
    # deploy.resources.limits.memory is deliberately absent: these are
    # single-tenant inference boxes where GPU / 128 GB unified memory is the
    # real constraint, not a container cgroup limit. This is consistent with
    # services/vllm/docker-compose.yml.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    # Exec-form argv: one list item per token, passed to the container as-is.
    command:
      - vllm
      - serve
      - Qwen/Qwen3.6-35B-A3B-FP8
      - --host
      - 0.0.0.0
      - --port
      - '8000'
      - --max-model-len
      - '262144'
      - --max-num-batched-tokens
      - '32768'
      - --trust-remote-code
      - --gpu-memory-utilization
      - '0.8'
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_coder
      - --reasoning-parser
      - qwen3
      - --kv-cache-dtype
      - fp8
      - --load-format
      - instanttensor
      - --attention-backend
      - flashinfer
      # Native Qwen3.6 multi-token-prediction speculative decoding (MTP=2).
      # The MTP heads share weights with the base model, so there is no
      # separate drafter.
      #
      # TEMPORARILY DISABLED (2026-05-25, diagnostic).
      #
      # After a restart on the eugr nightly `0fa888465.d20260517`, this stack
      # reported `vllm:prefix_cache_hits_total=0` across hundreds of
      # identical-prefix probes, even though
      # `vllm:prefix_cache_queries_total` incremented 1:1 with prompt tokens,
      # and `vllm:kv_cache_usage_perc=0`. Caching is effectively
      # non-functional under (MoE + MTP + --enable-prefix-caching).
      # Suspected cause is upstream vllm#43559 (filed 2026-05-25): "Accuracy
      # drops ~20% when --enable-prefix-caching is used together with MTP
      # speculative decoding (Qwen3.6 35B-A3B)" -- same model family, same
      # flag combo; core dev `zack041` replicated it and suspects GDN/mamba
      # cache-boundary corruption. Restoring MTP is gated on that issue being
      # resolved or on a known-good newer nightly being pinned.
      # - --speculative-config
      # - '{"method": "mtp", "num_speculative_tokens": 2}'
      - --enable-prefix-caching
      # Populates response.usage.prompt_tokens_details.cached_tokens so that
      # OpenAI API clients (e.g. Netclaw's eval harness) can read per-request
      # prefix-cache hits. Without this flag the field is null, leaving the
      # server-side Prometheus /metrics as the only ground truth. The
      # V1-engine bug that used to break this field (vllm#16162) was fixed by
      # vllm#18149 in 2025-05, well before the nightly we run.
      - --enable-prompt-tokens-details
      - -tp
      - '1'
      - -pp
      - '1'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment