Aaronontheweb · May 28, 2026 19:16
diff --git a/90-dgx-rdma-direct.yaml b/90-dgx-rdma-direct.yaml
 # DGX Spark -- RDMA direct-link network config.
 #
 # Deploy to /etc/netplan/90-dgx-rdma-direct.yaml on spark-362c, then:
 #   sudo chmod 600 /etc/netplan/90-dgx-rdma-direct.yaml
 #   sudo netplan generate
 #   sudo netplan apply
 #
 # This brings up the 200GbE ConnectX-7 DAC direct link to spark-acad on an
 # isolated /24 (192.168.200.0/24). Without this file the link config is lost
 # on reboot (a plain `ip addr add` does not persist).
 #
 # GOTCHA: the RDMA device name (rocep1s0f1) is NOT the Linux interface name.
 # `ip`/netplan use the Linux name `enp1s0f1np1`. The cable is on lane 1.
 # See services/dgx-spark/README.md for the full reference.

 network:
  version: 2
  ethernets:
    # Linux interface name of the direct-link port (not the RDMA device name).
    enp1s0f1np1:
      # Static addressing only on this isolated direct link.
      dhcp4: false
      dhcp6: false
      addresses:
        - 192.168.200.10/24
      # Jumbo frames. NOTE(review): the peer end presumably needs a matching
      # MTU -- confirm.
      mtu: 9000
      # The peer is another host on a direct cable and may be powered off or
      # unplugged at boot; mark the link optional so systemd-networkd does not
      # hold up boot waiting for it to come online. (Only honoured by the
      # networkd renderer.)
      optional: true
diff --git a/docker-compose.yml b/docker-compose.yml
 # Top-level Compose project name.
 name: dgx-spark

 # DGX Spark `spark` -- Qwen3.6-35B-A3B-FP8 (MoE) vLLM serving.
 #
 # Declarative translation of the `docker run` command from the Memorizer
 # runbook "DGX Spark: how to launch Qwen3.6-35B-A3B-FP8 (MoE) on spark".
 # A single-container compose project is the same spec as `docker run` -- it
 # does NOT reintroduce sparkrun or any orchestrator, which were deliberately
 # rejected for this 2-node, single-model-per-node workload.
 #
 # This is the MoE high-throughput endpoint (3B active params/token). Callers
 # that want speed over reasoning depth should pass
 # `chat_template_kwargs: {"enable_thinking": false}` in the request body.
 #
 # IMAGE TAG: `dgx-vllm-eugr-nightly:latest` is intentionally a moving nightly
 # tag. The whole point of this deployment was a current vLLM nightly, which
 # fixes the MTP=2 crashes seen on older pinned images. `latest` will NOT
 # auto-refresh on `up -d` or host reboot -- use `../update.sh` to pull a newer
 # nightly deliberately. Pinning to an image digest is the future hardening
 # step once a known-good nightly is identified.

 services:
  # The project's only service: `vllm serve` for Qwen/Qwen3.6-35B-A3B-FP8.
  vllm:
    image: ghcr.io/spark-arena/dgx-vllm-eugr-nightly:latest
    container_name: vllm-qwen36-a3b
    restart: unless-stopped
    # NOTE(review): with `ipc: host` the container presumably uses the host's
    # IPC namespace (including /dev/shm), which would make `shm_size` a no-op
    # kept only for parity with the original `docker run` -- confirm.
    ipc: host
    shm_size: 32gb
    ports:
      # host:container; the container side matches `--port "8000"` below.
      - "8000:8000"
    environment:
      - VLLM_MARLIN_USE_ATOMIC_ADD=1
    volumes:
      # Bind-mounts to the petabridge user's caches. The HuggingFace cache
      # holds the pre-downloaded FP8 weights (~36 GB); the vLLM cache holds
      # the torch.compile / cudagraph cache that keeps restarts fast.
      # docker compose does NOT expand `~`, so ${HOME} is used explicitly.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      - ${HOME}/.cache/vllm:/root/.cache/vllm
    # No deploy.resources.limits.memory: these are single-tenant inference
    # boxes where GPU / 128 GB unified memory is the real constraint, not a
    # container cgroup limit. Consistent with services/vllm/docker-compose.yml.
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Container command as an argument list: one token per item, so each flag
    # and its value are adjacent entries and must stay in this order.
    # Numeric values are quoted so YAML passes them as strings.
    command:
      - vllm
      - serve
      - Qwen/Qwen3.6-35B-A3B-FP8
      # Listen on every interface inside the container, on the published port.
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --max-model-len
      - "262144"
      - --max-num-batched-tokens
      - "32768"
      - --trust-remote-code
      - --gpu-memory-utilization
      - "0.8"
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_coder
      - --reasoning-parser
      - qwen3
      - --kv-cache-dtype
      - fp8
      - --load-format
      - instanttensor
      - --attention-backend
      - flashinfer
      # Native Qwen3.6 multi-token-prediction speculative decoding (MTP=2).
      # MTP heads share weights with the base model -- no separate drafter.
      #
      # TEMPORARILY DISABLED (2026-05-25, diagnostic).
      #
      # Post-restart of the eugr nightly `0fa888465.d20260517`, this stack
      # showed `vllm:prefix_cache_hits_total=0` across hundreds of identical-
      # prefix probes despite `vllm:prefix_cache_queries_total` incrementing
      # 1:1 with prompt tokens, and `vllm:kv_cache_usage_perc=0`. Caching is
      # effectively non-functional under (MoE + MTP + --enable-prefix-caching).
      # Suspect upstream vllm#43559 (filed 2026-05-25): "Accuracy drops ~20%
      # when --enable-prefix-caching is used together with MTP speculative
      # decoding (Qwen3.6 35B-A3B)" -- same model family, same flag combo,
      # core dev `zack041` replicated and suspects GDN/mamba cache-boundary
      # corruption. Restoring MTP is gated on that issue resolving or on a
      # known-good newer nightly being pinned.
      # To re-enable, uncomment BOTH lines below together (flag + its value).
      # - --speculative-config
      # - '{"method": "mtp", "num_speculative_tokens": 2}'
      - --enable-prefix-caching
      # Populates response.usage.prompt_tokens_details.cached_tokens so OpenAI
      # API clients (e.g. Netclaw's eval harness) can read per-request prefix
      # cache hits. Without this flag the field is null and server-side
      # Prometheus /metrics is the only ground truth. The V1-engine bug that
      # used to break this field (vllm#16162) was fixed by vllm#18149 in
      # 2025-05, well before the nightly we run.
      - --enable-prompt-tokens-details
      # NOTE(review): presumably tensor-parallel and pipeline-parallel size 1
      # (one model per node, per the file header) -- confirm.
      - -tp
      - "1"
      - -pp
      - "1"
 # DGX Spark -- RDMA direct-link network config.
 #
 # Deploy to /etc/netplan/90-dgx-rdma-direct.yaml on spark-362c, then:
 #   sudo chmod 600 /etc/netplan/90-dgx-rdma-direct.yaml
 #   sudo netplan generate
 #   sudo netplan apply
 #
 # This brings up the 200GbE ConnectX-7 DAC direct link to spark-acad on an
 # isolated /24 (192.168.200.0/24). Without this file the link config is lost
 # on reboot (a plain `ip addr add` does not persist).
 #
 # GOTCHA: the RDMA device name (rocep1s0f1) is NOT the Linux interface name.
 # `ip`/netplan use the Linux name `enp1s0f1np1`. The cable is on lane 1.
 # See services/dgx-spark/README.md for the full reference.

 network:
  version: 2
  ethernets:
    # Linux interface name of the direct-link port (not the RDMA device name).
    enp1s0f1np1:
      # Static addressing only on this isolated direct link.
      dhcp4: false
      dhcp6: false
      addresses:
        - 192.168.200.10/24
      # Jumbo frames. NOTE(review): the peer end presumably needs a matching
      # MTU -- confirm.
      mtu: 9000
      # The peer is another host on a direct cable and may be powered off or
      # unplugged at boot; mark the link optional so systemd-networkd does not
      # hold up boot waiting for it to come online. (Only honoured by the
      # networkd renderer.)
      optional: true
 # docker-compose.yml: top-level Compose project name.
 name: dgx-spark

 # DGX Spark `spark` -- Qwen3.6-35B-A3B-FP8 (MoE) vLLM serving.
 #
 # Declarative translation of the `docker run` command from the Memorizer
 # runbook "DGX Spark: how to launch Qwen3.6-35B-A3B-FP8 (MoE) on spark".
 # A single-container compose project is the same spec as `docker run` -- it
 # does NOT reintroduce sparkrun or any orchestrator, which were deliberately
 # rejected for this 2-node, single-model-per-node workload.
 #
 # This is the MoE high-throughput endpoint (3B active params/token). Callers
 # that want speed over reasoning depth should pass
 # `chat_template_kwargs: {"enable_thinking": false}` in the request body.
 #
 # IMAGE TAG: `dgx-vllm-eugr-nightly:latest` is intentionally a moving nightly
 # tag. The whole point of this deployment was a current vLLM nightly, which
 # fixes the MTP=2 crashes seen on older pinned images. `latest` will NOT
 # auto-refresh on `up -d` or host reboot -- use `../update.sh` to pull a newer
 # nightly deliberately. Pinning to an image digest is the future hardening
 # step once a known-good nightly is identified.

 services:
  # The project's only service: `vllm serve` for Qwen/Qwen3.6-35B-A3B-FP8.
  vllm:
    image: ghcr.io/spark-arena/dgx-vllm-eugr-nightly:latest
    container_name: vllm-qwen36-a3b
    restart: unless-stopped
    # NOTE(review): with `ipc: host` the container presumably uses the host's
    # IPC namespace (including /dev/shm), which would make `shm_size` a no-op
    # kept only for parity with the original `docker run` -- confirm.
    ipc: host
    shm_size: 32gb
    ports:
      # host:container; the container side matches `--port "8000"` below.
      - "8000:8000"
    environment:
      - VLLM_MARLIN_USE_ATOMIC_ADD=1
    volumes:
      # Bind-mounts to the petabridge user's caches. The HuggingFace cache
      # holds the pre-downloaded FP8 weights (~36 GB); the vLLM cache holds
      # the torch.compile / cudagraph cache that keeps restarts fast.
      # docker compose does NOT expand `~`, so ${HOME} is used explicitly.
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      - ${HOME}/.cache/vllm:/root/.cache/vllm
    # No deploy.resources.limits.memory: these are single-tenant inference
    # boxes where GPU / 128 GB unified memory is the real constraint, not a
    # container cgroup limit. Consistent with services/vllm/docker-compose.yml.
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Container command as an argument list: one token per item, so each flag
    # and its value are adjacent entries and must stay in this order.
    # Numeric values are quoted so YAML passes them as strings.
    command:
      - vllm
      - serve
      - Qwen/Qwen3.6-35B-A3B-FP8
      # Listen on every interface inside the container, on the published port.
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --max-model-len
      - "262144"
      - --max-num-batched-tokens
      - "32768"
      - --trust-remote-code
      - --gpu-memory-utilization
      - "0.8"
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_coder
      - --reasoning-parser
      - qwen3
      - --kv-cache-dtype
      - fp8
      - --load-format
      - instanttensor
      - --attention-backend
      - flashinfer
      # Native Qwen3.6 multi-token-prediction speculative decoding (MTP=2).
      # MTP heads share weights with the base model -- no separate drafter.
      #
      # TEMPORARILY DISABLED (2026-05-25, diagnostic).
      #
      # Post-restart of the eugr nightly `0fa888465.d20260517`, this stack
      # showed `vllm:prefix_cache_hits_total=0` across hundreds of identical-
      # prefix probes despite `vllm:prefix_cache_queries_total` incrementing
      # 1:1 with prompt tokens, and `vllm:kv_cache_usage_perc=0`. Caching is
      # effectively non-functional under (MoE + MTP + --enable-prefix-caching).
      # Suspect upstream vllm#43559 (filed 2026-05-25): "Accuracy drops ~20%
      # when --enable-prefix-caching is used together with MTP speculative
      # decoding (Qwen3.6 35B-A3B)" -- same model family, same flag combo,
      # core dev `zack041` replicated and suspects GDN/mamba cache-boundary
      # corruption. Restoring MTP is gated on that issue resolving or on a
      # known-good newer nightly being pinned.
      # To re-enable, uncomment BOTH lines below together (flag + its value).
      # - --speculative-config
      # - '{"method": "mtp", "num_speculative_tokens": 2}'
      - --enable-prefix-caching
      # Populates response.usage.prompt_tokens_details.cached_tokens so OpenAI
      # API clients (e.g. Netclaw's eval harness) can read per-request prefix
      # cache hits. Without this flag the field is null and server-side
      # Prometheus /metrics is the only ground truth. The V1-engine bug that
      # used to break this field (vllm#16162) was fixed by vllm#18149 in
      # 2025-05, well before the nightly we run.
      - --enable-prompt-tokens-details
      # NOTE(review): presumably tensor-parallel and pipeline-parallel size 1
      # (one model per node, per the file header) -- confirm.
      - -tp
      - "1"
      - -pp
      - "1"