zeryx zeryx

Bug: Mamba prefix caching + MTP speculative decoding crashes on startup

vLLM version: 0.19.0 (official Docker image vllm/vllm-openai:v0.19.0)
GPU: NVIDIA B200 (178 GB VRAM), tested TP=1 through TP=8
Model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 (NemotronH hybrid Mamba2-Transformer MoE)
Python: 3.12

	#!/usr/bin/env bash
	# NVFP4 SM12x Validation Script
	# Tests CUTLASS FP4 kernels and MoE backend selection on Blackwell (SM120/SM121)
	# Usage: docker run --gpus all --privileged --rm -v /path/to/this:/test ghcr.io/zeryx/vllm:nvfp4-sm120-f59929f59 bash /test/test_nvfp4_sm12x.sh
	# Or run directly in a vLLM environment built with TORCH_CUDA_ARCH_LIST="12.0"

	# Abort on any failing command, on use of an unset variable, and on a failure
	# anywhere inside a pipeline.
	set -euo pipefail

	# ANSI escape sequences used to color terminal output.
	RED='\033[0;31m'
	GREEN='\033[0;32m'

	"""Test prefix caching with /v1/completions (raw text, no chat template).

	Sends 3 identical long prompts, scrapes /metrics before and after each one,
	and reports the per-request prefix_cache counter delta. If hits start
	incrementing from request 2 onward, the cache works for raw text and the
	chat template is the source of variance. If hits stay at 0 across all 3
	requests, the bug is deeper.
	"""
	import json, urllib.request, time

	PORT = 8000

	"""Send N identical requests with sys=1000, scrape /metrics before/after
	each request, and report per-request cache hit delta."""
	import json, urllib.request, time

	PORT = 8000
	MODEL = "/models/nano-30b-nvfp4"
	BASE = f"http://localhost:{PORT}"

	# ~1000-token shared system prompt
	SYS = ("You are a helpful assistant. Please provide detailed technical answers "

	import json, urllib.request

	URL = "http://localhost:8000/v1/chat/completions"
	MODEL = "/models/nano-30b-nvfp4"

	def trial(label, sys_repeat, n=3):
	sys_text = ("You are a helpful assistant. Please provide detailed technical answers. " * sys_repeat).strip()
	payload = {
	"model": MODEL,
	"messages": [

	from Algorithmia import ADK
	import joblib


	## This function uses the model manifest `state` (or `modelData`) object to fetch the model files defined in the model manifest automatically.
	## No extra client code is required; just make sure the name passed to `get_model` matches the name in your model manifest.
	def load(state):
	    """Populate `state` with the deserialized model artifacts and return it.

	    For each artifact name, the path is resolved with
	    ``state.get_model(<name>)``, the file is loaded with ``joblib.load``,
	    and the result is stored in `state` under that same name.
	    """
	    for artifact in ("model", "vectorizer"):
	        state[artifact] = joblib.load(state.get_model(artifact))
	    return state

	import Algorithmia
	from time import time
	import pickle
	from src.data import data

	# Module-level Algorithmia client plus the location and file name of the
	# pickled model in the Data API.
	client = Algorithmia.client()
	DATA_MODEL_DIR = "data://.my/example"
	MODEL_NAME = "example.pkl"
	# NOTE(review): TIME_0 and LAST_MODIFIED start out as 0 / empty string; they
	# presumably track when the model was last loaded or modified -- confirm
	# against the code that updates them (not visible in this excerpt).
	TIME_0 = 0
	LAST_MODIFIED = ""

	from Algorithmia import ADK
	import Algorithmia
	from time import sleep, time

	# Data API paths for the shared resource file and the lock file.
	# NOTE(review): presumably consumed by AlgorithmiaLock below -- confirm, the
	# class body is not visible in this excerpt.
	state_file_path = "data://.my/locking/resource.json"
	lock_file_path = "data://.my/locking/lock"
	client = Algorithmia.client()


	class AlgorithmiaLock(object):

	import Algorithmia
	import pandas as pd
	client = Algorithmia.client()



	# Loads the JSON file referenced by `input` through the Algorithmia client
	# and builds a pandas DataFrame from the parsed content.
	# NOTE(review): `input` is presumably a data URI string -- confirm against
	# callers. The `...` lines stand in for the rest of the body, which is not
	# shown in this excerpt.
	def apply(input):
	input_dataframe = pd.DataFrame.from_dict(client.file(input).getJson())
	...
	...


	from transformers import AutoTokenizer, AutoModelForCausalLM
	from transformers import AdamW
	from random import choice
	from torch.nn import functional as F
	import torch

	# Load the pretrained "gpt2" tokenizer and causal language model.
	# The model is moved to the 'cuda' device, so this needs a CUDA-capable GPU.
	tokenizer = AutoTokenizer.from_pretrained("gpt2")
	model = AutoModelForCausalLM.from_pretrained("gpt2").to('cuda')