Skip to content

Instantly share code, notes, and snippets.

@17twenty
Created June 1, 2026 03:41
Show Gist options
  • Select an option

  • Save 17twenty/b3f55d88fc9eb3b873463d5faedf560b to your computer and use it in GitHub Desktop.

Select an option

Save 17twenty/b3f55d88fc9eb3b873463d5faedf560b to your computer and use it in GitHub Desktop.
Benchmark your vLLM-powered models: check throughput vs. concurrency to see what your hardware supports.
#!/usr/bin/env python3
import os, time
# Defaults are placed in the environment ahead of the `vllm` import below
# (presumably so the library sees them at import time -- confirm).
# setdefault() keeps any value the caller has already exported.
os.environ.setdefault("HF_HOME", "/root/deepseek/hf-cache")  # Hugging Face cache directory
os.environ.setdefault("HF_HUB_OFFLINE", "1")  # Hugging Face Hub offline mode: use cached files only
# NOTE(review): the meaning of the three VLLM_* switches is not visible in this
# file; they look like workarounds for this model/hardware setup -- confirm
# against the vLLM build in use before changing them.
os.environ.setdefault("VLLM_DEEPSEEK_V4_FALLBACK", "1")
os.environ.setdefault("VLLM_USE_DEEP_GEMM", "0")
os.environ.setdefault("VLLM_USE_FLASHINFER_SAMPLER", "0")
from vllm import LLM, SamplingParams
# Measure decode throughput at N concurrent streams, no offload.
#
# Environment knobs:
#   NUSERS - number of concurrent streams to benchmark (default 10)
#   GENTOK - tokens requested from each stream (default 200)
N = int(os.environ.get("NUSERS", "10"))
GEN = int(os.environ.get("GENTOK", "200"))

BASE_PROMPT = ("Write a short paragraph about the history and significance of the "
               "city of {}.")
CITIES = [
    "Paris", "Tokyo", "Cairo", "Lima", "Oslo", "Delhi", "Rome", "Seoul",
    "Accra", "Quito", "Bern", "Lagos", "Hanoi", "Kyiv", "Doha", "Riga",
]


def build_prompts(n):
    """Return exactly ``n`` benchmark prompts, one per simulated user.

    The city list is cycled, so asking for more prompts than there are
    cities still yields ``n`` prompts (with repeats) instead of silently
    capping the batch at ``len(CITIES)``.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"NUSERS must be >= 1, got {n}")
    return [BASE_PROMPT.format(CITIES[i % len(CITIES)]) for i in range(n)]


def throughput(tokens, seconds, streams=1):
    """Return tokens per second, divided evenly across ``streams``.

    Raises:
        ValueError: if ``seconds`` is not positive or ``streams`` is below 1.
    """
    if seconds <= 0 or streams < 1:
        raise ValueError("seconds must be > 0 and streams must be >= 1")
    return tokens / seconds / streams


def main():
    """Run the N-stream benchmark, then a single-stream run for comparison."""
    # Build (and validate) the prompts first so a bad NUSERS value fails
    # before the model is loaded.
    prompts = build_prompts(N)

    llm = LLM(model="deepseek-ai/DeepSeek-V4-Flash", tensor_parallel_size=4,
              enforce_eager=True, trust_remote_code=True, max_model_len=32768,
              max_num_seqs=max(N, 16), max_num_batched_tokens=2048,
              gpu_memory_utilization=0.95, kv_cache_dtype="fp8_ds_mla")
    # ignore_eos=True so every stream is asked for the full GEN tokens.
    sp = SamplingParams(temperature=0.0, max_tokens=GEN, ignore_eos=True)

    # warm up 1 step (1 short req) to exclude first-call JIT from timing
    llm.generate(["hello"], SamplingParams(temperature=0.0, max_tokens=4))

    # The timed window covers the whole generate() call for the batch.
    t0 = time.perf_counter()
    outs = llm.generate(prompts, sp)
    dt = time.perf_counter() - t0
    out_tok = sum(len(o.outputs[0].token_ids) for o in outs)

    print(f"\n=== {N} concurrent users, {GEN} tok each, no offload ===")
    print(f"wall: {dt:.1f}s total output tokens: {out_tok}")
    print(f"AGGREGATE throughput: {throughput(out_tok, dt):.1f} tok/s")
    # Divide by the number of streams actually submitted.
    print(f"PER-USER throughput: {throughput(out_tok, dt, len(prompts)):.1f} tok/s/user")

    # single-stream for comparison
    t0 = time.perf_counter()
    o1 = llm.generate([prompts[0]], sp)
    dt1 = time.perf_counter() - t0
    n1 = len(o1[0].outputs[0].token_ids)
    print(f"\nSINGLE stream: {throughput(n1, dt1):.1f} tok/s")
    print("BENCH_DONE")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment