Created
June 1, 2026 03:41
-
-
Save 17twenty/b3f55d88fc9eb3b873463d5faedf560b to your computer and use it in GitHub Desktop.
Benchmark your vLLM-powered models: measure throughput versus concurrency to see how many simultaneous streams your hardware can support.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Measure decode throughput at N concurrent streams, no offload.

Configuration comes from two environment variables:

* ``NUSERS`` -- number of prompts submitted in one batch (default 10).
* ``GENTOK`` -- tokens generated per prompt (default 200).

The script times one batched ``generate`` call, prints aggregate and
per-user tokens/second, then times a single prompt for comparison.
The timed region wraps the whole ``generate`` call, so it includes the
prefill of the (short) prompts as well as decoding.
"""
import os
import time

# setdefault() keeps any value the caller already exported.  These run at
# module import time, i.e. before main() imports vllm below.
os.environ.setdefault("HF_HOME", "/root/deepseek/hf-cache")
os.environ.setdefault("HF_HUB_OFFLINE", "1")
os.environ.setdefault("VLLM_DEEPSEEK_V4_FALLBACK", "1")
os.environ.setdefault("VLLM_USE_DEEP_GEMM", "0")
os.environ.setdefault("VLLM_USE_FLASHINFER_SAMPLER", "0")

PROMPT_TEMPLATE = (
    "Write a short paragraph about the history and significance of the "
    "city of {}."
)
CITIES = (
    "Paris", "Tokyo", "Cairo", "Lima", "Oslo", "Delhi", "Rome", "Seoul",
    "Accra", "Quito", "Bern", "Lagos", "Hanoi", "Kyiv", "Doha", "Riga",
)


def positive_int_env(name: str, default: int) -> int:
    """Return environment variable *name* as an integer >= 1.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is not set.

    Raises:
        ValueError: If the value is not an integer or is smaller than 1.
    """
    raw = os.environ.get(name, str(default))
    try:
        value = int(raw)
    except ValueError:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from None
    if value < 1:
        raise ValueError(f"{name} must be >= 1, got {value}")
    return value


def build_prompts(n: int) -> list[str]:
    """Return exactly *n* prompts, cycling through CITIES when n exceeds it.

    Slicing the city list would silently cap the batch at len(CITIES)
    prompts while the report still claimed *n* users.
    """
    return [PROMPT_TEMPLATE.format(CITIES[i % len(CITIES)]) for i in range(n)]


def main() -> None:
    """Run the batched and single-stream benchmarks and print the results."""
    # Imported here so it happens after the environment defaults above are
    # set, and so importing this module does not require vllm.
    from vllm import LLM, SamplingParams

    n_users = positive_int_env("NUSERS", 10)
    gen_tokens = positive_int_env("GENTOK", 200)

    llm = LLM(
        model="deepseek-ai/DeepSeek-V4-Flash",
        tensor_parallel_size=4,
        enforce_eager=True,
        trust_remote_code=True,
        max_model_len=32768,
        # Never below 16, and large enough to admit every prompt at once.
        max_num_seqs=max(n_users, 16),
        max_num_batched_tokens=2048,
        gpu_memory_utilization=0.95,
        kv_cache_dtype="fp8_ds_mla",
    )

    prompts = build_prompts(n_users)
    # ignore_eos=True so every request is asked for the full max_tokens
    # instead of stopping early at an end-of-sequence token.
    sampling = SamplingParams(
        temperature=0.0, max_tokens=gen_tokens, ignore_eos=True
    )

    # warm up 1 step (1 short req) to exclude first-call JIT from timing
    llm.generate(["hello"], SamplingParams(temperature=0.0, max_tokens=4))

    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling)
    elapsed = time.perf_counter() - start
    total_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)

    print(f"\n=== {n_users} concurrent users, {gen_tokens} tok each, no offload ===")
    print(f"wall: {elapsed:.1f}s total output tokens: {total_tokens}")
    print(f"AGGREGATE throughput: {total_tokens/elapsed:.1f} tok/s")
    print(f"PER-USER throughput: {total_tokens/elapsed/len(prompts):.1f} tok/s/user")

    # single-stream for comparison
    start = time.perf_counter()
    single = llm.generate([prompts[0]], sampling)
    single_elapsed = time.perf_counter() - start
    single_tokens = len(single[0].outputs[0].token_ids)
    print(f"\nSINGLE stream: {single_tokens/single_elapsed:.1f} tok/s")
    print("BENCH_DONE")


# Guard the entry point: multi-process engines may re-import the main module
# in worker processes, which must not start a second benchmark run.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment