17twenty · June 1, 2026 03:41
diff --git a/benchmark_throughput.py b/benchmark_throughput.py
 #!/usr/bin/env python3
 """Measure decode throughput at N concurrent streams, no offload.

 Configuration is read from the environment:

 * ``NUSERS`` -- number of concurrent prompts submitted in one batch (default 10).
 * ``GENTOK`` -- tokens generated per prompt (default 200).

 The reported numbers are output tokens divided by the wall time of a whole
 ``llm.generate`` call, so prompt processing is part of the measured time.
 A single-prompt run is timed afterwards for comparison.
 """

 import os
 import time

 # Defaults are applied ahead of the vllm import, as in the original ordering
 # (presumably some are read at import time -- TODO confirm). setdefault()
 # leaves values that are already present in the environment untouched.
 os.environ.setdefault("HF_HOME", "/root/deepseek/hf-cache")
 os.environ.setdefault("HF_HUB_OFFLINE", "1")
 os.environ.setdefault("VLLM_DEEPSEEK_V4_FALLBACK", "1")
 os.environ.setdefault("VLLM_USE_DEEP_GEMM", "0")
 os.environ.setdefault("VLLM_USE_FLASHINFER_SAMPLER", "0")
 from vllm import LLM, SamplingParams

 N = int(os.environ.get("NUSERS", "10"))
 GEN = int(os.environ.get("GENTOK", "200"))
 # Reject unusable sizes up front: N < 1 would otherwise only surface after the
 # model load, as an IndexError on prompts[0] or a division by zero below.
 if N < 1 or GEN < 1:
     raise SystemExit(
         f"NUSERS and GENTOK must be >= 1 (got NUSERS={N}, GENTOK={GEN})")

 llm = LLM(model="deepseek-ai/DeepSeek-V4-Flash", tensor_parallel_size=4,
           enforce_eager=True, trust_remote_code=True, max_model_len=32768,
           max_num_seqs=max(N,16), max_num_batched_tokens=2048,
           gpu_memory_utilization=0.95, kv_cache_dtype="fp8_ds_mla")

 base = ("Write a short paragraph about the history and significance of the "
         "city of {}.")
 cities = ["Paris","Tokyo","Cairo","Lima","Oslo","Delhi","Rome","Seoul","Accra","Quito",
           "Bern","Lagos","Hanoi","Kyiv","Doha","Riga"]
 # Cycle through the city list so that exactly N prompts are submitted even
 # when N exceeds the number of cities. Slicing the list capped the batch at
 # 16 streams while the report below still claimed (and divided by) N users.
 prompts = [base.format(cities[i % len(cities)]) for i in range(N)]
 # Greedy sampling; ignore_eos is set so a stream is not cut short by an
 # end-of-sequence token and the per-stream token count stays at GEN.
 sp = SamplingParams(temperature=0.0, max_tokens=GEN, ignore_eos=True)

 # warm up 1 step (1 short req) to exclude first-call JIT from timing
 llm.generate(["hello"], SamplingParams(temperature=0.0, max_tokens=4))

 # Batch run: all N prompts are handed to a single generate() call.
 t0 = time.perf_counter()
 outs = llm.generate(prompts, sp)
 dt = time.perf_counter() - t0
 out_tok = sum(len(o.outputs[0].token_ids) for o in outs)
 print(f"\n=== {N} concurrent users, {GEN} tok each, no offload ===")
 print(f"wall: {dt:.1f}s   total output tokens: {out_tok}")
 print(f"AGGREGATE throughput: {out_tok/dt:.1f} tok/s")
 print(f"PER-USER throughput:  {out_tok/dt/N:.1f} tok/s/user")

 # single-stream for comparison
 t0 = time.perf_counter()
 o1 = llm.generate([prompts[0]], sp)
 dt1 = time.perf_counter() - t0
 n1 = len(o1[0].outputs[0].token_ids)
 print(f"\nSINGLE stream: {n1/dt1:.1f} tok/s")
 # Fixed marker line printed once the benchmark has finished.
 print("BENCH_DONE")
	#!/usr/bin/env python3
	"""Measure decode throughput at N concurrent streams, no offload.

	Environment knobs: ``NUSERS`` (concurrent prompts, default 10) and
	``GENTOK`` (tokens generated per prompt, default 200). Throughput is
	output tokens over the wall time of one ``llm.generate`` call, which
	therefore includes prompt processing. A single-prompt run follows for
	comparison.
	"""

	import os
	import time

	# Applied before the vllm import, matching the original statement order
	# (presumably read at import time -- TODO confirm). Existing environment
	# values take precedence because setdefault() never overwrites.
	os.environ.setdefault("HF_HOME", "/root/deepseek/hf-cache")
	os.environ.setdefault("HF_HUB_OFFLINE", "1")
	os.environ.setdefault("VLLM_DEEPSEEK_V4_FALLBACK", "1")
	os.environ.setdefault("VLLM_USE_DEEP_GEMM", "0")
	os.environ.setdefault("VLLM_USE_FLASHINFER_SAMPLER", "0")
	from vllm import LLM, SamplingParams

	N = int(os.environ.get("NUSERS", "10"))
	GEN = int(os.environ.get("GENTOK", "200"))
	# Fail before the model is loaded: with N < 1 the script would otherwise
	# die late on prompts[0] (IndexError) or on a division by zero.
	if N < 1 or GEN < 1:
	    raise SystemExit(
	        f"NUSERS and GENTOK must be >= 1 (got NUSERS={N}, GENTOK={GEN})")

	llm = LLM(model="deepseek-ai/DeepSeek-V4-Flash", tensor_parallel_size=4,
	          enforce_eager=True, trust_remote_code=True, max_model_len=32768,
	          max_num_seqs=max(N,16), max_num_batched_tokens=2048,
	          gpu_memory_utilization=0.95, kv_cache_dtype="fp8_ds_mla")

	base = ("Write a short paragraph about the history and significance of the "
	        "city of {}.")
	cities = ["Paris","Tokyo","Cairo","Lima","Oslo","Delhi","Rome","Seoul","Accra","Quito",
	          "Bern","Lagos","Hanoi","Kyiv","Doha","Riga"]
	# Wrap around the city list so the batch really contains N prompts. The
	# previous [:N] slice silently limited the batch to 16 streams, while the
	# banner and the per-user figure kept using N.
	prompts = [base.format(cities[i % len(cities)]) for i in range(N)]
	# Greedy sampling; ignore_eos is set so streams are not ended early by an
	# end-of-sequence token and each one produces GEN tokens.
	sp = SamplingParams(temperature=0.0, max_tokens=GEN, ignore_eos=True)

	# warm up 1 step (1 short req) to exclude first-call JIT from timing
	llm.generate(["hello"], SamplingParams(temperature=0.0, max_tokens=4))

	# Batch run: every prompt goes into one generate() call.
	t0 = time.perf_counter()
	outs = llm.generate(prompts, sp)
	dt = time.perf_counter() - t0
	out_tok = sum(len(o.outputs[0].token_ids) for o in outs)
	print(f"\n=== {N} concurrent users, {GEN} tok each, no offload ===")
	print(f"wall: {dt:.1f}s total output tokens: {out_tok}")
	print(f"AGGREGATE throughput: {out_tok/dt:.1f} tok/s")
	print(f"PER-USER throughput: {out_tok/dt/N:.1f} tok/s/user")

	# single-stream for comparison
	t0 = time.perf_counter()
	o1 = llm.generate([prompts[0]], sp)
	dt1 = time.perf_counter() - t0
	n1 = len(o1[0].outputs[0].token_ids)
	print(f"\nSINGLE stream: {n1/dt1:.1f} tok/s")
	# Fixed marker line printed once the benchmark has finished.
	print("BENCH_DONE")
No results found