import time
from vllm import LLM, SamplingParams
import lmdeploy
# Test prompts for real-world language use cases
PROMPTS = [
    "Explain quantum computing in simple terms.",
    "What is the future of renewable energy?",
    "Describe the Great Wall of China.",
    "Why is the sky blue?",
    "How do you make sourdough bread?",
    "Summarize the history of the Roman Empire.",
    "What is the Pythagorean theorem?",
    "Describe a black hole.",
    "Who was Ada Lovelace?",
    "What are some uses of machine learning?",
    "Tell me a joke.",
    "What is a blockchain?",
    "What are the benefits of yoga?",
    "Explain the greenhouse effect.",
    "What causes inflation?",
    "Describe the process of photosynthesis.",
    "Who was Albert Einstein?",
    "What is quantum entanglement?",
    "Why do cats purr?",
    "What is climate change?",
]

def benchmark_vllm(do_sample=False):
    print("\nBenchmarking vLLM...")
    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
    # vLLM switches between sampling and greedy decoding via temperature:
    # temperature=0.0 means greedy search.
    temperature = 0.8 if do_sample else 0.0
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=512,
    )
    start = time.time()
    outputs = llm.generate(PROMPTS, sampling_params)
    end = time.time()
    # Whitespace word count is only a rough proxy for the true token count.
    total_output_tokens = sum(len(output.outputs[0].text.split()) for output in outputs)
    elapsed = end - start
    print(f"vLLM → {total_output_tokens} tokens in {elapsed:.2f}s → {total_output_tokens / elapsed:.2f} tokens/sec")

def benchmark_lmdeploy(do_sample=False):
    print("\nBenchmarking LMDeploy (TurboMind)...")
    with lmdeploy.pipeline("deepseek-ai/DeepSeek-R1-Distill-Llama-8B") as pipe:
        # LMDeploy takes generation options through GenerationConfig rather
        # than raw keyword arguments; do_sample needs a recent release.
        gen_config = lmdeploy.GenerationConfig(
            do_sample=do_sample,
            temperature=0.8 if do_sample else 0.0,
            top_p=0.9,
            max_new_tokens=512,
        )
        start = time.time()
        responses = pipe(PROMPTS, gen_config=gen_config)
        end = time.time()
    # Normalize to a list and extract the generated text from each response.
    all_outputs = responses if isinstance(responses, list) else [responses]
    total_output_tokens = 0
    for r in all_outputs:
        if hasattr(r, "text"):
            total_output_tokens += len(r.text.split())
        else:
            total_output_tokens += len(str(r).split())
    elapsed = end - start
    print(f"LMDeploy → {total_output_tokens} tokens in {elapsed:.2f}s → {total_output_tokens / elapsed:.2f} tokens/sec")

if __name__ == "__main__":
    # Use the same decoding mode for both engines (greedy here) so the
    # throughput numbers are directly comparable.
    benchmark_vllm(do_sample=False)
    benchmark_lmdeploy(do_sample=False)
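
# To run this benchmark (a sketch; exact versions and CUDA setup matter):
#   pip install vllm lmdeploy
#   python benchmark.py   # or whatever this file is saved as
# Both engines reserve most of the GPU by default (vLLM's
# gpu_memory_utilization defaults to 0.9), so on a single GPU it is safer
# to run each engine in a separate process, e.g. by commenting out one of
# the calls above per run.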