import time

from vllm import LLM, SamplingParams
import lmdeploy
from lmdeploy import GenerationConfig
# Test prompts covering common real-world language use cases
PROMPTS = [
    "Explain quantum computing in simple terms.",
    "What is the future of renewable energy?",
    "Describe the Great Wall of China.",
    "Why is the sky blue?",
    "How do you make sourdough bread?",
    "Summarize the history of the Roman Empire.",
    "What is the Pythagorean theorem?",
    "Describe a black hole.",
    "Who was Ada Lovelace?",
    "What are some uses of machine learning?",
    "Tell me a joke.",
    "What is a blockchain?",
    "What are the benefits of yoga?",
    "Explain the greenhouse effect.",
    "What causes inflation?",
    "Describe the process of photosynthesis.",
    "Who was Albert Einstein?",
    "What is quantum entanglement?",
    "Why do cats purr?",
    "What is climate change?"
]
def benchmark_vllm(do_sample=False):
    print("\nBenchmarking vLLM...")
    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

    # vLLM uses temperature to decide between sampling and greedy decoding
    temperature = 0.8 if do_sample else 0.0
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=512,
    )

    start = time.time()
    outputs = llm.generate(PROMPTS, sampling_params)
    end = time.time()

    # Rough throughput estimate: whitespace-split word count as a proxy for tokens
    total_output_tokens = sum(len(output.outputs[0].text.split()) for output in outputs)
    elapsed = end - start
    print(f"vLLM → {total_output_tokens} tokens in {elapsed:.2f}s → {total_output_tokens / elapsed:.2f} tokens/sec")
def benchmark_lmdeploy(do_sample=False):
    print("\nBenchmarking LMDeploy (TurboMind)...")
    pipe = lmdeploy.pipeline("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

    # LMDeploy takes generation settings via a GenerationConfig object.
    # Note: do_sample requires a recent LMDeploy release; on older versions,
    # drop it and rely on temperature alone to control sampling vs. greedy.
    gen_config = GenerationConfig(
        do_sample=do_sample,
        temperature=0.8 if do_sample else 0.0,
        top_p=0.9,
        max_new_tokens=512,
    )

    start = time.time()
    responses = pipe(PROMPTS, gen_config=gen_config)
    end = time.time()

    # Responses may come back as a single object or a list; normalize to a list
    all_outputs = responses if isinstance(responses, list) else [responses]

    # Rough throughput estimate: whitespace-split word count as a proxy for tokens
    total_output_tokens = 0
    for r in all_outputs:
        if hasattr(r, "text"):
            total_output_tokens += len(r.text.split())
        else:
            total_output_tokens += len(str(r).split())

    elapsed = end - start
    print(f"LMDeploy → {total_output_tokens} tokens in {elapsed:.2f}s → {total_output_tokens / elapsed:.2f} tokens/sec")
if __name__ == "__main__":
    benchmark_vllm(do_sample=False)
    benchmark_lmdeploy(do_sample=True)