import time
from vllm import LLM, SamplingParams
import lmdeploy
# Test prompts for real-world language use cases
PROMPTS = [
    "Explain quantum computing in simple terms.",
    "What is the future of renewable energy?",
    "Describe the Great Wall of China.",
    "Why is the sky blue?",
    "How do you make sourdough bread?",
    "Summarize the history of the Roman Empire.",
    "What is the Pythagorean theorem?",
    "Describe a black hole.",
    "Who was Ada Lovelace?",
    "What are some uses of machine learning?",
    "Tell me a joke.",
    "What is a blockchain?",
    "What are the benefits of yoga?",
    "Explain the greenhouse effect.",
    "What causes inflation?",
    "Describe the process of photosynthesis.",
    "Who was Albert Einstein?",
    "What is quantum entanglement?",
    "Why do cats purr?",
    "What is climate change?",
]

def benchmark_vllm(do_sample=False):
    print("\nBenchmarking vLLM...")
    llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
    # vLLM switches between sampling and greedy decoding via temperature:
    # temperature=0.0 means greedy search.
    temperature = 0.8 if do_sample else 0.0
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.9,
        max_tokens=512,
    )
    start = time.time()
    outputs = llm.generate(PROMPTS, sampling_params)
    end = time.time()
    # Whitespace word count is only a rough proxy for the true token count.
    total_output_tokens = sum(len(output.outputs[0].text.split()) for output in outputs)
    elapsed = end - start
    print(f"vLLM → {total_output_tokens} tokens in {elapsed:.2f}s → {total_output_tokens / elapsed:.2f} tokens/sec")

def benchmark_lmdeploy(do_sample=False):
    print("\nBenchmarking LMDeploy (TurboMind)...")
    with lmdeploy.pipeline("deepseek-ai/DeepSeek-R1-Distill-Llama-8B") as pipe:
        # LMDeploy takes generation options through GenerationConfig rather
        # than raw keyword arguments; do_sample needs a recent release.
        gen_config = lmdeploy.GenerationConfig(
            do_sample=do_sample,
            temperature=0.8 if do_sample else 0.0,
            top_p=0.9,
            max_new_tokens=512,
        )
        start = time.time()
        responses = pipe(PROMPTS, gen_config=gen_config)
        end = time.time()
    # Normalize to a list and extract the generated text from each response.
    all_outputs = responses if isinstance(responses, list) else [responses]
    total_output_tokens = 0
    for r in all_outputs:
        if hasattr(r, "text"):
            total_output_tokens += len(r.text.split())
        else:
            total_output_tokens += len(str(r).split())
    elapsed = end - start
    print(f"LMDeploy → {total_output_tokens} tokens in {elapsed:.2f}s → {total_output_tokens / elapsed:.2f} tokens/sec")

if __name__ == "__main__":
    # Use the same decoding mode for both engines (greedy here) so the
    # throughput numbers are directly comparable.
    benchmark_vllm(do_sample=False)
    benchmark_lmdeploy(do_sample=False)
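
# To run this benchmark (a sketch; exact versions and CUDA setup matter):
#   pip install vllm lmdeploy
#   python benchmark.py   # or whatever this file is saved as
# Both engines reserve most of the GPU by default (vLLM's
# gpu_memory_utilization defaults to 0.9), so on a single GPU it is safer
# to run each engine in a separate process, e.g. by commenting out one of
# the calls above per run.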