Benchmark local GGUF and MLX models
import time

import requests

prompts = [
    "Implement an LRU (Least Recently Used) cache in Python as a single .py file. It must support get and put operations in O(1) time complexity. Include test code at the bottom.",
    "Write a complete single-file Python implementation of a multi-threaded producer-consumer system using threading and a shared queue. Include comments and a short main() function that demonstrates functionality.",
    "Design and fully implement a rate limiter in Python that allows up to 100 requests per 10-second window per user. Use in-memory data structures and implement it in a single .py file with example usage.",
    "Implement a recursive descent parser in Python in one file. The parser should handle basic arithmetic expressions with +, -, *, /, and parentheses. Include both the lexer and parser logic in the same file, and provide example input/output.",
    "Write a single Python script that simulates a simplified Git CLI. It should support commands: init, add, commit, and log. Store all metadata in a `.mygit` folder in the working directory. No external dependencies allowed."
]
# Model/server config: each entry is an OpenAI-compatible chat completions
# endpoint (LM Studio on :1234, Ollama on :11434) plus the model to benchmark.
models = [
    {
        "name": "LMStudio - devstral GGUF",
        "url": "http://localhost:1234/v1/chat/completions",
        "model": "devstral-small-2505"
    },
    {
        "name": "LMStudio - devstral MLX",
        "url": "http://localhost:1234/v1/chat/completions",
        "model": "devstral-small-2505-mlx"
    },
    {
        "name": "Ollama - devstral:24b GGUF",
        "url": "http://localhost:11434/v1/chat/completions",
        "model": "devstral:24b"
    }
]
for config in models:
    print(f"\n=== Benchmarking {config['name']} ===")
    ttft_list = []
    tps_list = []

    for prompt in prompts:
        print(f"\n▶ Prompt: {prompt}")
        updated_prompt = f"{prompt} Give me just the code, nothing else."
        payload = {
            "model": config["model"],
            "messages": [{"role": "user", "content": updated_prompt}],
            "temperature": 0.7,
            "stream": False
        }
        headers = {"Content-Type": "application/json"}

        start_time = time.time()
        try:
            response = requests.post(config["url"], headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
        except Exception as e:
            print(f"❌ Request failed: {e}")
            continue

        try:
            output = result["choices"][0]["message"]["content"]  # not scored, kept for inspection
            tokens = result["usage"]["completion_tokens"]
        except Exception:
            print(f"❌ Failed to parse response: {result}")
            continue

        end_time = time.time()
        # With stream=False this measures total request latency rather than true
        # time-to-first-token, so TPS here is completion tokens over total time.
        ttft = end_time - start_time
        tps = tokens / ttft if ttft > 0 else 0
        ttft_list.append(ttft)
        tps_list.append(tps)
        print(f"🕒 TTFT: {ttft:.2f}s | ✍️ Tokens: {tokens} | ⚡ TPS: {tps:.2f}")

    if ttft_list and tps_list:
        avg_ttft = sum(ttft_list) / len(ttft_list)
        avg_tps = sum(tps_list) / len(tps_list)
        print(f"\n📊 {config['name']} Summary")
        print(f"Average TTFT: {avg_ttft:.2f} seconds")
        print(f"Average TPS: {avg_tps:.2f}")
    else:
        print(f"\n⚠️ No successful responses for {config['name']}")

    # pause between benchmarks
    print("⏳ Sleeping 5 seconds before next model...\n")
    time.sleep(5)
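
The script above needs only the `requests` package and the local LM Studio and Ollama servers listening on their default ports. Because it sends `stream: False`, the value it labels TTFT is really total request latency, and TPS is completion tokens divided by that total. Below is a minimal sketch of how true time-to-first-token could be measured instead, assuming the same endpoints accept `stream: true` and emit OpenAI-style SSE lines of the form `data: {...}`; the `measure_streaming_ttft` helper name and the timeout value are my own, not part of the original gist.

import json
import time

import requests

def measure_streaming_ttft(url, model, prompt, timeout=300):
    """Return (ttft_seconds, total_seconds, chunk_count) for one streamed request.

    Assumes an OpenAI-compatible endpoint that streams SSE lines
    'data: {...}' and terminates with 'data: [DONE]'.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "stream": True,
    }
    start = time.time()
    first_token_at = None
    chunks = 0

    with requests.post(url, json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            data = line.decode("utf-8")
            if not data.startswith("data: "):
                continue
            data = data[len("data: "):]
            if data.strip() == "[DONE]":
                break
            delta = json.loads(data)["choices"][0].get("delta", {})
            if delta.get("content"):
                chunks += 1
                if first_token_at is None:
                    # first content chunk marks time-to-first-token
                    first_token_at = time.time()

    end = time.time()
    ttft = (first_token_at or end) - start
    return ttft, end - start, chunks

# Hypothetical usage, reusing the LM Studio endpoint from the config above:
# ttft, total, chunks = measure_streaming_ttft(
#     "http://localhost:1234/v1/chat/completions",
#     "devstral-small-2505",
#     prompts[0],
# )
# print(f"TTFT: {ttft:.2f}s | total: {total:.2f}s | chunks: {chunks}")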