Created May 23, 2025 23:06
Benchmark local GGUF and MLX models
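The script below sends five code-generation prompts to each configured local endpoint (LM Studio on port 1234 serving both GGUF and MLX builds of Devstral Small, and Ollama on port 11434 serving devstral:24b) and prints per-prompt latency and tokens per second, followed by per-model averages.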
import time

import requests

prompts = [
    "Implement an LRU (Least Recently Used) cache in Python as a single .py file. It must support get and put operations in O(1) time complexity. Include test code at the bottom.",
    "Write a complete single-file Python implementation of a multi-threaded producer-consumer system using threading and a shared queue. Include comments and a short main() function that demonstrates functionality.",
    "Design and fully implement a rate limiter in Python that allows up to 100 requests per 10-second window per user. Use in-memory data structures and implement it in a single .py file with example usage.",
    "Implement a recursive descent parser in Python in one file. The parser should handle basic arithmetic expressions with +, -, *, /, and parentheses. Include both the lexer and parser logic in the same file, and provide example input/output.",
    "Write a single Python script that simulates a simplified Git CLI. It should support commands: init, add, commit, and log. Store all metadata in a `.mygit` folder in the working directory. No external dependencies allowed."
]

# model<>server config
models = [
    {
        "name": "LMStudio - devstral GGUF",
        "url": "http://localhost:1234/v1/chat/completions",
        "model": "devstral-small-2505"
    },
    {
        "name": "LMStudio - devstral MLX",
        "url": "http://localhost:1234/v1/chat/completions",
        "model": "devstral-small-2505-mlx"
    },
    {
        "name": "Ollama - devstral:24b GGUF",
        "url": "http://localhost:11434/v1/chat/completions",
        "model": "devstral:24b"
    }
]

for config in models:
    print(f"\n=== Benchmarking {config['name']} ===")
    ttft_list = []
    tps_list = []

    for prompt in prompts:
        print(f"\n▶ Prompt: {prompt}")
        updated_prompt = f"{prompt}. Give me just the code, nothing else."
        payload = {
            "model": config["model"],
            "messages": [{"role": "user", "content": updated_prompt}],
            "temperature": 0.7,
            "stream": False
        }
        headers = {"Content-Type": "application/json"}

        start_time = time.time()
        try:
            response = requests.post(config["url"], headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
        except Exception as e:
            print(f"❌ Request failed: {e}")
            continue

        try:
            output = result["choices"][0]["message"]["content"]  # generated code (not used further)
            tokens = result["usage"]["completion_tokens"]
        except (KeyError, IndexError):
            print(f"❌ Failed to parse response: {result}")
            continue

        end_time = time.time()
        # Note: with stream=False this measures total request latency, not true time-to-first-token
        ttft = end_time - start_time
        tps = tokens / ttft if ttft > 0 else 0
        ttft_list.append(ttft)
        tps_list.append(tps)

        print(f"🕒 TTFT: {ttft:.2f}s | ✍️ Tokens: {tokens} | ⚡ TPS: {tps:.2f}")

    if ttft_list and tps_list:
        avg_ttft = sum(ttft_list) / len(ttft_list)
        avg_tps = sum(tps_list) / len(tps_list)
        print(f"\n📊 {config['name']} Summary")
        print(f"Average TTFT: {avg_ttft:.2f} seconds")
        print(f"Average TPS: {avg_tps:.2f}")
    else:
        print(f"\n⚠️ No successful responses for {config['name']}")

    # pause between benchmarks
    print("⏳ Sleeping 5 seconds before next model...\n")
    time.sleep(5)
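Because the requests above set stream=False, the reported TTFT is really total request latency. Below is a minimal sketch of a streaming variant that timestamps the first content chunk; the measure_ttft helper name and the one-token-per-chunk approximation are assumptions, not part of the original script. LM Studio and Ollama both expose OpenAI-compatible SSE streaming, but servers may batch several tokens into a single event.

import json
import time

import requests

def measure_ttft(url, model, prompt):
    """Rough streaming benchmark: returns (ttft_seconds, tokens_per_second).

    Assumes an OpenAI-compatible /v1/chat/completions endpoint that emits
    SSE lines of the form "data: {...}" terminated by "data: [DONE]".
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "stream": True,  # stream so the first token can be timestamped
    }
    start = time.time()
    first_token_time = None
    chunks = 0  # proxy for token count; servers may batch tokens per event
    with requests.post(url, json=payload, stream=True) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line or not line.startswith(b"data: "):
                continue
            data = line[len(b"data: "):]
            if data == b"[DONE]":
                break
            choices = json.loads(data).get("choices") or []
            if choices and choices[0].get("delta", {}).get("content"):
                if first_token_time is None:
                    first_token_time = time.time()
                chunks += 1
    end = time.time()
    if first_token_time is None:
        return end - start, 0.0
    gen_time = end - first_token_time
    return first_token_time - start, (chunks / gen_time if gen_time > 0 else 0.0)

Dropping something like this into the per-prompt loop would let latency-to-first-token and decode throughput be reported separately rather than conflated into one number.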