Benchmark local GGUF and MLX models
import time

import requests

prompts = [
    "Implement an LRU (Least Recently Used) cache in Python as a single .py file. It must support get and put operations in O(1) time complexity. Include test code at the bottom.",
    "Write a complete single-file Python implementation of a multi-threaded producer-consumer system using threading and a shared queue. Include comments and a short main() function that demonstrates functionality.",
    "Design and fully implement a rate limiter in Python that allows up to 100 requests per 10-second window per user. Use in-memory data structures and implement it in a single .py file with example usage.",
    "Implement a recursive descent parser in Python in one file. The parser should handle basic arithmetic expressions with +, -, *, /, and parentheses. Include both the lexer and parser logic in the same file, and provide example input/output.",
    "Write a single Python script that simulates a simplified Git CLI. It should support commands: init, add, commit, and log. Store all metadata in a `.mygit` folder in the working directory. No external dependencies allowed."
]
# Model/server config: each entry is an OpenAI-compatible chat completions
# endpoint (LM Studio on :1234, Ollama on :11434) plus the model to benchmark.
models = [
    {
        "name": "LMStudio - devstral GGUF",
        "url": "http://localhost:1234/v1/chat/completions",
        "model": "devstral-small-2505"
    },
    {
        "name": "LMStudio - devstral MLX",
        "url": "http://localhost:1234/v1/chat/completions",
        "model": "devstral-small-2505-mlx"
    },
    {
        "name": "Ollama - devstral:24b GGUF",
        "url": "http://localhost:11434/v1/chat/completions",
        "model": "devstral:24b"
    }
]
for config in models:
    print(f"\n=== Benchmarking {config['name']} ===")
    ttft_list = []
    tps_list = []

    for prompt in prompts:
        print(f"\n▶ Prompt: {prompt}")
        updated_prompt = f"{prompt} Give me just the code, nothing else."
        payload = {
            "model": config["model"],
            "messages": [{"role": "user", "content": updated_prompt}],
            "temperature": 0.7,
            "stream": False
        }
        headers = {"Content-Type": "application/json"}

        start_time = time.time()
        try:
            response = requests.post(config["url"], headers=headers, json=payload)
            response.raise_for_status()
            result = response.json()
        except Exception as e:
            print(f"❌ Request failed: {e}")
            continue

        try:
            output = result["choices"][0]["message"]["content"]  # not scored, kept for inspection
            tokens = result["usage"]["completion_tokens"]
        except Exception:
            print(f"❌ Failed to parse response: {result}")
            continue

        end_time = time.time()
        # With stream=False this measures total request latency rather than true
        # time-to-first-token, so TPS here is completion tokens over total time.
        ttft = end_time - start_time
        tps = tokens / ttft if ttft > 0 else 0
        ttft_list.append(ttft)
        tps_list.append(tps)
        print(f"🕒 TTFT: {ttft:.2f}s | ✍️ Tokens: {tokens} | ⚡ TPS: {tps:.2f}")

    if ttft_list and tps_list:
        avg_ttft = sum(ttft_list) / len(ttft_list)
        avg_tps = sum(tps_list) / len(tps_list)
        print(f"\n📊 {config['name']} Summary")
        print(f"Average TTFT: {avg_ttft:.2f} seconds")
        print(f"Average TPS: {avg_tps:.2f}")
    else:
        print(f"\n⚠️ No successful responses for {config['name']}")

    # pause between benchmarks
    print("⏳ Sleeping 5 seconds before next model...\n")
    time.sleep(5)
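
The script above needs only the `requests` package and the local LM Studio and Ollama servers listening on their default ports. Because it sends `stream: False`, the value it labels TTFT is really total request latency, and TPS is completion tokens divided by that total. Below is a minimal sketch of how true time-to-first-token could be measured instead, assuming the same endpoints accept `stream: true` and emit OpenAI-style SSE lines of the form `data: {...}`; the `measure_streaming_ttft` helper name and the timeout value are my own, not part of the original gist.

import json
import time

import requests

def measure_streaming_ttft(url, model, prompt, timeout=300):
    """Return (ttft_seconds, total_seconds, chunk_count) for one streamed request.

    Assumes an OpenAI-compatible endpoint that streams SSE lines
    'data: {...}' and terminates with 'data: [DONE]'.
    """
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.7,
        "stream": True,
    }
    start = time.time()
    first_token_at = None
    chunks = 0

    with requests.post(url, json=payload, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            data = line.decode("utf-8")
            if not data.startswith("data: "):
                continue
            data = data[len("data: "):]
            if data.strip() == "[DONE]":
                break
            delta = json.loads(data)["choices"][0].get("delta", {})
            if delta.get("content"):
                chunks += 1
                if first_token_at is None:
                    # first content chunk marks time-to-first-token
                    first_token_at = time.time()

    end = time.time()
    ttft = (first_token_at or end) - start
    return ttft, end - start, chunks

# Hypothetical usage, reusing the LM Studio endpoint from the config above:
# ttft, total, chunks = measure_streaming_ttft(
#     "http://localhost:1234/v1/chat/completions",
#     "devstral-small-2505",
#     prompts[0],
# )
# print(f"TTFT: {ttft:.2f}s | total: {total:.2f}s | chunks: {chunks}")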