|
#!/usr/bin/env python3 |
|
"""Token discipline benchmark and prompt-breeding harness. |
|
|
|
This single script has two jobs: |
|
|
|
1. `bench`: run the full standard-vs-disciplined benchmark matrix. |
|
2. `breed`: compare prompt framings on one high-reasoning task. |
|
|
|
The script intentionally runs each task in a temporary working directory so the |
|
agent can create, test, and delete files without touching this repository. |
|
""" |
|
|
|
from __future__ import annotations |
|
|
|
import argparse |
|
import json |
|
import shutil |
|
import subprocess |
|
import sys |
|
import tempfile |
|
from dataclasses import dataclass |
|
from pathlib import Path |
|
from typing import Iterable |
|
|
|
|
|
# Template for the "disciplined" prompt: the rule text is placed in front of the
# standard prompt, which is substituted for the {standard_prompt} placeholder.
SYSTEM_RULE = (
    "SYSTEM RULE: Your response must contain zero conversational text. "
    "Only raw tool executions.\n---\n{standard_prompt}"
)

# Benchmark tasks keyed by the display name used in result tables.
TASKS = {
    "Python Scripting": (
        "Create a directory called calc_task. Inside, create a file calc.py "
        "with a Calculator class having add and sub methods. Then create "
        "test_calc.py with unit tests. Run the tests and if they pass, delete "
        "the calc_task directory."
    ),
    "Go HTTP Server": (
        "Create a directory called go_server. Inside, create main.go that "
        "serves 'hello world' on port 8080. Create a Makefile to build it. "
        "Run it in the background, curl the endpoint to verify it works, then "
        "kill the server and delete the go_server directory."
    ),
    "Documentation (SRP)": (
        "Create a directory called srp_docs. Inside, create an article.md "
        "explaining the Single Responsibility Principle with a short Python "
        "example. Then create a python script validate_md.py to parse the "
        "markdown file and ensure it has a title starting with #. Run the "
        "validation, and if it passes, delete the srp_docs directory."
    ),
}

# Values accepted by the --effort option of the bench and breed subcommands.
EFFORTS = ("low", "medium", "high", "xhigh")

# Gemini model IDs the harness accepts; used both as the --gemini-model choices
# and as the allow-list checked inside run_gemini.
ALLOWED_GEMINI_MODELS = ("gemini-3.1-pro-preview", "gemini-3-flash-preview")
|
|
|
|
|
@dataclass(frozen=True)
class Usage:
    """Immutable token-usage summary for one CLI invocation.

    Only ``output_tokens`` is required. The remaining fields have defaults so
    that callers can build a record from just a status and an error message.
    """

    # Output tokens reported by the CLI.
    output_tokens: int
    # Reasoning tokens; the Codex parser fills this from "reasoning_output_tokens".
    reasoning_tokens: int = 0
    input_tokens: int = 0
    total_tokens: int = 0
    # Outcome label, e.g. "success", "timeout", "missing-result" or "exit-<code>".
    status: str = "unknown"
    # Human-readable failure detail; empty when there is nothing to report.
    error: str = ""
    # Model names observed in the CLI statistics (filled by the Gemini parser).
    models: tuple[str, ...] = ()
|
|
|
|
|
@dataclass(frozen=True)
class BenchRow:
    """One benchmark table row pairing standard and disciplined token counts."""

    # Label for the CLI and model column.
    cli_model: str
    # Label for the task / configuration column.
    task_config: str
    standard_tokens: int
    disciplined_tokens: int

    @property
    def reduction(self) -> str:
        """Return the drop from standard to disciplined tokens as a percentage.

        Returns:
            A string such as ``"42.0%"`` (negative when the disciplined run used
            more tokens), or ``"n/a"`` when either count is not positive.
        """
        # A non-positive count means that run produced no usable measurement
        # (the runners record 0 output tokens for failed runs). Without this
        # guard a failed disciplined run would be reported as a 100% saving.
        if self.standard_tokens <= 0 or self.disciplined_tokens <= 0:
            return "n/a"
        value = (self.standard_tokens - self.disciplined_tokens) / self.standard_tokens * 100
        return f"{value:.1f}%"
|
|
|
|
|
def standard_prompt(task: str) -> str:
    """Return the baseline prompt: the senior-engineer persona followed by *task*."""
    persona = "You are a senior software engineer."
    return f"{persona} {task}"
|
|
|
|
|
def disciplined_prompt(task: str) -> str:
    """Return the baseline prompt for *task* wrapped in the SYSTEM_RULE template."""
    baseline = standard_prompt(task)
    return SYSTEM_RULE.format(standard_prompt=baseline)
|
|
|
|
|
def parse_json_lines(text: str) -> Iterable[dict]:
    """Yield the JSON objects found one-per-line in *text*.

    Each line is stripped of surrounding whitespace. Lines that do not begin
    with ``{`` are ignored, and so are lines that fail to decode as JSON.
    """
    for candidate in map(str.strip, text.splitlines()):
        if candidate.startswith("{"):
            try:
                record = json.loads(candidate)
            except json.JSONDecodeError:
                # Not valid JSON after all; skip it and keep scanning.
                continue
            yield record
|
|
|
|
|
def parse_codex_usage(stdout: str, stderr: str) -> Usage:
    """Aggregate token usage from the JSON event stream printed by Codex.

    Every event whose ``type`` is ``"turn.completed"`` contributes its
    ``usage`` counters to the running totals.

    Args:
        stdout: Captured standard output containing one JSON event per line.
        stderr: Captured standard error, used as the error text on failure.

    Returns:
        A ``Usage`` whose status is ``"success"`` if at least one
        ``turn.completed`` event was seen and ``"unknown"`` otherwise. The
        error field carries the stripped stderr only in the latter case.
    """
    output_tokens = 0
    reasoning_tokens = 0
    input_tokens = 0
    total_tokens = 0
    status = "unknown"

    for data in parse_json_lines(stdout):
        if data.get("type") != "turn.completed":
            continue
        status = "success"
        # ``or {}`` (rather than a .get default) also covers an explicit
        # ``"usage": null``, which would otherwise raise AttributeError below.
        usage = data.get("usage") or {}
        # ``or 0`` maps missing or null counters to zero before int().
        output_tokens += int(usage.get("output_tokens") or 0)
        reasoning_tokens += int(usage.get("reasoning_output_tokens") or 0)
        input_tokens += int(usage.get("input_tokens") or 0)
        total_tokens += int(usage.get("total_tokens") or 0)

    return Usage(
        output_tokens=output_tokens,
        reasoning_tokens=reasoning_tokens,
        input_tokens=input_tokens,
        total_tokens=total_tokens,
        status=status,
        error=stderr.strip() if status != "success" else "",
    )
|
|
|
|
|
def parse_gemini_usage(stdout: str, stderr: str) -> Usage:
    """Extract token usage from the JSON event stream printed by the Gemini CLI.

    The last event whose ``type`` is ``"result"`` is used; earlier ones are
    ignored.

    Args:
        stdout: Captured standard output containing one JSON event per line.
        stderr: Captured standard error, used as a fallback error message.

    Returns:
        A ``Usage`` built from the result's ``stats``. When no result event is
        present the status is ``"missing-result"`` and all counts are zero.
    """
    final_result: dict | None = None
    for data in parse_json_lines(stdout):
        if data.get("type") == "result":
            final_result = data

    if final_result is None:
        return Usage(output_tokens=0, status="missing-result", error=stderr.strip())

    stats = final_result.get("stats") or {}
    models = stats.get("models") or {}
    observed_models = tuple(sorted(models))

    def per_model_total(key: str) -> int:
        # Sum one counter across every entry of stats.models.
        return sum(int((model_stats or {}).get(key) or 0) for model_stats in models.values())

    # Some Gemini CLI versions expose useful totals only under stats.models, so
    # each top-level counter falls back to the per-model sum when it is zero or
    # absent. The same fallback is applied to all three counters.
    output_tokens = int(stats.get("output_tokens") or 0) or per_model_total("output_tokens")
    input_tokens = int(stats.get("input_tokens") or stats.get("input") or 0) or per_model_total("input_tokens")
    total_tokens = int(stats.get("total_tokens") or 0) or per_model_total("total_tokens")

    error = ""
    if final_result.get("status") != "success":
        payload = final_result.get("error")
        # Accept either an object with a "message" field or a bare value
        # (for example a plain string) as the error payload.
        message = payload.get("message") if isinstance(payload, dict) else payload
        error = str(message) if message else stderr.strip()

    return Usage(
        output_tokens=output_tokens,
        input_tokens=input_tokens,
        total_tokens=total_tokens,
        status=str(final_result.get("status") or "unknown"),
        error=error,
        models=observed_models,
    )
|
|
|
|
|
def run_command(cmd: list[str], cwd: Path, timeout_seconds: int) -> subprocess.CompletedProcess[str]:
    """Run *cmd* inside *cwd*, capturing stdout and stderr as text.

    A non-zero exit status does not raise; callers inspect ``returncode``.
    ``subprocess.TimeoutExpired`` propagates when *timeout_seconds* elapses.
    """
    completed = subprocess.run(
        cmd,
        cwd=cwd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=timeout_seconds,
        check=False,
    )
    return completed
|
|
|
|
|
def timeout_usage(error: subprocess.TimeoutExpired, parser: str) -> Usage:
    """Build a ``Usage`` for a run that hit its timeout.

    Whatever output was captured before the timeout is parsed, so that token
    counts already reported by the CLI are not lost.

    Args:
        error: The timeout exception raised by the subprocess call.
        parser: ``"codex"`` selects the Codex parser; any other value selects
            the Gemini parser.

    Returns:
        A ``"timeout-partial"`` record carrying the parsed counts when some
        output tokens were recovered, otherwise a zero-count ``"timeout"``
        record.
    """

    def as_text(stream: bytes | str | None) -> str:
        # The captured output may be bytes, str, or None.
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream or ""

    stdout = as_text(error.stdout)
    stderr = as_text(error.stderr)
    parsed = parse_codex_usage(stdout, stderr) if parser == "codex" else parse_gemini_usage(stdout, stderr)
    message = f"Timed out after {error.timeout} seconds."

    if parsed.output_tokens > 0:
        return Usage(
            output_tokens=parsed.output_tokens,
            # Carry reasoning tokens and observed models through as well, so a
            # partial record is not missing data the parser already extracted.
            reasoning_tokens=parsed.reasoning_tokens,
            input_tokens=parsed.input_tokens,
            total_tokens=parsed.total_tokens,
            status="timeout-partial",
            error=message,
            models=parsed.models,
        )
    return Usage(output_tokens=0, status="timeout", error=message)
|
|
|
|
|
def run_codex(prompt: str, effort: str, model: str, timeout_seconds: int) -> Usage:
    """Run one prompt through ``codex exec`` in a throwaway directory.

    Args:
        prompt: Full prompt text passed as the final command-line argument.
        effort: Reasoning effort forwarded as a ``-c`` configuration override.
        model: Model name forwarded with ``-m``.
        timeout_seconds: Wall-clock limit for the CLI process.

    Returns:
        The parsed ``Usage``. A timeout yields the record from
        ``timeout_usage``. A non-zero exit without a successful parse yields a
        zero-count record with status ``"exit-<returncode>"``.
    """
    with tempfile.TemporaryDirectory(prefix="token-discipline-codex-") as tmp:
        cmd = [
            "codex",
            "exec",
            "--skip-git-repo-check",
            "-c",
            # The Codex configuration key is "model_reasoning_effort". The
            # previous "reasoning_effort" override is not that key, so the
            # requested effort is unlikely to have been applied.
            f"model_reasoning_effort={effort}",
            "-m",
            model,
            "--dangerously-bypass-approvals-and-sandbox",
            "--json",
            prompt,
        ]
        try:
            result = run_command(cmd, Path(tmp), timeout_seconds)
        except subprocess.TimeoutExpired as error:
            return timeout_usage(error, "codex")
        usage = parse_codex_usage(result.stdout, result.stderr)
        # A failing exit code only overrides the parse when no turn completed.
        if result.returncode != 0 and usage.status != "success":
            return Usage(output_tokens=0, status=f"exit-{result.returncode}", error=result.stderr.strip())
        return usage
|
|
|
|
|
def run_gemini(prompt: str, model: str, timeout_seconds: int) -> Usage:
    """Run one prompt through the Gemini CLI in a throwaway directory.

    Models outside ``ALLOWED_GEMINI_MODELS`` are rejected before anything is
    launched. After the run, the result is also rejected if the reported
    statistics mention a model outside that list.
    """
    allowed_text = ", ".join(ALLOWED_GEMINI_MODELS)
    if model not in ALLOWED_GEMINI_MODELS:
        return Usage(
            output_tokens=0,
            status="invalid-model",
            error=f"Gemini model must be one of: {allowed_text}",
        )

    with tempfile.TemporaryDirectory(prefix="token-discipline-gemini-") as workdir:
        command = ["gemini", "-y", "--skip-trust", "-o", "stream-json", "-m", model, "-p", prompt]
        try:
            completed = run_command(command, Path(workdir), timeout_seconds)
        except subprocess.TimeoutExpired as expired:
            return timeout_usage(expired, "gemini")

        usage = parse_gemini_usage(completed.stdout, completed.stderr)

        # Models named in the CLI statistics that are not on the allow-list.
        unexpected = sorted({name for name in usage.models if name not in ALLOWED_GEMINI_MODELS})
        if unexpected:
            return Usage(
                output_tokens=0,
                input_tokens=usage.input_tokens,
                total_tokens=usage.total_tokens,
                status="disallowed-model",
                error=f"Gemini CLI routed to disallowed model(s): {', '.join(unexpected)}. Allowed: {allowed_text}",
                models=usage.models,
            )

        # A failing exit code is only reported when no result event was parsed.
        if completed.returncode != 0 and usage.status == "missing-result":
            return Usage(
                output_tokens=0,
                status=f"exit-{completed.returncode}",
                error=completed.stderr.strip(),
            )
        return usage
|
|
|
|
|
def markdown_table(rows: list[BenchRow]) -> str:
    """Render *rows* as a Markdown table with a header and alignment row.

    Token counts are formatted with thousands separators. The returned string
    has no trailing newline.
    """
    header = "| CLI & Model | Task / Configuration | Standard Tokens | Disciplined Tokens | Reduction |"
    alignment = "| :--- | :--- | ---: | ---: | ---: |"
    body = [
        f"| {row.cli_model} | {row.task_config} | "
        f"{row.standard_tokens:,} | {row.disciplined_tokens:,} | {row.reduction} |"
        for row in rows
    ]
    return "\n".join([header, alignment, *body])
|
|
|
|
|
def write_results(out_dir: Path, name: str, content: str) -> None:
    """Write *content* plus a trailing newline to ``out_dir / name`` as UTF-8.

    The directory (including parents) is created when missing, and the written
    path is reported on stderr.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / name
    with target.open("w", encoding="utf-8") as handle:
        handle.write(f"{content}\n")
    print(f"Wrote {target}", file=sys.stderr)
|
|
|
|
|
def bench(args: argparse.Namespace) -> int:
    """Run every task with the standard and the disciplined prompt and report.

    For each task the selected Gemini models are run first, then the selected
    Codex effort levels. The Markdown table is printed to stdout, and when
    ``args.out_dir`` is set the table and the raw per-run records are written
    there as well. Always returns 0.
    """
    use_gemini = args.engine in {"gemini", "both"}
    use_codex = args.engine in {"codex", "both"}
    table_rows: list[BenchRow] = []
    records: list[dict] = []

    for task_name, task in TASKS.items():
        standard = standard_prompt(task)
        disciplined = disciplined_prompt(task)

        if use_gemini:
            for gemini_model in args.gemini_model:
                progress = f"Running Gemini: {task_name} {gemini_model}"
                print(f"{progress} standard", file=sys.stderr)
                std_usage = run_gemini(standard, gemini_model, args.timeout)
                print(f"{progress} disciplined", file=sys.stderr)
                disc_usage = run_gemini(disciplined, gemini_model, args.timeout)

                table_rows.append(
                    BenchRow(
                        cli_model=f"**Gemini CLI** ({gemini_model})",
                        task_config=task_name,
                        standard_tokens=std_usage.output_tokens,
                        disciplined_tokens=disc_usage.output_tokens,
                    )
                )
                records.append(
                    {
                        "engine": "gemini",
                        "model": gemini_model,
                        "task": task_name,
                        "standard": vars(std_usage),
                        "disciplined": vars(disc_usage),
                    }
                )

        if use_codex:
            for effort in args.effort:
                progress = f"Running Codex: {task_name} {effort}"
                print(f"{progress} standard", file=sys.stderr)
                std_usage = run_codex(standard, effort, args.codex_model, args.timeout)
                print(f"{progress} disciplined", file=sys.stderr)
                disc_usage = run_codex(disciplined, effort, args.codex_model, args.timeout)

                table_rows.append(
                    BenchRow(
                        cli_model=f"**Codex CLI** ({args.codex_model})",
                        task_config=f"{task_name} ({effort} effort)",
                        standard_tokens=std_usage.output_tokens,
                        disciplined_tokens=disc_usage.output_tokens,
                    )
                )
                records.append(
                    {
                        "engine": "codex",
                        "model": args.codex_model,
                        "effort": effort,
                        "task": task_name,
                        "standard": vars(std_usage),
                        "disciplined": vars(disc_usage),
                    }
                )

    table = markdown_table(table_rows)
    print(table)

    if args.out_dir:
        destination = Path(args.out_dir)
        # One JSON document per line, in the order the runs were executed.
        raw_lines = "\n".join(json.dumps(record) for record in records)
        write_results(destination, "benchmark_table.md", table)
        write_results(destination, "benchmark_raw.jsonl", raw_lines)
    return 0
|
|
|
|
|
def breed(args: argparse.Namespace) -> int:
    """Compare several prompt framings on the "Python Scripting" task via Codex.

    Each candidate prompt is run once with ``args.effort`` and
    ``args.codex_model``. A Markdown comparison table is printed to stdout and,
    when ``args.out_dir`` is set, written there together with the raw usage
    records. Always returns 0.
    """
    task = TASKS["Python Scripting"]
    standard = standard_prompt(task)
    candidates = {
        "S_Baseline": standard,
        "P1_Legacy_Negative": (
            "Token discipline: no user-facing prose unless needed to complete the requested action. "
            "Avoid mid-task updates. Act from context. Omit logs and repeated context.\n"
            f"{standard}"
        ),
        "P2_System_Rule": SYSTEM_RULE.format(standard_prompt=standard),
        "P3_Minimal": f"Use tools immediately. Do not explain your steps.\n{standard}",
        "P4_Tag_Rule": f"<output_rules>No prose. Only tool calls.</output_rules>\n{standard}",
        "P5_Post_Task": f"{standard}\nConstraint: Provide the solution strictly via tool calls without explaining.",
        "P6_Role_Format": (
            "Act as a senior software engineer. Output constraint: "
            f"No markdown prose, only raw code execution.\nTask: {task}"
        ),
    }

    usages: dict[str, Usage] = {}
    for name, prompt in candidates.items():
        print(f"Running breeder candidate: {name}", file=sys.stderr)
        usages[name] = run_codex(prompt, args.effort, args.codex_model, args.timeout)

    baseline = usages["S_Baseline"]
    lines = [
        "| Prompt | Output Tokens | Reasoning Tokens | Non-Reasoning Tokens | Reduction vs Baseline |",
        "| :--- | ---: | ---: | ---: | ---: |",
    ]
    for name, usage in usages.items():
        non_reasoning = usage.output_tokens - usage.reasoning_tokens
        if name == "S_Baseline":
            reduction = "baseline"
        elif baseline.output_tokens > 0 and usage.output_tokens > 0:
            reduction = f"{(baseline.output_tokens - usage.output_tokens) / baseline.output_tokens * 100:.1f}%"
        else:
            # Either the baseline or this candidate has no output tokens, so a
            # percentage would be meaningless. Previously such candidates were
            # labelled "baseline" or shown as a 100% reduction.
            reduction = "n/a"
        lines.append(
            f"| {name} | {usage.output_tokens:,} | {usage.reasoning_tokens:,} | "
            f"{non_reasoning:,} | {reduction} |"
        )

    table = "\n".join(lines)
    print(table)
    if args.out_dir:
        out_dir = Path(args.out_dir)
        write_results(out_dir, "breeder_table.md", table)
        write_results(
            out_dir,
            "breeder_raw.json",
            json.dumps({name: usage.__dict__ for name, usage in usages.items()}, indent=2),
        )
    return 0
|
|
|
|
|
def parse_smoke(args: argparse.Namespace) -> int:
    """Check both usage parsers against embedded sample output.

    Args:
        args: Parsed command-line arguments; not used by this command.

    Returns:
        0 when every parsed value matches its expectation, 1 otherwise. Each
        mismatch is reported on stderr.
    """
    samples = {
        "codex": (
            '{"type":"turn.completed","usage":{"output_tokens":1615,'
            '"reasoning_output_tokens":1011,"input_tokens":42,"total_tokens":1657}}'
        ),
        "gemini": (
            '{"type":"result","status":"success","stats":{"total_tokens":11405,'
            '"input_tokens":11389,"output_tokens":1,"models":{"gemini-3.1-pro-preview":'
            '{"total_tokens":11405,"input_tokens":11389,"output_tokens":1}}}}'
        ),
    }
    codex = parse_codex_usage(samples["codex"], "")
    gemini = parse_gemini_usage(samples["gemini"], "")

    # Explicit comparisons instead of assert statements: asserts are removed
    # when Python runs with -O, which would make this check pass vacuously.
    checks = (
        ("codex output_tokens", codex.output_tokens, 1615),
        ("codex reasoning_tokens", codex.reasoning_tokens, 1011),
        ("gemini output_tokens", gemini.output_tokens, 1),
    )
    failures = [
        f"{label}: expected {expected}, got {actual}"
        for label, actual, expected in checks
        if actual != expected
    ]
    if failures:
        for failure in failures:
            print(f"Parser smoke test failed: {failure}", file=sys.stderr)
        return 1

    print("Parser smoke test passed.")
    return 0
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with its three subcommands.

    Each subcommand stores its handler under ``func`` so the caller can
    dispatch with ``args.func(args)``.
    """
    root = argparse.ArgumentParser(description=__doc__)
    commands = root.add_subparsers(dest="command", required=True)

    # bench: the full standard-vs-disciplined matrix.
    bench_cmd = commands.add_parser("bench", help="Run the benchmark matrix.")
    bench_cmd.add_argument("--engine", choices=("codex", "gemini", "both"), default="both")
    bench_cmd.add_argument("--codex-model", default="gpt-5.4-mini")
    bench_cmd.add_argument(
        "--gemini-model",
        nargs="+",
        choices=ALLOWED_GEMINI_MODELS,
        default=list(ALLOWED_GEMINI_MODELS),
        help="Gemini CLI model(s). Only Gemini 3.1 Pro and Gemini 3 Flash preview IDs are allowed.",
    )
    bench_cmd.add_argument("--effort", nargs="+", choices=EFFORTS, default=list(EFFORTS))
    bench_cmd.add_argument("--timeout", type=int, default=300)
    bench_cmd.add_argument("--out-dir", default="_results")
    bench_cmd.set_defaults(func=bench)

    # breed: prompt-framing comparison at a single effort level.
    breed_cmd = commands.add_parser("breed", help="Run prompt-breeding candidates.")
    breed_cmd.add_argument("--codex-model", default="gpt-5.4-mini")
    breed_cmd.add_argument("--effort", choices=EFFORTS, default="xhigh")
    breed_cmd.add_argument("--timeout", type=int, default=300)
    breed_cmd.add_argument("--out-dir", default="_results")
    breed_cmd.set_defaults(func=breed)

    # parse-smoke: parser self-check; takes no options.
    smoke_cmd = commands.add_parser("parse-smoke", help="Validate JSON usage parsers.")
    smoke_cmd.set_defaults(func=parse_smoke)

    return root
|
|
|
|
|
def main() -> int:
    """Parse the command line and dispatch to the selected subcommand.

    Returns:
        The subcommand's exit status, or 2 when ``python3`` or a CLI needed by
        the selected subcommand is not on PATH.
    """
    if not shutil.which("python3"):
        print("python3 is required.", file=sys.stderr)
        return 2
    parser = build_parser()
    args = parser.parse_args()

    # Fail fast when an agent CLI is missing. Otherwise the first subprocess
    # launch raises FileNotFoundError part-way through a run.
    engine = getattr(args, "engine", None)  # only the bench subcommand defines it
    required: list[str] = []
    if args.command == "breed" or engine in {"codex", "both"}:
        required.append("codex")
    if engine in {"gemini", "both"}:
        required.append("gemini")
    missing = [tool for tool in required if not shutil.which(tool)]
    if missing:
        print(f"Required CLI not found on PATH: {', '.join(missing)}", file=sys.stderr)
        return 2

    return int(args.func(args))
|
|
|
|
|
# Script entry point: exit the interpreter with main()'s return value.
if __name__ == "__main__":
    sys.exit(main())