Skip to content

Instantly share code, notes, and snippets.

@omar391
Last active May 9, 2026 06:42
Show Gist options
  • Select an option

  • Save omar391/bc15f0cbeaa7453d1c3d3958e736264f to your computer and use it in GitHub Desktop.

Select an option

Save omar391/bc15f0cbeaa7453d1c3d3958e736264f to your computer and use it in GitHub Desktop.
Gemini CLI Token Discipline Benchmark
#!/usr/bin/env python3
"""Token discipline benchmark and prompt-breeding harness.
This single script has two jobs:
1. `bench`: run the full standard-vs-disciplined benchmark matrix.
2. `breed`: compare prompt framings on one high-reasoning task.
The script intentionally runs each task in a temporary working directory so the
agent can create, test, and delete files without touching this repository.
"""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
import sys
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
# Template for the "disciplined" prompt: a terse output rule, a separator line,
# then the unmodified standard prompt substituted for {standard_prompt}.
SYSTEM_RULE = (
    "SYSTEM RULE: Your response must contain zero conversational text. Only raw tool executions."
    "\n---\n"
    "{standard_prompt}"
)

# Benchmark tasks keyed by display name. Insertion order is the order in which
# the benchmark iterates them, so it is also the row order of the result table.
TASKS = {
    "Python Scripting": (
        "Create a directory called calc_task. "
        "Inside, create a file calc.py with a Calculator class having add and sub methods. "
        "Then create test_calc.py with unit tests. "
        "Run the tests and if they pass, delete the calc_task directory."
    ),
    "Go HTTP Server": (
        "Create a directory called go_server. "
        "Inside, create main.go that serves 'hello world' on port 8080. "
        "Create a Makefile to build it. "
        "Run it in the background, curl the endpoint to verify it works, "
        "then kill the server and delete the go_server directory."
    ),
    "Documentation (SRP)": (
        "Create a directory called srp_docs. "
        "Inside, create an article.md explaining the Single Responsibility Principle "
        "with a short Python example. "
        "Then create a python script validate_md.py to parse the markdown file "
        "and ensure it has a title starting with #. "
        "Run the validation, and if it passes, delete the srp_docs directory."
    ),
}

# Reasoning-effort levels accepted by the --effort command-line options.
EFFORTS = ("low", "medium", "high", "xhigh")

# The only Gemini model IDs the harness accepts, both as requested models and
# as models observed in the CLI's reported stats.
ALLOWED_GEMINI_MODELS = ("gemini-3.1-pro-preview", "gemini-3-flash-preview")
@dataclass(frozen=True)
class Usage:
    """Immutable token-usage summary for a single CLI run.

    Only ``output_tokens`` is required; every other field has a default so
    failure paths can build a record from just a status and an error message.
    """

    # Output tokens reported by the CLI (summed over events by the parsers).
    output_tokens: int
    # Reasoning share of the output tokens, when the CLI reports it.
    reasoning_tokens: int = 0
    # Input (prompt) tokens reported by the CLI.
    input_tokens: int = 0
    # Total tokens as reported by the CLI; left at 0 when not reported.
    total_tokens: int = 0
    # Outcome label, e.g. "success", "timeout", "exit-<code>", "missing-result".
    status: str = "unknown"
    # Human-readable failure detail; empty when there is nothing to report.
    error: str = ""
    # Model names observed in the run's stats (populated by the Gemini parser).
    models: tuple[str, ...] = ()
@dataclass(frozen=True)
class BenchRow:
    """One row of the benchmark table: a standard/disciplined token pair."""

    # Display label for the CLI and model, e.g. "**Codex CLI** (model-name)".
    cli_model: str
    # Task name, optionally suffixed with the configuration (e.g. effort level).
    task_config: str
    # Output tokens measured for the standard prompt.
    standard_tokens: int
    # Output tokens measured for the disciplined prompt.
    disciplined_tokens: int

    @property
    def reduction(self) -> str:
        """Return the percentage of tokens saved by the disciplined prompt.

        Returns:
            A string such as ``"42.0%"`` (negative when the disciplined run
            used more tokens), or ``"n/a"`` when either run has no positive
            token count to compare.
        """
        # The runners in this file record output_tokens=0 for failed runs
        # (timeouts, non-zero exits, rejected models). A zero on the
        # disciplined side would otherwise render as a bogus "100.0%" saving,
        # so treat a missing measurement on either side as not comparable.
        if self.standard_tokens <= 0 or self.disciplined_tokens <= 0:
            return "n/a"
        saved = self.standard_tokens - self.disciplined_tokens
        return f"{saved / self.standard_tokens * 100:.1f}%"
def standard_prompt(task: str) -> str:
    """Return the baseline prompt: a fixed role preamble followed by *task*."""
    return "You are a senior software engineer. {}".format(task)
def disciplined_prompt(task: str) -> str:
    """Return the standard prompt for *task* wrapped in the SYSTEM_RULE template."""
    baseline = standard_prompt(task)
    return SYSTEM_RULE.format(standard_prompt=baseline)
def parse_json_lines(text: str) -> Iterable[dict]:
    """Yield every JSON object found on its own line of *text*.

    Each line is stripped of surrounding whitespace. Lines that do not start
    with ``{`` and lines that fail to decode as JSON are skipped silently, so
    log noise interleaved with JSON events is tolerated.
    """
    for candidate in map(str.strip, text.splitlines()):
        if candidate.startswith("{"):
            try:
                yield json.loads(candidate)
            except json.JSONDecodeError:
                # Not valid JSON after all (e.g. a truncated line): ignore it.
                pass
def parse_codex_usage(stdout: str, stderr: str) -> Usage:
    """Aggregate token usage from the Codex CLI's JSON-lines output.

    Every ``turn.completed`` event contributes its ``usage`` counters to the
    running totals; all other events are ignored.

    Args:
        stdout: Captured standard output holding one JSON event per line.
        stderr: Captured standard error; reported as the error text when no
            ``turn.completed`` event was seen.

    Returns:
        A ``Usage`` with status ``"success"`` if at least one turn completed,
        otherwise ``"unknown"`` with the stripped stderr as the error.
    """
    # Maps each Usage counter to the key it is read from in the event payload.
    totals = {
        "output_tokens": 0,
        "reasoning_output_tokens": 0,
        "input_tokens": 0,
        "total_tokens": 0,
    }
    completed = False
    for event in parse_json_lines(stdout):
        if event.get("type") != "turn.completed":
            continue
        completed = True
        usage = event.get("usage")
        if not isinstance(usage, dict):
            # A missing, null, or malformed usage payload counts as zero
            # tokens instead of aborting the whole parse.
            continue
        for key in totals:
            totals[key] += int(usage.get(key) or 0)

    status = "success" if completed else "unknown"
    return Usage(
        output_tokens=totals["output_tokens"],
        reasoning_tokens=totals["reasoning_output_tokens"],
        input_tokens=totals["input_tokens"],
        total_tokens=totals["total_tokens"],
        status=status,
        error="" if completed else stderr.strip(),
    )
def parse_gemini_usage(stdout: str, stderr: str) -> Usage:
    """Extract token usage from the Gemini CLI's stream-json output.

    The last event whose ``type`` is ``"result"`` is taken as the final
    summary; earlier result events are overwritten.

    Args:
        stdout: Captured standard output holding one JSON event per line.
        stderr: Captured standard error; used as the error text when the
            result is missing or carries no error message of its own.

    Returns:
        A ``Usage`` with the reported status, token counters, and the sorted
        names of the models listed under ``stats.models``. Status is
        ``"missing-result"`` when no result event was found.
    """
    final_result: dict | None = None
    for data in parse_json_lines(stdout):
        if data.get("type") == "result":
            final_result = data
    if final_result is None:
        return Usage(output_tokens=0, status="missing-result", error=stderr.strip())

    stats = final_result.get("stats") or {}
    models = stats.get("models") or {}
    observed_models = tuple(sorted(models))

    def per_model_total(key: str) -> int:
        """Sum *key* across the per-model stats, skipping non-dict entries."""
        return sum(
            int(model_stats.get(key) or 0)
            for model_stats in models.values()
            if isinstance(model_stats, dict)
        )

    output_tokens = int(stats.get("output_tokens") or 0)
    input_tokens = int(stats.get("input_tokens") or stats.get("input") or 0)
    total_tokens = int(stats.get("total_tokens") or 0)
    # Some Gemini CLI versions expose useful totals only under stats.models.
    # Apply that fallback to every counter, not just output tokens.
    # NOTE(review): assumes per-model entries use the same key names as the
    # top-level stats — confirm against the CLI version in use.
    if models:
        if output_tokens == 0:
            output_tokens = per_model_total("output_tokens")
        if input_tokens == 0:
            input_tokens = per_model_total("input_tokens")
        if total_tokens == 0:
            total_tokens = per_model_total("total_tokens")

    error = ""
    if final_result.get("status") != "success":
        error_payload = final_result.get("error")
        if isinstance(error_payload, dict):
            message = error_payload.get("message")
        else:
            # The payload may be a plain string (or absent) rather than an
            # object with a "message" field.
            message = error_payload
        error = str(message) if message else stderr.strip()

    return Usage(
        output_tokens=output_tokens,
        input_tokens=input_tokens,
        total_tokens=total_tokens,
        status=str(final_result.get("status") or "unknown"),
        error=error,
        models=observed_models,
    )
def run_command(cmd: list[str], cwd: Path, timeout_seconds: int) -> subprocess.CompletedProcess[str]:
    """Run *cmd* in *cwd*, capturing stdout and stderr as text.

    A non-zero exit status does not raise (``check=False``); callers inspect
    ``returncode`` themselves. ``subprocess.TimeoutExpired`` propagates when
    the command runs longer than *timeout_seconds*.
    """
    options = {
        "cwd": cwd,
        "capture_output": True,
        "text": True,
        "timeout": timeout_seconds,
        "check": False,
    }
    return subprocess.run(cmd, **options)
def timeout_usage(error: subprocess.TimeoutExpired, parser: str) -> Usage:
    """Build a ``Usage`` record for a run that hit its timeout.

    Whatever output was captured before the timeout is parsed so that token
    counts reported before the deadline are not lost.

    Args:
        error: The ``TimeoutExpired`` raised by the subprocess call.
        parser: ``"codex"`` selects the Codex parser; any other value selects
            the Gemini parser.

    Returns:
        A ``Usage`` with status ``"timeout-partial"`` carrying the parsed
        counters when any output tokens were recovered, otherwise a zeroed
        ``Usage`` with status ``"timeout"``.
    """

    def as_text(stream: bytes | str | None) -> str:
        """Normalize a captured stream that may be bytes, str, or None."""
        if isinstance(stream, bytes):
            return stream.decode("utf-8", errors="replace")
        return stream or ""

    stdout = as_text(error.stdout)
    stderr = as_text(error.stderr)
    parsed = parse_codex_usage(stdout, stderr) if parser == "codex" else parse_gemini_usage(stdout, stderr)
    message = f"Timed out after {error.timeout} seconds."
    if parsed.output_tokens > 0:
        return Usage(
            output_tokens=parsed.output_tokens,
            # Keep the reasoning split and observed models as well; dropping
            # them made partial results look like they had no reasoning.
            reasoning_tokens=parsed.reasoning_tokens,
            input_tokens=parsed.input_tokens,
            total_tokens=parsed.total_tokens,
            status="timeout-partial",
            error=message,
            models=parsed.models,
        )
    return Usage(output_tokens=0, status="timeout", error=message)
def run_codex(prompt: str, effort: str, model: str, timeout_seconds: int) -> Usage:
    """Run one prompt through ``codex exec`` and return its token usage.

    The command runs inside a fresh temporary directory that is removed when
    the run finishes, so files the agent creates do not touch the caller's
    working tree.

    Args:
        prompt: Full prompt text passed as the final command-line argument.
        effort: Reasoning-effort level passed as a config override.
        model: Model name passed via ``-m``.
        timeout_seconds: Deadline for the whole CLI invocation.

    Returns:
        The parsed ``Usage``. Failures are reported through ``status``:
        ``"timeout"``/``"timeout-partial"`` on a timeout, ``"launch-failed"``
        when the executable could not be started, and ``"exit-<code>"`` when
        the CLI exited non-zero without a completed turn.
    """
    with tempfile.TemporaryDirectory(prefix="token-discipline-codex-") as tmp:
        cmd = [
            "codex",
            "exec",
            "--skip-git-repo-check",
            "-c",
            # Codex's configuration key is `model_reasoning_effort`; the
            # previous `reasoning_effort` key is not a documented setting, so
            # the requested effort level would not have been applied.
            f"model_reasoning_effort={effort}",
            "-m",
            model,
            "--dangerously-bypass-approvals-and-sandbox",
            "--json",
            prompt,
        ]
        try:
            result = run_command(cmd, Path(tmp), timeout_seconds)
        except subprocess.TimeoutExpired as error:
            return timeout_usage(error, "codex")
        except OSError as error:
            # E.g. the `codex` executable is not installed or not executable.
            # Record the failure like any other instead of aborting the run.
            return Usage(output_tokens=0, status="launch-failed", error=str(error))
        usage = parse_codex_usage(result.stdout, result.stderr)
        # A non-zero exit still counts if a turn completed and reported usage.
        if result.returncode != 0 and usage.status != "success":
            return Usage(output_tokens=0, status=f"exit-{result.returncode}", error=result.stderr.strip())
        return usage
def run_gemini(prompt: str, model: str, timeout_seconds: int) -> Usage:
    """Run one prompt through the Gemini CLI and return its token usage.

    The command runs inside a fresh temporary directory that is removed when
    the run finishes.

    Args:
        prompt: Full prompt text passed via ``-p``.
        model: Model ID passed via ``-m``; must be in ``ALLOWED_GEMINI_MODELS``.
        timeout_seconds: Deadline for the whole CLI invocation.

    Returns:
        The parsed ``Usage``. Failures are reported through ``status``:
        ``"invalid-model"`` when *model* is not allowed (the CLI is not run),
        ``"timeout"``/``"timeout-partial"`` on a timeout, ``"launch-failed"``
        when the executable could not be started, ``"disallowed-model"`` when
        the reported stats list a model outside the allow-list, and
        ``"exit-<code>"`` when the CLI exited non-zero without a result event.
    """
    allowed = ", ".join(ALLOWED_GEMINI_MODELS)
    if model not in ALLOWED_GEMINI_MODELS:
        return Usage(output_tokens=0, status="invalid-model", error=f"Gemini model must be one of: {allowed}")
    with tempfile.TemporaryDirectory(prefix="token-discipline-gemini-") as tmp:
        cmd = ["gemini", "-y", "--skip-trust", "-o", "stream-json", "-m", model, "-p", prompt]
        try:
            result = run_command(cmd, Path(tmp), timeout_seconds)
        except subprocess.TimeoutExpired as error:
            return timeout_usage(error, "gemini")
        except OSError as error:
            # E.g. the `gemini` executable is not installed or not executable.
            # Record the failure like any other instead of aborting the run.
            return Usage(output_tokens=0, status="launch-failed", error=str(error))
        usage = parse_gemini_usage(result.stdout, result.stderr)
        disallowed = sorted(set(usage.models) - set(ALLOWED_GEMINI_MODELS))
        if disallowed:
            # The stats list a model outside the allow-list, so the output
            # token count is zeroed to keep it out of the comparison.
            return Usage(
                output_tokens=0,
                input_tokens=usage.input_tokens,
                total_tokens=usage.total_tokens,
                status="disallowed-model",
                error=f"Gemini CLI routed to disallowed model(s): {', '.join(disallowed)}. Allowed: {allowed}",
                models=usage.models,
            )
        if result.returncode != 0 and usage.status == "missing-result":
            return Usage(output_tokens=0, status=f"exit-{result.returncode}", error=result.stderr.strip())
        return usage
def markdown_table(rows: list[BenchRow]) -> str:
    """Render *rows* as a Markdown table with a header and alignment row.

    Token counts are formatted with thousands separators. With no rows the
    result is just the header and alignment lines.
    """
    header = "| CLI & Model | Task / Configuration | Standard Tokens | Disciplined Tokens | Reduction |"
    alignment = "| :--- | :--- | ---: | ---: | ---: |"
    body = [
        f"| {entry.cli_model} | {entry.task_config} | "
        f"{entry.standard_tokens:,} | {entry.disciplined_tokens:,} | {entry.reduction} |"
        for entry in rows
    ]
    return "\n".join([header, alignment, *body])
def write_results(out_dir: Path, name: str, content: str) -> None:
    """Write *content* plus a trailing newline to ``out_dir / name`` as UTF-8.

    The output directory (including missing parents) is created on demand,
    and the written path is announced on stderr.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    destination = out_dir / name
    with destination.open("w", encoding="utf-8") as handle:
        handle.write(content + "\n")
    print(f"Wrote {destination}", file=sys.stderr)
def _run_prompt_pair(banner, prompts, run):
    """Run the standard then the disciplined prompt, announcing each on stderr.

    *prompts* is a ``(standard, disciplined)`` pair and *run* maps a prompt to
    a ``Usage``. Returns the two usages in the same order.
    """
    usages = []
    for variant, prompt in zip(("standard", "disciplined"), prompts):
        print(f"{banner} {variant}", file=sys.stderr)
        usages.append(run(prompt))
    return usages[0], usages[1]


def bench(args: argparse.Namespace) -> int:
    """Run the standard-vs-disciplined matrix and report the results.

    For every task, each selected Gemini model and each selected Codex effort
    level is run twice (standard prompt, then disciplined prompt). The
    Markdown table is printed to stdout; when ``args.out_dir`` is set, the
    table and the raw per-run records (JSON lines) are also written there.

    Returns:
        Always 0.
    """
    table_rows: list[BenchRow] = []
    records: list[dict] = []
    use_gemini = args.engine in {"gemini", "both"}
    use_codex = args.engine in {"codex", "both"}

    for task_name, task in TASKS.items():
        prompts = (standard_prompt(task), disciplined_prompt(task))

        if use_gemini:
            for gemini_model in args.gemini_model:
                std_usage, disc_usage = _run_prompt_pair(
                    f"Running Gemini: {task_name} {gemini_model}",
                    prompts,
                    lambda prompt: run_gemini(prompt, gemini_model, args.timeout),
                )
                table_rows.append(
                    BenchRow(
                        cli_model=f"**Gemini CLI** ({gemini_model})",
                        task_config=task_name,
                        standard_tokens=std_usage.output_tokens,
                        disciplined_tokens=disc_usage.output_tokens,
                    )
                )
                records.append(
                    {
                        "engine": "gemini",
                        "model": gemini_model,
                        "task": task_name,
                        "standard": vars(std_usage),
                        "disciplined": vars(disc_usage),
                    }
                )

        if use_codex:
            for effort in args.effort:
                std_usage, disc_usage = _run_prompt_pair(
                    f"Running Codex: {task_name} {effort}",
                    prompts,
                    lambda prompt: run_codex(prompt, effort, args.codex_model, args.timeout),
                )
                table_rows.append(
                    BenchRow(
                        cli_model=f"**Codex CLI** ({args.codex_model})",
                        task_config=f"{task_name} ({effort} effort)",
                        standard_tokens=std_usage.output_tokens,
                        disciplined_tokens=disc_usage.output_tokens,
                    )
                )
                records.append(
                    {
                        "engine": "codex",
                        "model": args.codex_model,
                        "effort": effort,
                        "task": task_name,
                        "standard": vars(std_usage),
                        "disciplined": vars(disc_usage),
                    }
                )

    table = markdown_table(table_rows)
    print(table)
    if args.out_dir:
        target_dir = Path(args.out_dir)
        raw_lines = [json.dumps(record) for record in records]
        write_results(target_dir, "benchmark_table.md", table)
        write_results(target_dir, "benchmark_raw.jsonl", "\n".join(raw_lines))
    return 0
def breed(args: argparse.Namespace) -> int:
    """Compare prompt framings on the "Python Scripting" task via Codex.

    Each candidate prompt is run once with ``args.effort`` and
    ``args.codex_model``. A Markdown table of output, reasoning, and
    non-reasoning tokens is printed to stdout; when ``args.out_dir`` is set,
    the table and the raw usages (JSON) are also written there.

    Returns:
        Always 0.
    """
    task = TASKS["Python Scripting"]
    standard = standard_prompt(task)
    baseline_name = "S_Baseline"
    candidates = {
        baseline_name: standard,
        "P1_Legacy_Negative": (
            "Token discipline: no user-facing prose unless needed to complete the requested action. "
            "Avoid mid-task updates. Act from context. Omit logs and repeated context.\n"
            f"{standard}"
        ),
        "P2_System_Rule": SYSTEM_RULE.format(standard_prompt=standard),
        "P3_Minimal": f"Use tools immediately. Do not explain your steps.\n{standard}",
        "P4_Tag_Rule": f"<output_rules>No prose. Only tool calls.</output_rules>\n{standard}",
        "P5_Post_Task": f"{standard}\nConstraint: Provide the solution strictly via tool calls without explaining.",
        "P6_Role_Format": (
            "Act as a senior software engineer. Output constraint: "
            f"No markdown prose, only raw code execution.\nTask: {task}"
        ),
    }

    usages: dict[str, Usage] = {}
    for name, prompt in candidates.items():
        print(f"Running breeder candidate: {name}", file=sys.stderr)
        usages[name] = run_codex(prompt, args.effort, args.codex_model, args.timeout)

    baseline = usages[baseline_name]
    lines = [
        "| Prompt | Output Tokens | Reasoning Tokens | Non-Reasoning Tokens | Reduction vs Baseline |",
        "| :--- | ---: | ---: | ---: | ---: |",
    ]
    for name, usage in usages.items():
        non_reasoning = usage.output_tokens - usage.reasoning_tokens
        if name == baseline_name:
            reduction = "baseline"
        elif baseline.output_tokens > 0 and usage.output_tokens > 0:
            reduction = f"{(baseline.output_tokens - usage.output_tokens) / baseline.output_tokens * 100:.1f}%"
        else:
            # Either the baseline or this candidate recorded no output tokens
            # (failed runs are recorded as 0). Previously a failed baseline
            # labelled every row "baseline" and a failed candidate showed a
            # misleading 100% reduction.
            reduction = "n/a"
        lines.append(
            f"| {name} | {usage.output_tokens:,} | {usage.reasoning_tokens:,} | "
            f"{non_reasoning:,} | {reduction} |"
        )

    table = "\n".join(lines)
    print(table)
    if args.out_dir:
        out_dir = Path(args.out_dir)
        write_results(out_dir, "breeder_table.md", table)
        write_results(
            out_dir,
            "breeder_raw.json",
            json.dumps({name: usage.__dict__ for name, usage in usages.items()}, indent=2),
        )
    return 0
def parse_smoke(args: argparse.Namespace) -> int:
    """Check both usage parsers against canned sample events.

    Runs entirely offline: no CLI is launched. *args* is accepted only so the
    function matches the sub-command handler signature; it is not read.

    Returns:
        0 after printing a success message.

    Raises:
        AssertionError: If a parsed value differs from the expected one.
    """
    samples = {
        "codex": (
            '{"type":"turn.completed","usage":{"output_tokens":1615,'
            '"reasoning_output_tokens":1011,"input_tokens":42,"total_tokens":1657}}'
        ),
        "gemini": (
            '{"type":"result","status":"success","stats":{"total_tokens":11405,'
            '"input_tokens":11389,"output_tokens":1,"models":{"gemini-3.1-pro-preview":'
            '{"total_tokens":11405,"input_tokens":11389,"output_tokens":1}}}}'
        ),
    }
    codex = parse_codex_usage(samples["codex"], "")
    gemini = parse_gemini_usage(samples["gemini"], "")
    checks = (
        ("codex.output_tokens", codex.output_tokens, 1615),
        ("codex.reasoning_tokens", codex.reasoning_tokens, 1011),
        ("gemini.output_tokens", gemini.output_tokens, 1),
    )
    for label, actual, expected in checks:
        # Raise explicitly instead of using `assert`: assert statements are
        # removed under `python -O`, which would make this test pass vacuously.
        if actual != expected:
            raise AssertionError(f"{label}: expected {expected}, got {actual}")
    print("Parser smoke test passed.")
    return 0
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with its three sub-commands.

    ``bench``, ``breed`` and ``parse-smoke`` each store their handler under
    ``func``; the chosen sub-command name is stored under ``command`` and is
    required.
    """
    root = argparse.ArgumentParser(description=__doc__)
    commands = root.add_subparsers(dest="command", required=True)

    # bench: the full standard-vs-disciplined matrix.
    bench_cmd = commands.add_parser("bench", help="Run the benchmark matrix.")
    bench_cmd.add_argument("--engine", choices=("codex", "gemini", "both"), default="both")
    bench_cmd.add_argument("--codex-model", default="gpt-5.4-mini")
    bench_cmd.add_argument(
        "--gemini-model",
        nargs="+",
        choices=ALLOWED_GEMINI_MODELS,
        default=list(ALLOWED_GEMINI_MODELS),
        help="Gemini CLI model(s). Only Gemini 3.1 Pro and Gemini 3 Flash preview IDs are allowed.",
    )
    bench_cmd.add_argument("--effort", nargs="+", choices=EFFORTS, default=list(EFFORTS))
    bench_cmd.add_argument("--timeout", type=int, default=300)
    bench_cmd.add_argument("--out-dir", default="_results")
    bench_cmd.set_defaults(func=bench)

    # breed: prompt-framing comparison at a single effort level.
    breed_cmd = commands.add_parser("breed", help="Run prompt-breeding candidates.")
    breed_cmd.add_argument("--codex-model", default="gpt-5.4-mini")
    breed_cmd.add_argument("--effort", choices=EFFORTS, default="xhigh")
    breed_cmd.add_argument("--timeout", type=int, default=300)
    breed_cmd.add_argument("--out-dir", default="_results")
    breed_cmd.set_defaults(func=breed)

    # parse-smoke: parser self-check, takes no options.
    smoke_cmd = commands.add_parser("parse-smoke", help="Validate JSON usage parsers.")
    smoke_cmd.set_defaults(func=parse_smoke)

    return root
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behavior.

    Returns:
        The handler's integer result, or 2 when ``python3`` is not on PATH
        for a command that needs it.
    """
    # Parse first so that `--help` and usage errors work even on machines
    # where `python3` is not on PATH.
    args = build_parser().parse_args(argv)
    # parse-smoke only exercises the JSON parsers and launches nothing, so the
    # python3 requirement is enforced only for the commands that run agents.
    # NOTE(review): presumably python3 is required because the benchmark tasks
    # ask the agent to run Python — confirm.
    if args.command != "parse-smoke" and not shutil.which("python3"):
        print("python3 is required.", file=sys.stderr)
        return 2
    return int(args.func(args))
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment