ruvnet · May 5, 2026 04:14
diff --git a/README.md b/README.md
diff --git a/bench-result-latest.json b/bench-result-latest.json
 {
  "summary": {
    "runAt": "2026-05-05T04:13:48.562Z",
    "corpusVersion": 3,
    "corpusSize": 25,
    "tier1Cases": 18,
    "adversarialCases": 7,
    "winRate": 1,
    "winRatePct": "100.0%",
    "overallCorrect": 0.72,
    "escalationRate": 1,
    "escalationRatePct": "100.0%",
    "successCount": 18,
    "avgLatencyMs": 0.36,
    "p50LatencyMs": 0,
    "p99LatencyMs": 5,
    "maxLatencyMs": 5,
    "avgWallMs": 0.36,
    "avgConfidence": 0.552444064617157,
    "minConfidence": 0,
    "confidenceThreshold": 0.5,
    "aboveThresholdCount": 18,
    "structuralCostUsd": 0,
    "llmBaseline": {
      "model": "models/gemini-2.0-flash",
      "baseUrl": "https://generativelanguage.googleapis.com/v1beta/openai/",
      "total": 25,
      "passed": 21,
      "winRate": 0.84,
      "avgLatencyMs": 807.56,
      "totalTokensIn": 3084,
      "totalTokensOut": 952,
      "avgTokensIn": 123.36,
      "avgTokensOut": 38.08,
      "totalCostUsd": 0.0006892000000000001,
      "avgCostUsdPerEdit": 0.000027568,
      "pricing": {
        "input_per_1M": 0.1,
        "output_per_1M": 0.4
      }
    },
    "speedupVsLlm": 2243.222222222222,
    "costDeltaUsdPerEdit": 0.000027568,
    "anthropic": {
      "claude-sonnet-4-6": {
        "model": "claude-sonnet-4-6",
        "total": 25,
        "passed": 20,
        "winRate": 0.8,
        "avgLatencyMs": 1270.64,
        "totalTokensIn": 2896,
        "totalTokensOut": 975,
        "avgTokensIn": 115.84,
        "avgTokensOut": 39,
        "totalCostUsd": 0.023313,
        "avgCostUsdPerEdit": 0.00093252,
        "pricing": {
          "input": 3,
          "output": 15
        },
        "speedupVsBooster": 3529.555555555556,
        "costSavedPerEditUsd": 0.00093252
      },
      "claude-opus-4-7": {
        "model": "claude-opus-4-7",
        "total": 25,
        "passed": 23,
        "winRate": 0.92,
        "avgLatencyMs": 1563.72,
        "totalTokensIn": 4330,
        "totalTokensOut": 1115,
        "avgTokensIn": 173.2,
        "avgTokensOut": 44.6,
        "totalCostUsd": 0.148575,
        "avgCostUsdPerEdit": 0.005943,
        "pricing": {
          "input": 15,
          "output": 75
        },
        "speedupVsBooster": 4343.666666666667,
        "costSavedPerEditUsd": 0.005943
      }
    }
  },
  "results": [
    {
      "id": "var-to-const-1",
      "intent": "var-to-const",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 5,
      "wallMs": 5,
      "confidence": 0.6465277671813965,
      "strategy": "fuzzy_replace",
      "tokensIn": 6,
      "tokensOut": 7
    },
    {
      "id": "var-to-const-2",
      "intent": "var-to-const",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 1,
      "wallMs": 1,
      "confidence": 0.8513461351394653,
      "strategy": "fuzzy_replace",
      "tokensIn": 13,
      "tokensOut": 13
    },
    {
      "id": "add-types-1",
      "intent": "add-types",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.6412500143051147,
      "strategy": "fuzzy_replace",
      "tokensIn": 9,
      "tokensOut": 15
    },
    {
      "id": "add-types-2",
      "intent": "add-types",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.6055588126182556,
      "strategy": "fuzzy_replace",
      "tokensIn": 13,
      "tokensOut": 23
    },
    {
      "id": "remove-console-1",
      "intent": "remove-console",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.6954487562179565,
      "strategy": "fuzzy_replace",
      "tokensIn": 12,
      "tokensOut": 7
    },
    {
      "id": "remove-console-2",
      "intent": "remove-console",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 1,
      "wallMs": 1,
      "confidence": 0.5508301258087158,
      "strategy": "fuzzy_replace",
      "tokensIn": 17,
      "tokensOut": 7
    },
    {
      "id": "add-error-handling-1",
      "intent": "add-error-handling",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8500000238418579,
      "strategy": "exact_replace",
      "tokensIn": 10,
      "tokensOut": 19
    },
    {
      "id": "add-error-handling-2",
      "intent": "add-error-handling",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8500000238418579,
      "strategy": "exact_replace",
      "tokensIn": 11,
      "tokensOut": 20
    },
    {
      "id": "async-await-1",
      "intent": "async-await",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8500000238418579,
      "strategy": "exact_replace",
      "tokensIn": 14,
      "tokensOut": 17
    },
    {
      "id": "async-await-2",
      "intent": "async-await",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8500000238418579,
      "strategy": "exact_replace",
      "tokensIn": 12,
      "tokensOut": 17
    },
    {
      "id": "add-logging-1",
      "intent": "add-logging",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.7152736783027649,
      "strategy": "fuzzy_replace",
      "tokensIn": 8,
      "tokensOut": 13
    },
    {
      "id": "add-logging-2",
      "intent": "add-logging",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.6442474126815796,
      "strategy": "fuzzy_replace",
      "tokensIn": 8,
      "tokensOut": 14
    },
    {
      "id": "adversarial-extract-function",
      "intent": "extract-function",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "function process(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } if (total > 100)",
      "expectedPrefix": "function sum_active(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } return total;"
    },
    {
      "id": "adversarial-type-narrowing",
      "intent": "type-narrowing",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { return r.",
      "expectedPrefix": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { if (r.kin"
    },
    {
      "id": "adversarial-cross-method-rename",
      "intent": "rename-symbol",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 1,
      "wallMs": 1,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "class Cache { get(k: string) { return this.store[k]; } set(k: string, v: any) { this.store[k] = v; } private store: Reco",
      "expectedPrefix": "class Cache { lookup(k: string) { return this.store[k]; } set(k: string, v: any) { this.store[k] = v; } private store: R"
    },
    {
      "id": "adversarial-recursive-rewrite",
      "intent": "recursive-to-iterative",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "function factorial(n) { return n <= 1 ? 1 : n * factorial(n - 1); }",
      "expectedPrefix": "function factorial(n) { let result = 1; for (let i = 2; i <= n; i++) { result *= i; } return result; }"
    },
    {
      "id": "var-to-let-1",
      "intent": "var-to-let",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.9179807305335999,
      "strategy": "exact_replace",
      "tokensIn": 12,
      "tokensOut": 12
    },
    {
      "id": "double-quote-to-single",
      "intent": "string-quote-style",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.9124999642372131,
      "strategy": "exact_replace",
      "tokensIn": 12,
      "tokensOut": 12
    },
    {
      "id": "add-readonly-prop",
      "intent": "add-readonly",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8140649795532227,
      "strategy": "fuzzy_replace",
      "tokensIn": 8,
      "tokensOut": 11
    },
    {
      "id": "remove-debugger",
      "intent": "remove-debug",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.7668138742446899,
      "strategy": "fuzzy_replace",
      "tokensIn": 10,
      "tokensOut": 7
    },
    {
      "id": "import-add-named",
      "intent": "import-add",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8162500262260437,
      "strategy": "fuzzy_replace",
      "tokensIn": 9,
      "tokensOut": 10
    },
    {
      "id": "remove-trailing-semis",
      "intent": "format-tweak",
      "expectedTier1": true,
      "success": true,
      "correct": true,
      "escalatedCorrectly": false,
      "lowConfidence": false,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0.8330092430114746,
      "strategy": "fuzzy_replace",
      "tokensIn": 7,
      "tokensOut": 7
    },
    {
      "id": "adversarial-extract-class",
      "intent": "extract-class",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "class UserService { saveUser(u: any) { db.put(u); audit.log('save', u.id); } private db = realDb; private audit = { log:",
      "expectedPrefix": "class AuditService { log(k: string, v: any) { console.log(k, v); } } class UserService { saveUser(u: any) { db.put(u); t"
    },
    {
      "id": "adversarial-callback-to-promise",
      "intent": "callback-to-promise",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 1,
      "wallMs": 1,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "function loadConfig(cb) { fs.readFile('./c.json', (err, data) => { if (err) cb(err); else cb(null, JSON.parse(data)); })",
      "expectedPrefix": "async function loadConfig() { try { const data = await fs.promises.readFile('./c.json'); return JSON.parse(data); } catc"
    },
    {
      "id": "adversarial-deeply-nested-conditional",
      "intent": "early-return-flatten",
      "expectedTier1": false,
      "success": false,
      "correct": false,
      "escalatedCorrectly": true,
      "lowConfidence": true,
      "latencyMs": 0,
      "wallMs": 0,
      "confidence": 0,
      "strategy": "failed",
      "tokensIn": 0,
      "tokensOut": 0,
      "actualPrefix": "function process(x) { if (x) { if (x.valid) { if (x.size > 0) { return doWork(x); } else { return null; } } else { retur",
      "expectedPrefix": "function process(x) { if (!x) return null; if (!x.valid) return null; if (x.size <= 0) return null; return doWork(x); }"
    }
  ],
  "llmResults": [
    {
      "id": "var-to-const-1",
      "intent": "var-to-const",
      "correct": true,
      "wallMs": 751,
      "tokensIn": 105,
      "tokensOut": 17,
      "costUsd": 0.0000173
    },
    {
      "id": "var-to-const-2",
      "intent": "var-to-const",
      "correct": true,
      "wallMs": 1589,
      "tokensIn": 113,
      "tokensOut": 21,
      "costUsd": 0.000019699999999999998
    },
    {
      "id": "add-types-1",
      "intent": "add-types",
      "correct": true,
      "wallMs": 1036,
      "tokensIn": 114,
      "tokensOut": 24,
      "costUsd": 0.000021000000000000002
    },
    {
      "id": "add-types-2",
      "intent": "add-types",
      "correct": true,
      "wallMs": 611,
      "tokensIn": 130,
      "tokensOut": 35,
      "costUsd": 0.000027
    },
    {
      "id": "remove-console-1",
      "intent": "remove-console",
      "correct": true,
      "wallMs": 485,
      "tokensIn": 103,
      "tokensOut": 13,
      "costUsd": 0.0000155
    },
    {
      "id": "remove-console-2",
      "intent": "remove-console",
      "correct": true,
      "wallMs": 534,
      "tokensIn": 107,
      "tokensOut": 12,
      "costUsd": 0.0000155
    },
    {
      "id": "add-error-handling-1",
      "intent": "add-error-handling",
      "correct": true,
      "wallMs": 570,
      "tokensIn": 113,
      "tokensOut": 27,
      "costUsd": 0.000022100000000000002
    },
    {
      "id": "add-error-handling-2",
      "intent": "add-error-handling",
      "correct": true,
      "wallMs": 590,
      "tokensIn": 121,
      "tokensOut": 31,
      "costUsd": 0.000024500000000000003
    },
    {
      "id": "async-await-1",
      "intent": "async-await",
      "correct": true,
      "wallMs": 708,
      "tokensIn": 118,
      "tokensOut": 24,
      "costUsd": 0.000021400000000000002
    },
    {
      "id": "async-await-2",
      "intent": "async-await",
      "correct": true,
      "wallMs": 515,
      "tokensIn": 110,
      "tokensOut": 22,
      "costUsd": 0.0000198
    },
    {
      "id": "add-logging-1",
      "intent": "add-logging",
      "correct": true,
      "wallMs": 670,
      "tokensIn": 103,
      "tokensOut": 19,
      "costUsd": 0.0000179
    },
    {
      "id": "add-logging-2",
      "intent": "add-logging",
      "correct": true,
      "wallMs": 562,
      "tokensIn": 115,
      "tokensOut": 26,
      "costUsd": 0.000021900000000000004
    },
    {
      "id": "adversarial-extract-function",
      "intent": "extract-function",
      "correct": false,
      "wallMs": 1119,
      "tokensIn": 184,
      "tokensOut": 141,
      "costUsd": 0.0000748,
      "actualPrefix": "function sum_active(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } return total;",
      "expectedPrefix": "function sum_active(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } return total;"
    },
    {
      "id": "adversarial-type-narrowing",
      "intent": "type-narrowing",
      "correct": true,
      "wallMs": 749,
      "tokensIn": 150,
      "tokensOut": 58,
      "costUsd": 0.0000382
    },
    {
      "id": "adversarial-cross-method-rename",
      "intent": "rename-symbol",
      "correct": true,
      "wallMs": 1005,
      "tokensIn": 171,
      "tokensOut": 81,
      "costUsd": 0.000049500000000000004
    },
    {
      "id": "adversarial-recursive-rewrite",
      "intent": "recursive-to-iterative",
      "correct": true,
      "wallMs": 809,
      "tokensIn": 115,
      "tokensOut": 52,
      "costUsd": 0.0000323
    },
    {
      "id": "var-to-let-1",
      "intent": "var-to-let",
      "correct": true,
      "wallMs": 790,
      "tokensIn": 135,
      "tokensOut": 32,
      "costUsd": 0.000026300000000000002
    },
    {
      "id": "double-quote-to-single",
      "intent": "string-quote-style",
      "correct": true,
      "wallMs": 476,
      "tokensIn": 109,
      "tokensOut": 19,
      "costUsd": 0.000018500000000000002
    },
    {
      "id": "add-readonly-prop",
      "intent": "add-readonly",
      "correct": true,
      "wallMs": 1162,
      "tokensIn": 104,
      "tokensOut": 17,
      "costUsd": 0.0000172
    },
    {
      "id": "remove-debugger",
      "intent": "remove-debug",
      "correct": true,
      "wallMs": 721,
      "tokensIn": 99,
      "tokensOut": 13,
      "costUsd": 0.000015100000000000001
    },
    {
      "id": "import-add-named",
      "intent": "import-add",
      "correct": true,
      "wallMs": 1400,
      "tokensIn": 103,
      "tokensOut": 17,
      "costUsd": 0.0000171
    },
    {
      "id": "remove-trailing-semis",
      "intent": "format-tweak",
      "correct": true,
      "wallMs": 627,
      "tokensIn": 105,
      "tokensOut": 17,
      "costUsd": 0.0000173
    },
    {
      "id": "adversarial-extract-class",
      "intent": "extract-class",
      "correct": false,
      "wallMs": 1132,
      "tokensIn": 160,
      "tokensOut": 97,
      "costUsd": 0.000054800000000000004,
      "actualPrefix": "class AuditService { log(k: string, v: any) { console.log(k, v); } } class UserService { private db = realDb; private au",
      "expectedPrefix": "class AuditService { log(k: string, v: any) { console.log(k, v); } } class UserService { saveUser(u: any) { db.put(u); t"
    },
    {
      "id": "adversarial-callback-to-promise",
      "intent": "callback-to-promise",
      "correct": false,
      "wallMs": 804,
      "tokensIn": 145,
      "tokensOut": 64,
      "costUsd": 0.0000401,
      "actualPrefix": "async function loadConfig(): Promise<Config> { try { const data = await fs.promises.readFile('./c.json'); return JSON.pa",
      "expectedPrefix": "async function loadConfig() { try { const data = await fs.promises.readFile('./c.json'); return JSON.parse(data); } catc"
    },
    {
      "id": "adversarial-deeply-nested-conditional",
      "intent": "early-return-flatten",
      "correct": false,
      "wallMs": 774,
      "tokensIn": 152,
      "tokensOut": 73,
      "costUsd": 0.0000444,
      "actualPrefix": "function process(x) { if (!x) { return null; } if (!x.valid) { return null; } if (x.size <= 0) { return null; } return d",
      "expectedPrefix": "function process(x) { if (!x) return null; if (!x.valid) return null; if (x.size <= 0) return null; return doWork(x); }"
    }
  ],
  "anthropicResults": {
    "claude-sonnet-4-6": [
      {
        "id": "var-to-const-1",
        "intent": "var-to-const",
        "error": "fetch failed",
        "correct": false,
        "wallMs": 257
      },
      {
        "id": "var-to-const-2",
        "intent": "var-to-const",
        "error": "fetch failed",
        "correct": false,
        "wallMs": 258
      },
      {
        "id": "add-types-1",
        "intent": "add-types",
        "correct": true,
        "wallMs": 1067,
        "tokensIn": 114,
        "tokensOut": 27,
        "costUsd": 0.000747
      },
      {
        "id": "add-types-2",
        "intent": "add-types",
        "correct": true,
        "wallMs": 1689,
        "tokensIn": 130,
        "tokensOut": 38,
        "costUsd": 0.0009599999999999999
      },
      {
        "id": "remove-console-1",
        "intent": "remove-console",
        "correct": true,
        "wallMs": 905,
        "tokensIn": 103,
        "tokensOut": 16,
        "costUsd": 0.000549
      },
      {
        "id": "remove-console-2",
        "intent": "remove-console",
        "correct": true,
        "wallMs": 1154,
        "tokensIn": 107,
        "tokensOut": 15,
        "costUsd": 0.000546
      },
      {
        "id": "add-error-handling-1",
        "intent": "add-error-handling",
        "correct": true,
        "wallMs": 2995,
        "tokensIn": 113,
        "tokensOut": 30,
        "costUsd": 0.000789
      },
      {
        "id": "add-error-handling-2",
        "intent": "add-error-handling",
        "correct": true,
        "wallMs": 821,
        "tokensIn": 121,
        "tokensOut": 34,
        "costUsd": 0.0008730000000000001
      },
      {
        "id": "async-await-1",
        "intent": "async-await",
        "correct": true,
        "wallMs": 896,
        "tokensIn": 118,
        "tokensOut": 27,
        "costUsd": 0.0007589999999999999
      },
      {
        "id": "async-await-2",
        "intent": "async-await",
        "correct": true,
        "wallMs": 1174,
        "tokensIn": 110,
        "tokensOut": 25,
        "costUsd": 0.000705
      },
      {
        "id": "add-logging-1",
        "intent": "add-logging",
        "correct": true,
        "wallMs": 869,
        "tokensIn": 103,
        "tokensOut": 22,
        "costUsd": 0.000639
      },
      {
        "id": "add-logging-2",
        "intent": "add-logging",
        "correct": true,
        "wallMs": 2055,
        "tokensIn": 115,
        "tokensOut": 29,
        "costUsd": 0.0007800000000000001
      },
      {
        "id": "adversarial-extract-function",
        "intent": "extract-function",
        "correct": true,
        "wallMs": 1832,
        "tokensIn": 184,
        "tokensOut": 142,
        "costUsd": 0.002682
      },
      {
        "id": "adversarial-type-narrowing",
        "intent": "type-narrowing",
        "correct": false,
        "wallMs": 1469,
        "tokensIn": 156,
        "tokensOut": 61,
        "costUsd": 0.0013830000000000001,
        "actualPrefix": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { return r.",
        "expectedPrefix": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { if (r.kin"
      },
      {
        "id": "adversarial-cross-method-rename",
        "intent": "rename-symbol",
        "correct": true,
        "wallMs": 1564,
        "tokensIn": 173,
        "tokensOut": 85,
        "costUsd": 0.001794
      },
      {
        "id": "adversarial-recursive-rewrite",
        "intent": "recursive-to-iterative",
        "correct": true,
        "wallMs": 1240,
        "tokensIn": 121,
        "tokensOut": 55,
        "costUsd": 0.001188
      },
      {
        "id": "var-to-let-1",
        "intent": "var-to-let",
        "correct": true,
        "wallMs": 860,
        "tokensIn": 133,
        "tokensOut": 34,
        "costUsd": 0.0009090000000000001
      },
      {
        "id": "double-quote-to-single",
        "intent": "string-quote-style",
        "correct": true,
        "wallMs": 1221,
        "tokensIn": 112,
        "tokensOut": 24,
        "costUsd": 0.000696
      },
      {
        "id": "add-readonly-prop",
        "intent": "add-readonly",
        "correct": true,
        "wallMs": 982,
        "tokensIn": 104,
        "tokensOut": 20,
        "costUsd": 0.000612
      },
      {
        "id": "remove-debugger",
        "intent": "remove-debug",
        "correct": true,
        "wallMs": 1131,
        "tokensIn": 100,
        "tokensOut": 16,
        "costUsd": 0.00054
      },
      {
        "id": "import-add-named",
        "intent": "import-add",
        "correct": true,
        "wallMs": 1139,
        "tokensIn": 103,
        "tokensOut": 20,
        "costUsd": 0.000609
      },
      {
        "id": "remove-trailing-semis",
        "intent": "format-tweak",
        "correct": true,
        "wallMs": 1158,
        "tokensIn": 106,
        "tokensOut": 20,
        "costUsd": 0.0006180000000000001
      },
      {
        "id": "adversarial-extract-class",
        "intent": "extract-class",
        "correct": false,
        "wallMs": 2399,
        "tokensIn": 165,
        "tokensOut": 110,
        "costUsd": 0.002145,
        "actualPrefix": "class AuditService { log(action: string, id: any): void { console.log(action, id); } } class UserService { private db = ",
        "expectedPrefix": "class AuditService { log(k: string, v: any) { console.log(k, v); } } class UserService { saveUser(u: any) { db.put(u); t"
      },
      {
        "id": "adversarial-callback-to-promise",
        "intent": "callback-to-promise",
        "correct": false,
        "wallMs": 1623,
        "tokensIn": 149,
        "tokensOut": 67,
        "costUsd": 0.001452,
        "actualPrefix": "async function loadConfig() { try { const data = await fs.promises.readFile('./c.json', 'utf8'); return JSON.parse(data)",
        "expectedPrefix": "async function loadConfig() { try { const data = await fs.promises.readFile('./c.json'); return JSON.parse(data); } catc"
      },
      {
        "id": "adversarial-deeply-nested-conditional",
        "intent": "early-return-flatten",
        "correct": true,
        "wallMs": 1523,
        "tokensIn": 156,
        "tokensOut": 58,
        "costUsd": 0.001338
      }
    ],
    "claude-opus-4-7": [
      {
        "id": "var-to-const-1",
        "intent": "var-to-const",
        "correct": true,
        "wallMs": 1347,
        "tokensIn": 147,
        "tokensOut": 24,
        "costUsd": 0.004005
      },
      {
        "id": "var-to-const-2",
        "intent": "var-to-const",
        "correct": true,
        "wallMs": 1252,
        "tokensIn": 159,
        "tokensOut": 30,
        "costUsd": 0.004635
      },
      {
        "id": "add-types-1",
        "intent": "add-types",
        "correct": true,
        "wallMs": 2566,
        "tokensIn": 156,
        "tokensOut": 31,
        "costUsd": 0.004665000000000001
      },
      {
        "id": "add-types-2",
        "intent": "add-types",
        "correct": true,
        "wallMs": 1278,
        "tokensIn": 176,
        "tokensOut": 44,
        "costUsd": 0.00594
      },
      {
        "id": "remove-console-1",
        "intent": "remove-console",
        "correct": true,
        "wallMs": 1042,
        "tokensIn": 150,
        "tokensOut": 22,
        "costUsd": 0.0039
      },
      {
        "id": "remove-console-2",
        "intent": "remove-console",
        "correct": true,
        "wallMs": 1406,
        "tokensIn": 153,
        "tokensOut": 20,
        "costUsd": 0.003795
      },
      {
        "id": "add-error-handling-1",
        "intent": "add-error-handling",
        "correct": true,
        "wallMs": 1158,
        "tokensIn": 162,
        "tokensOut": 38,
        "costUsd": 0.00528
      },
      {
        "id": "add-error-handling-2",
        "intent": "add-error-handling",
        "correct": true,
        "wallMs": 2162,
        "tokensIn": 168,
        "tokensOut": 41,
        "costUsd": 0.005594999999999999
      },
      {
        "id": "async-await-1",
        "intent": "async-await",
        "correct": true,
        "wallMs": 1300,
        "tokensIn": 165,
        "tokensOut": 34,
        "costUsd": 0.005025
      },
      {
        "id": "async-await-2",
        "intent": "async-await",
        "correct": true,
        "wallMs": 2641,
        "tokensIn": 155,
        "tokensOut": 31,
        "costUsd": 0.00465
      },
      {
        "id": "add-logging-1",
        "intent": "add-logging",
        "correct": true,
        "wallMs": 1161,
        "tokensIn": 148,
        "tokensOut": 28,
        "costUsd": 0.004319999999999999
      },
      {
        "id": "add-logging-2",
        "intent": "add-logging",
        "correct": true,
        "wallMs": 1216,
        "tokensIn": 157,
        "tokensOut": 33,
        "costUsd": 0.00483
      },
      {
        "id": "adversarial-extract-function",
        "intent": "extract-function",
        "correct": true,
        "wallMs": 2760,
        "tokensIn": 239,
        "tokensOut": 121,
        "costUsd": 0.01266
      },
      {
        "id": "adversarial-type-narrowing",
        "intent": "type-narrowing",
        "correct": false,
        "wallMs": 1676,
        "tokensIn": 216,
        "tokensOut": 76,
        "costUsd": 0.00894,
        "actualPrefix": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { return r.",
        "expectedPrefix": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { if (r.kin"
      },
      {
        "id": "adversarial-cross-method-rename",
        "intent": "rename-symbol",
        "correct": true,
        "wallMs": 2152,
        "tokensIn": 232,
        "tokensOut": 102,
        "costUsd": 0.01113
      },
      {
        "id": "adversarial-recursive-rewrite",
        "intent": "recursive-to-iterative",
        "correct": false,
        "wallMs": 1247,
        "tokensIn": 174,
        "tokensOut": 51,
        "costUsd": 0.006435,
        "actualPrefix": "function factorial(n) { let result = 1; for (let i = 2; i <= n; i++) result *= i; return result; }",
        "expectedPrefix": "function factorial(n) { let result = 1; for (let i = 2; i <= n; i++) { result *= i; } return result; }"
      },
      {
        "id": "var-to-let-1",
        "intent": "var-to-let",
        "correct": true,
        "wallMs": 1408,
        "tokensIn": 179,
        "tokensOut": 40,
        "costUsd": 0.005685
      },
      {
        "id": "double-quote-to-single",
        "intent": "string-quote-style",
        "correct": true,
        "wallMs": 1101,
        "tokensIn": 157,
        "tokensOut": 30,
        "costUsd": 0.004605
      },
      {
        "id": "add-readonly-prop",
        "intent": "add-readonly",
        "correct": true,
        "wallMs": 1118,
        "tokensIn": 150,
        "tokensOut": 27,
        "costUsd": 0.004274999999999999
      },
      {
        "id": "remove-debugger",
        "intent": "remove-debug",
        "correct": true,
        "wallMs": 1076,
        "tokensIn": 147,
        "tokensOut": 22,
        "costUsd": 0.003855
      },
      {
        "id": "import-add-named",
        "intent": "import-add",
        "correct": true,
        "wallMs": 1535,
        "tokensIn": 152,
        "tokensOut": 28,
        "costUsd": 0.00438
      },
      {
        "id": "remove-trailing-semis",
        "intent": "format-tweak",
        "correct": true,
        "wallMs": 1316,
        "tokensIn": 149,
        "tokensOut": 24,
        "costUsd": 0.004035
      },
      {
        "id": "adversarial-extract-class",
        "intent": "extract-class",
        "correct": true,
        "wallMs": 1653,
        "tokensIn": 220,
        "tokensOut": 94,
        "costUsd": 0.01035
      },
      {
        "id": "adversarial-callback-to-promise",
        "intent": "callback-to-promise",
        "correct": true,
        "wallMs": 1441,
        "tokensIn": 213,
        "tokensOut": 66,
        "costUsd": 0.008145
      },
      {
        "id": "adversarial-deeply-nested-conditional",
        "intent": "early-return-flatten",
        "correct": true,
        "wallMs": 2081,
        "tokensIn": 206,
        "tokensOut": 58,
        "costUsd": 0.0074399999999999996
      }
    ]
  }
 }
diff --git a/bench.mjs b/bench.mjs
 #!/usr/bin/env node
 // Verified-corpus benchmark for ruflo-cost-tracker.
 // Runs every case in bench/booster-corpus.json through agent-booster.apply()
 // and records: correctness vs. golden expected, latency, confidence, strategy.
 // Output: JSON to docs/benchmarks/runs/<ISO>.json + markdown summary to stdout.
 //
 // Resolution: must be run from a directory where `agent-booster` resolves
 // (typically anywhere under `v3/`). Run via:
 //
 //   ( cd v3 && node ../plugins/ruflo-cost-tracker/scripts/bench.mjs )
 //
 // Optional env:
 //   BENCH_LLM_BASELINE=1   -- run the same corpus through Gemini 2.0 Flash (or another OpenAI-compat model)
 //   BENCH_LLM_MODEL=...    -- override (default: models/gemini-2.0-flash)
 //   BENCH_LLM_BASE_URL=... -- override (default: https://generativelanguage.googleapis.com/v1beta/openai/, Google's OpenAI-compatible Gemini endpoint)
 //   BENCH_LLM_API_KEY=...  -- override (default: from gcloud secret GOOGLE_AI_API_KEY)
 //   BENCH_LLM_PRICE_IN/OUT -- $/1M tokens override (default Gemini 2.0 Flash: 0.10 / 0.40)
 //
 //   BENCH_ANTHROPIC=1      -- run the same corpus through Anthropic claude models
 //   BENCH_ANTHROPIC_MODELS=claude-sonnet-4-6,claude-opus-4-7   -- comma-separated list
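 //   BENCH_ANTHROPIC_API_KEY=... -- override (default: from gcloud secret ANTHROPIC_API_KEY)
 //   BENCH_ANTHROPIC_PRICING=... -- JSON object { "<model>": { "input": <$/1M>, "output": <$/1M> } } overriding built-in pricing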
 //
 //   BENCH_OUT=<path>       -- override output JSON path
 //   BENCH_QUIET=1          -- suppress markdown summary
 //
 // Pricing built-in (per 1M tokens, USD): Sonnet 4.6 = 3/15, Opus 4.7 = 15/75, Haiku 4.5 = 1/5.

 import { readFileSync, writeFileSync, mkdirSync } from 'node:fs';
 import { fileURLToPath, pathToFileURL } from 'node:url';
 import { createRequire } from 'node:module';
 import { dirname, join, resolve } from 'node:path';

 const HERE = dirname(fileURLToPath(import.meta.url));
 const PLUGIN_ROOT = resolve(HERE, '..');
 const CORPUS = join(PLUGIN_ROOT, 'bench', 'booster-corpus.json');
 const RUNS_DIR = join(PLUGIN_ROOT, 'docs', 'benchmarks', 'runs');

 // Canonicalise text for comparison: coerce to a string (null/undefined become ''),
 // strip both ends, and squash every whitespace run down to a single space.
 const norm = (s) => {
  const text = String(s ?? '');
  return text.trim().replace(/\s+/g, ' ');
 };
 // Render a ratio (e.g. 0.72) as a percentage string with one decimal place.
 const pct = (n) => (n * 100).toFixed(1) + '%';
 // Render a millisecond count with two decimals and an "ms" suffix.
 const ms = (n) => n.toFixed(2) + 'ms';
 // Percentile by index: sorts a copy ascending and returns the element at
 // floor(p/100 * length), clamped to the last index. Empty input yields 0.
 // The input array itself is never reordered.
 const pcent = (arr, p) => {
  const count = arr.length;
  if (!count) return 0;
  const ascending = Array.from(arr);
  ascending.sort((x, y) => x - y);
  const rank = Math.floor((p / 100) * count);
  return ascending[Math.min(rank, count - 1)];
 };

 /**
  * Benchmark entry point.
  *
  * Loads the corpus, resolves `agent-booster` relative to process.cwd(),
  * applies every case through `booster.apply()`, optionally runs the LLM and
  * Anthropic baselines (BENCH_LLM_BASELINE=1 / BENCH_ANTHROPIC=1), builds the
  * summary, writes the run JSON plus `latest.json`, and prints a markdown
  * report unless BENCH_QUIET=1.
  *
  * Exits the process with code 2 when `agent-booster` cannot be resolved or
  * imported. Other failures propagate as a rejected promise.
  */
 async function main() {
  // Confidence below this value marks a result as low-confidence; the same
  // value is reported in the summary as `confidenceThreshold`.
  const CONFIDENCE_THRESHOLD = 0.5;

  const corpus = JSON.parse(readFileSync(CORPUS, 'utf-8'));
  // Resolve agent-booster from process.cwd() (so users can `cd v3 && node ../...`)
  // rather than from the script's directory (which is outside v3/node_modules).
  let AgentBoosterMod;
  try {
    const requireFromCwd = createRequire(join(process.cwd(), 'package.json'));
    const resolvedPath = requireFromCwd.resolve('agent-booster');
    AgentBoosterMod = await import(pathToFileURL(resolvedPath).href);
  } catch (err) {
    console.error(`agent-booster import failed: ${err.message}`);
    console.error(`cwd was: ${process.cwd()}`);
    console.error('Run from a directory where the package resolves, e.g.:');
    console.error('  ( cd v3 && node ../plugins/ruflo-cost-tracker/scripts/bench.mjs )');
    process.exit(2);
  }
  const { AgentBooster } = AgentBoosterMod;
  const booster = new AgentBooster();

  const results = [];
  for (const c of corpus.cases) {
    const expectedTier1 = c.expectedTier1 !== false; // default true (corpus v1 compat)
    const t0 = Date.now();
    let out;
    try {
      out = await booster.apply({ code: c.code, edit: c.edit, language: c.language });
    } catch (e) {
      // Tolerate non-Error throwables (no `.message`) when recording the failure.
      results.push({ id: c.id, intent: c.intent, expectedTier1, error: String(e?.message ?? e).slice(0, 200), correct: false });
      continue;
    }
    const wallMs = Date.now() - t0;
    const correct = norm(out.output) === norm(c.expected);
    const lowConfidence = (out.confidence ?? 0) < CONFIDENCE_THRESHOLD;
    // For Tier 1 cases: "good" = correct.
    // For non-Tier 1 cases: "good" = booster correctly *escalates* (incorrect output OR low confidence).
    const escalatedCorrectly = !expectedTier1 && (!correct || lowConfidence);
    results.push({
      id: c.id,
      intent: c.intent,
      expectedTier1,
      success: !!out.success,
      correct,
      escalatedCorrectly,
      lowConfidence,
      latencyMs: out.latency ?? null,
      wallMs,
      confidence: out.confidence ?? null,
      strategy: out.strategy ?? null,
      tokensIn: out.tokens?.input ?? null,
      tokensOut: out.tokens?.output ?? null,
      ...(correct ? {} : { actualPrefix: norm(out.output).slice(0, 120), expectedPrefix: norm(c.expected).slice(0, 120) }),
    });
  }

  const tier1Cases = results.filter((r) => r.expectedTier1);
  const advCases = results.filter((r) => !r.expectedTier1);
  const tier1Passed = tier1Cases.filter((r) => r.correct).length;
  const tier1Total = tier1Cases.length;
  const advEscalated = advCases.filter((r) => r.escalatedCorrectly).length;
  const advTotal = advCases.length;

  const passed = results.filter((r) => r.correct).length;
  const total = results.length;
  const winRate = tier1Total ? tier1Passed / tier1Total : (total ? passed / total : 0);
  const escalationRate = advTotal ? advEscalated / advTotal : null;
  const successCount = results.filter((r) => r.success).length;

  const latencies = results.map((r) => r.latencyMs).filter((x) => typeof x === 'number');
  const wallTimes = results.map((r) => r.wallMs).filter((x) => typeof x === 'number');
  const confidences = results.map((r) => r.confidence).filter((x) => typeof x === 'number');

  const avg = (arr) => (arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0);

  // Optional: run the same corpus through one or more LLMs and record the comparison.
  let llmSummary = null;
  let llmResults = null;
  if (process.env.BENCH_LLM_BASELINE === '1') {
    ({ llmSummary, llmResults } = await runLlmBaseline(corpus.cases));
  }
  // A baseline that could not run (e.g. no API key) comes back as an
  // `{ error }` stub with no numeric fields. It is still recorded in the
  // summary, but must not feed the ratios or the report, where it would
  // produce NaN and make `.toFixed()` throw.
  const llmOk = !!llmSummary && !llmSummary.error;

  let anthropicSummaries = null; // Map<model, summary>
  let anthropicResults = null;   // Map<model, per-case-results>
  if (process.env.BENCH_ANTHROPIC === '1') {
    ({ anthropicSummaries, anthropicResults } = await runAnthropicBaseline(corpus.cases));
  }
  // Keep only real per-model summaries; a failed run returns `{ _error: '<reason>' }`,
  // whose string value must not be treated as a model summary.
  const anthropicModels = anthropicSummaries
    ? Object.entries(anthropicSummaries).filter(([, s]) => s !== null && typeof s === 'object')
    : [];
  const anthropicError = typeof anthropicSummaries?._error === 'string' ? anthropicSummaries._error : null;

  const summary = {
    runAt: new Date().toISOString(),
    corpusVersion: corpus.version,
    corpusSize: total,
    tier1Cases: tier1Total,
    adversarialCases: advTotal,
    winRate, // win rate over Tier 1 cases (the gate metric)
    winRatePct: pct(winRate),
    overallCorrect: total ? passed / total : 0, // diagnostic only
    escalationRate, // null if no adversarial cases; otherwise (correctly-escalated / adversarial)
    escalationRatePct: escalationRate == null ? 'n/a' : pct(escalationRate),
    successCount,
    avgLatencyMs: avg(latencies),
    p50LatencyMs: pcent(latencies, 50),
    p99LatencyMs: pcent(latencies, 99),
    maxLatencyMs: latencies.length ? Math.max(...latencies) : 0,
    avgWallMs: avg(wallTimes),
    avgConfidence: avg(confidences),
    minConfidence: confidences.length ? Math.min(...confidences) : 0,
    confidenceThreshold: CONFIDENCE_THRESHOLD,
    aboveThresholdCount: confidences.filter((c) => c >= CONFIDENCE_THRESHOLD).length,
    structuralCostUsd: 0, // no LLM call → no billing
    llmBaseline: llmSummary || (process.env.BENCH_LLM_BASELINE === '1' ? 'enabled-but-failed' : 'skipped'),
  };

  // Denominator for every speedup ratio: the booster's average self-reported
  // latency, floored at 0.001 ms so a 0 ms average cannot divide by zero.
  const boosterAvgMs = Math.max(summary.avgLatencyMs, 0.001);

  if (llmOk) {
    // Ratio of the LLM's average wall time to the booster's average reported latency.
    summary.speedupVsLlm = llmSummary.avgLatencyMs / boosterAvgMs;
    summary.costDeltaUsdPerEdit = llmSummary.avgCostUsdPerEdit; // booster side is $0
  }
  if (anthropicSummaries) {
    summary.anthropic = {};
    if (anthropicError) summary.anthropic._error = anthropicError;
    for (const [model, s] of anthropicModels) {
      summary.anthropic[model] = {
        ...s,
        speedupVsBooster: s.avgLatencyMs / boosterAvgMs,
        costSavedPerEditUsd: s.avgCostUsdPerEdit, // booster side is $0
      };
    }
  }

  const outDir = RUNS_DIR;
  mkdirSync(outDir, { recursive: true });
  const stamp = summary.runAt.replace(/[:.]/g, '-');
  const outPath = process.env.BENCH_OUT || join(outDir, `${stamp}.json`);
  // BENCH_OUT may point outside RUNS_DIR; make sure its directory exists too.
  mkdirSync(dirname(outPath), { recursive: true });
  const latestPath = join(outDir, 'latest.json');
  const payload = {
    summary,
    results,
    ...(llmResults ? { llmResults } : {}),
    ...(anthropicResults ? { anthropicResults } : {}),
  };
  const serialized = JSON.stringify(payload, null, 2);
  writeFileSync(outPath, serialized);
  writeFileSync(latestPath, serialized);

  if (process.env.BENCH_QUIET !== '1') {
    console.log(`# Booster benchmark — ${summary.runAt}`);
    console.log('');
    console.log(`| Metric | Value |`);
    console.log(`|---|---:|`);
    console.log(`| Corpus size | ${total} (Tier 1: ${tier1Total}, adversarial: ${advTotal}) |`);
    console.log(`| **Win rate (Tier 1 cases)** | **${pct(winRate)}** (${tier1Passed}/${tier1Total}) |`);
    if (advTotal) {
      console.log(`| Escalation rate (adversarial) | ${pct(escalationRate)} (${advEscalated}/${advTotal}) |`);
    }
    console.log(`| Overall correct (diagnostic) | ${pct(total ? passed/total : 0)} (${passed}/${total}) |`);
    console.log(`| Success flag | ${successCount}/${total} |`);
    console.log(`| Avg latency | ${ms(summary.avgLatencyMs)} |`);
    console.log(`| p50 latency | ${ms(summary.p50LatencyMs)} |`);
    console.log(`| p99 latency | ${ms(summary.p99LatencyMs)} |`);
    console.log(`| Max latency | ${ms(summary.maxLatencyMs)} |`);
    console.log(`| Avg confidence | ${summary.avgConfidence.toFixed(3)} |`);
    console.log(`| Min confidence | ${summary.minConfidence.toFixed(3)} |`);
    console.log(`| Above ${CONFIDENCE_THRESHOLD} threshold | ${summary.aboveThresholdCount}/${confidences.length} |`);
    console.log(`| Structural cost | $${summary.structuralCostUsd} (no LLM call) |`);
    if (llmOk) {
      console.log(`| LLM baseline model | ${llmSummary.model} |`);
      console.log(`| LLM avg latency | ${ms(llmSummary.avgLatencyMs)} |`);
      console.log(`| LLM win rate | ${pct(llmSummary.winRate)} (${llmSummary.passed}/${llmSummary.total}) |`);
      console.log(`| LLM avg cost/edit | $${llmSummary.avgCostUsdPerEdit.toFixed(6)} |`);
      console.log(`| **Measured speedup (booster vs LLM)** | **${summary.speedupVsLlm.toFixed(1)}×** |`);
      console.log(`| **Cost saved per edit** | **$${summary.costDeltaUsdPerEdit.toFixed(6)}** (100%) |`);
    } else if (llmSummary) {
      // Error stub: print the reason instead of "[object Object]".
      console.log(`| LLM baseline | failed (${llmSummary.error}) |`);
    } else {
      console.log(`| LLM baseline | ${summary.llmBaseline} |`);
    }
    console.log(``);
    if (anthropicModels.length) {
      console.log(`## Anthropic baseline\n`);
      console.log(`| Model | Avg latency | Win rate | Avg tokens (in/out) | Avg cost/edit | Speedup vs booster | Cost saved/edit |`);
      console.log(`|---|---:|---:|---:|---:|---:|---:|`);
      for (const [model, s] of anthropicModels) {
        const speedup = summary.anthropic[model].speedupVsBooster.toFixed(1);
        console.log(`| \`${model}\` | ${ms(s.avgLatencyMs)} | ${pct(s.winRate)} (${s.passed}/${s.total}) | ${s.avgTokensIn.toFixed(0)} / ${s.avgTokensOut.toFixed(0)} | $${s.avgCostUsdPerEdit.toFixed(6)} | **${speedup}×** | **$${s.avgCostUsdPerEdit.toFixed(6)}** |`);
      }
      console.log(``);
    } else if (anthropicError) {
      console.log(`## Anthropic baseline\n`);
      console.log(`Not run: ${anthropicError}`);
      console.log(``);
    }
    const failed = results.filter((r) => !r.correct);
    if (failed.length) {
      console.log(`## Failures (${failed.length})`);
      for (const f of failed) {
        console.log(`- \`${f.id}\` (${f.intent}): ${f.error || `actual="${f.actualPrefix}" vs expected="${f.expectedPrefix}"`}`);
      }
    }
    console.log(``);
    console.log(`Saved: ${outPath}`);
    console.log(`Latest pointer: ${latestPath}`);
  }
 }

 // ─── LLM baseline ────────────────────────────────────────────────────────────

 /**
  * Runs every corpus case through an OpenAI-compatible chat-completions
  * endpoint and scores the reply against the case's golden `expected` text.
  *
  * Configuration comes from BENCH_LLM_BASE_URL, BENCH_LLM_MODEL,
  * BENCH_LLM_PRICE_IN / BENCH_LLM_PRICE_OUT ($ per 1M tokens) and
  * BENCH_LLM_API_KEY; without the latter the key is read via
  * `gcloud secrets versions access` for GOOGLE_AI_API_KEY.
  *
  * @param {Array<object>} cases corpus cases; `id`, `intent`, `language`,
  *   `code`, `edit` and `expected` are read from each.
  * @returns {Promise<{llmSummary: object, llmResults: Array<object>}>}
  *   With no API key, `llmSummary` is `{ error: 'no-api-key', baseUrl, model }`
  *   and `llmResults` is empty. Otherwise `llmSummary` holds totals and
  *   averages, and `llmResults` has one entry per case (entries for failed
  *   requests carry an `error` string).
  */
 async function runLlmBaseline(cases) {
  // Parse a $/1M price from the environment, falling back when the value is
  // missing or not a number, so a typo cannot turn every cost into NaN.
  const priceFromEnv = (raw, fallback) => {
    const parsed = parseFloat(raw || '');
    return Number.isFinite(parsed) ? parsed : fallback;
  };

  const baseUrl = process.env.BENCH_LLM_BASE_URL || 'https://generativelanguage.googleapis.com/v1beta/openai/';
  const model = process.env.BENCH_LLM_MODEL || 'models/gemini-2.0-flash';
  const priceIn = priceFromEnv(process.env.BENCH_LLM_PRICE_IN, 0.10); // $/1M input
  const priceOut = priceFromEnv(process.env.BENCH_LLM_PRICE_OUT, 0.40); // $/1M output
  let apiKey = process.env.BENCH_LLM_API_KEY;
  if (!apiKey) {
    // Try to pull from gcloud (deployed ruvocal uses GOOGLE_AI_API_KEY)
    try {
      const { execSync } = await import('node:child_process');
      apiKey = execSync('gcloud secrets versions access latest --secret=GOOGLE_AI_API_KEY 2>/dev/null', { encoding: 'utf-8' }).trim();
    } catch { /* best effort: fall through to the no-api-key result */ }
  }
  if (!apiKey) {
    return { llmSummary: { error: 'no-api-key', baseUrl, model }, llmResults: [] };
  }

  const sys = `You apply code edits deterministically. Return ONLY the resulting code as a single fenced \`\`\`<lang> code block. No explanation, no commentary, no extra blocks. The output of the code block is the final source.`;
  const user = (c) => `Apply this edit. Return only the resulting code.\n\nLanguage: ${c.language}\n\nOriginal code:\n\`\`\`${c.language}\n${c.code}\n\`\`\`\n\nEdit instruction (target snippet):\n\`\`\`${c.language}\n${c.edit}\n\`\`\``;

  // First fenced block in the reply, with an optional language tag stripped.
  const fenceRe = /```(?:[a-zA-Z]+\n)?([\s\S]*?)```/;
  const endpoint = `${baseUrl.replace(/\/$/, '')}/chat/completions`;
  const out = [];
  let totIn = 0, totOut = 0, totLatencyMs = 0, passed = 0;
  let completed = 0; // requests that returned a usable response
  for (const c of cases) {
    const t0 = Date.now();
    let body = null;
    try {
      const resp = await fetch(endpoint, {
        method: 'POST',
        headers: { 'Authorization': `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
        body: JSON.stringify({
          model,
          messages: [{ role: 'system', content: sys }, { role: 'user', content: user(c) }],
          max_tokens: 1024,
          temperature: 0,
        }),
      });
      if (!resp.ok) {
        // Auth, quota and server errors are request failures, not wrong
        // answers: record them as errors instead of scoring an empty reply.
        const detail = await resp.text().catch(() => '');
        throw new Error(`HTTP ${resp.status} ${resp.statusText}: ${detail}`);
      }
      body = await resp.json();
    } catch (e) {
      out.push({ id: c.id, intent: c.intent, error: String(e?.message ?? e).slice(0, 200), correct: false, wallMs: Date.now() - t0 });
      continue;
    }
    const wallMs = Date.now() - t0;
    completed++;
    const content = body?.choices?.[0]?.message?.content ?? '';
    const m = fenceRe.exec(content);
    const extracted = (m ? m[1] : content).trim();
    const correct = norm(extracted) === norm(c.expected);
    if (correct) passed++;
    const inT = body?.usage?.prompt_tokens ?? 0;
    const outT = body?.usage?.completion_tokens ?? 0;
    totIn += inT; totOut += outT; totLatencyMs += wallMs;
    const cost = inT / 1e6 * priceIn + outT / 1e6 * priceOut;
    out.push({
      id: c.id, intent: c.intent, correct, wallMs,
      tokensIn: inT, tokensOut: outT, costUsd: cost,
      ...(correct ? {} : { actualPrefix: norm(extracted).slice(0, 120), expectedPrefix: norm(c.expected).slice(0, 120) }),
    });
  }
  const total = cases.length;
  const totalCost = out.reduce((s, r) => s + (r.costUsd || 0), 0);
  // Latency, token and cost totals only include completed requests, so the
  // averages divide by that count; failed requests would otherwise drag every
  // average toward zero. Win rate still counts a failed request as a miss.
  const perCompleted = (sum) => (completed ? sum / completed : 0);
  return {
    llmSummary: {
      model, baseUrl, total, passed,
      errorCount: total - completed,
      winRate: total ? passed / total : 0,
      avgLatencyMs: perCompleted(totLatencyMs),
      totalTokensIn: totIn,
      totalTokensOut: totOut,
      avgTokensIn: perCompleted(totIn),
      avgTokensOut: perCompleted(totOut),
      totalCostUsd: totalCost,
      avgCostUsdPerEdit: perCompleted(totalCost),
      pricing: { input_per_1M: priceIn, output_per_1M: priceOut },
    },
    llmResults: out,
  };
 }

 // ─── Anthropic baseline ──────────────────────────────────────────────────────

 // USD price per 1M tokens for each known Anthropic model id, split into
 // input and output rates. The BENCH_ANTHROPIC_PRICING env var (JSON) can
 // override these at run time.
 const ANTHROPIC_PRICING = {
  'claude-sonnet-4-6': {
    input: 3,
    output: 15,
  },
  'claude-opus-4-7': {
    input: 15,
    output: 75,
  },
  'claude-haiku-4-5': {
    input: 1,
    output: 5,
  },
  'claude-haiku-4-5-20251001': {
    input: 1,
    output: 5,
  },
 };

 /**
  * Runs every corpus case through the Anthropic Messages API, once per
  * configured model, and scores each reply against the golden `expected` text.
  *
  * Configuration comes from BENCH_ANTHROPIC_MODELS (comma-separated),
  * BENCH_ANTHROPIC_PRICING (JSON object keyed by model id) and
  * BENCH_ANTHROPIC_API_KEY; without the latter the key is read via
  * `gcloud secrets versions access` for ANTHROPIC_API_KEY.
  *
  * @param {Array<object>} cases corpus cases; `id`, `intent`, `language`,
  *   `code`, `edit` and `expected` are read from each.
  * @returns {Promise<{anthropicSummaries: object, anthropicResults: object}>}
  *   Both objects are keyed by model id. With no API key the summaries object
  *   is `{ _error: 'no-api-key' }` and the results object is empty.
  */
 async function runAnthropicBaseline(cases) {
  let apiKey = process.env.BENCH_ANTHROPIC_API_KEY;
  if (!apiKey) {
    try {
      const { execSync } = await import('node:child_process');
      apiKey = execSync('gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY 2>/dev/null', { encoding: 'utf-8' }).trim();
    } catch { /* best effort: fall through to the no-api-key result */ }
  }
  if (!apiKey) {
    return { anthropicSummaries: { _error: 'no-api-key' }, anthropicResults: {} };
  }

  let pricingOverride = {};
  const rawPricing = process.env.BENCH_ANTHROPIC_PRICING;
  if (rawPricing) {
    try {
      const parsed = JSON.parse(rawPricing);
      // Valid JSON that is not a plain object (null, array, number, ...)
      // cannot be indexed by model id; `null` would even throw on lookup.
      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) {
        pricingOverride = parsed;
      } else {
        console.error('BENCH_ANTHROPIC_PRICING must be a JSON object keyed by model id; using built-in pricing.');
      }
    } catch (e) {
      // Say so on stderr rather than silently billing at default rates.
      console.error(`BENCH_ANTHROPIC_PRICING is not valid JSON (${e.message}); using built-in pricing.`);
    }
  }
  const models = (process.env.BENCH_ANTHROPIC_MODELS || 'claude-sonnet-4-6,claude-opus-4-7')
    .split(',').map((s) => s.trim()).filter(Boolean);

  const sys = `You apply code edits deterministically. Return ONLY the resulting code as a single fenced \`\`\`<lang> code block. No explanation, no commentary, no extra blocks.`;
  const user = (c) => `Apply this edit. Return only the resulting code.\n\nLanguage: ${c.language}\n\nOriginal code:\n\`\`\`${c.language}\n${c.code}\n\`\`\`\n\nEdit instruction (target snippet):\n\`\`\`${c.language}\n${c.edit}\n\`\`\``;
  // First fenced block in the reply, with an optional language tag stripped.
  const fenceRe = /```(?:[a-zA-Z]+\n)?([\s\S]*?)```/;

  const summaries = {};
  const allResults = {};
  for (const model of models) {
    // Models with no override and no built-in entry are priced at 3 / 15.
    const pricing = pricingOverride[model] || ANTHROPIC_PRICING[model] || { input: 3, output: 15 };
    const out = [];
    let totIn = 0, totOut = 0, totLatencyMs = 0, passed = 0;
    let completed = 0; // requests that returned a usable response
    for (const c of cases) {
      const t0 = Date.now();
      let body = null;
      try {
        const resp = await fetch('https://api.anthropic.com/v1/messages', {
          method: 'POST',
          headers: {
            'x-api-key': apiKey,
            'anthropic-version': '2023-06-01',
            'Content-Type': 'application/json',
          },
          body: JSON.stringify({
            model,
            max_tokens: 1024,
            system: sys,
            messages: [{ role: 'user', content: user(c) }],
          }),
        });
        if (!resp.ok) {
          // Auth, rate-limit and server errors are request failures, not
          // wrong answers: record them as errors instead of scoring an
          // empty reply.
          const detail = await resp.text().catch(() => '');
          throw new Error(`HTTP ${resp.status} ${resp.statusText}: ${detail}`);
        }
        body = await resp.json();
      } catch (e) {
        out.push({ id: c.id, intent: c.intent, error: String(e?.message ?? e).slice(0, 200), correct: false, wallMs: Date.now() - t0 });
        continue;
      }
      const wallMs = Date.now() - t0;
      completed++;
      const text = body?.content?.[0]?.text ?? '';
      const m = fenceRe.exec(text);
      const extracted = (m ? m[1] : text).trim();
      const correct = norm(extracted) === norm(c.expected);
      if (correct) passed++;
      const inT = body?.usage?.input_tokens ?? 0;
      const outT = body?.usage?.output_tokens ?? 0;
      totIn += inT; totOut += outT; totLatencyMs += wallMs;
      const cost = inT / 1e6 * pricing.input + outT / 1e6 * pricing.output;
      out.push({
        id: c.id, intent: c.intent, correct, wallMs,
        tokensIn: inT, tokensOut: outT, costUsd: cost,
        ...(correct ? {} : { actualPrefix: norm(extracted).slice(0, 120), expectedPrefix: norm(c.expected).slice(0, 120) }),
      });
    }
    const total = cases.length;
    const totalCost = out.reduce((s, r) => s + (r.costUsd || 0), 0);
    // Latency, token and cost totals only include completed requests, so the
    // averages divide by that count; failed requests would otherwise drag
    // every average toward zero. Win rate still counts a failure as a miss.
    const perCompleted = (sum) => (completed ? sum / completed : 0);
    summaries[model] = {
      model, total, passed,
      errorCount: total - completed,
      winRate: total ? passed / total : 0,
      avgLatencyMs: perCompleted(totLatencyMs),
      totalTokensIn: totIn,
      totalTokensOut: totOut,
      avgTokensIn: perCompleted(totIn),
      avgTokensOut: perCompleted(totOut),
      totalCostUsd: totalCost,
      avgCostUsdPerEdit: perCompleted(totalCost),
      pricing,
    };
    allResults[model] = out;
  }

  return { anthropicSummaries: summaries, anthropicResults: allResults };
 }

 // Start the benchmark. If main() rejects, log the failure to stderr and
 // exit with status 1.
 main().catch(function reportFatal(failure) {
  console.error('bench failed:', failure);
  process.exit(1);
 });
diff --git a/booster-corpus.json b/booster-corpus.json
 {
  "version": 3,
  "description": "Golden corpus for cost-tracker's Agent Booster verification. Two case classes — `expectedTier1: true` cases SHOULD apply via booster; `expectedTier1: false` cases SHOULD escalate (low confidence or output != expected) so the router/skill can route them to Tier 2/3.",
  "normalize": "trim, collapse whitespace runs to single space",
  "metrics": {
    "winRateTier1": "correct / count(expectedTier1==true)",
    "escalationRate": "(low-confidence OR incorrect) / count(expectedTier1==false)  -- a high escalation rate is the desired signal on adversarial cases",
    "overallCorrect": "correct / total  -- diagnostic; not the gate"
  },
  "cases": [
    {
      "id": "var-to-const-1",
      "intent": "var-to-const",
      "language": "javascript",
      "expectedTier1": true,
      "code": "var x = 1; var y = 2;",
      "edit": "const x = 1; const y = 2;",
      "expected": "const x = 1; const y = 2;"
    },
    {
      "id": "var-to-const-2",
      "intent": "var-to-const",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function go() { var n = compute(); return n + 1; }",
      "edit": "function go() { const n = compute(); return n + 1; }",
      "expected": "function go() { const n = compute(); return n + 1; }"
    },
    {
      "id": "add-types-1",
      "intent": "add-types",
      "language": "typescript",
      "expectedTier1": true,
      "code": "function add(a, b) { return a + b; }",
      "edit": "function add(a: number, b: number): number { return a + b; }",
      "expected": "function add(a: number, b: number): number { return a + b; }"
    },
    {
      "id": "add-types-2",
      "intent": "add-types",
      "language": "typescript",
      "expectedTier1": true,
      "code": "function name(p) { return p.first + ' ' + p.last; }",
      "edit": "function name(p: { first: string; last: string }): string { return p.first + ' ' + p.last; }",
      "expected": "function name(p: { first: string; last: string }): string { return p.first + ' ' + p.last; }"
    },
    {
      "id": "remove-console-1",
      "intent": "remove-console",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function go() { console.log(\"x\"); doWork(); }",
      "edit": "function go() { doWork(); }",
      "expected": "function go() { doWork(); }"
    },
    {
      "id": "remove-console-2",
      "intent": "remove-console",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function init() { console.log('start'); console.error('e'); run(); }",
      "edit": "function init() { run(); }",
      "expected": "function init() { run(); }"
    },
    {
      "id": "add-error-handling-1",
      "intent": "add-error-handling",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function fetch() { return api.get(); }",
      "edit": "function fetch() { try { return api.get(); } catch (e) { return null; } }",
      "expected": "function fetch() { try { return api.get(); } catch (e) { return null; } }"
    },
    {
      "id": "add-error-handling-2",
      "intent": "add-error-handling",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function load(p) { return parse(read(p)); }",
      "edit": "function load(p) { try { return parse(read(p)); } catch (e) { return null; } }",
      "expected": "function load(p) { try { return parse(read(p)); } catch (e) { return null; } }"
    },
    {
      "id": "async-await-1",
      "intent": "async-await",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function fetch() { return api.get().then(r => r.data); }",
      "edit": "async function fetch() { const r = await api.get(); return r.data; }",
      "expected": "async function fetch() { const r = await api.get(); return r.data; }"
    },
    {
      "id": "async-await-2",
      "intent": "async-await",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function load() { return read().then(parse); }",
      "edit": "async function load() { const r = await read(); return parse(r); }",
      "expected": "async function load() { const r = await read(); return parse(r); }"
    },
    {
      "id": "add-logging-1",
      "intent": "add-logging",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function go() { return work(); }",
      "edit": "function go() { console.log('go'); return work(); }",
      "expected": "function go() { console.log('go'); return work(); }"
    },
    {
      "id": "add-logging-2",
      "intent": "add-logging",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function save(x) { db.put(x); }",
      "edit": "function save(x) { console.log('save', x); db.put(x); }",
      "expected": "function save(x) { console.log('save', x); db.put(x); }"
    },
    {
      "id": "adversarial-extract-function",
      "intent": "extract-function",
      "language": "javascript",
      "expectedTier1": false,
      "comment": "Multi-statement extraction — booster's pattern-replace can't reason about control-flow boundaries. Booster should escalate to an LLM.",
      "code": "function process(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } if (total > 100) { console.log('big'); } else { console.log('small'); } return total; }",
      "edit": "Extract the per-item summation into a helper called `sum_active(items)`, and the threshold-log into `log_threshold(total)`. Update process() to call them.",
      "expected": "function sum_active(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } return total; } function log_threshold(total) { if (total > 100) { console.log('big'); } else { console.log('small'); } } function process(items) { const total = sum_active(items); log_threshold(total); return total; }"
    },
    {
      "id": "adversarial-type-narrowing",
      "intent": "type-narrowing",
      "language": "typescript",
      "expectedTier1": false,
      "comment": "Requires understanding union types + control flow. Booster pattern-replace will likely fail or produce a low-confidence merge.",
      "code": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { return r.value; }",
      "edit": "Narrow `r` before accessing `value` — only return `r.value` when `r.kind === 'ok'`, otherwise return -1.",
      "expected": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { if (r.kind === 'ok') { return r.value; } return -1; }"
    },
    {
      "id": "adversarial-cross-method-rename",
      "intent": "rename-symbol",
      "language": "typescript",
      "expectedTier1": false,
      "comment": "Renaming requires reasoning about ALL call sites; booster sees only the snippet boundary.",
      "code": "class Cache { get(k: string) { return this.store[k]; } set(k: string, v: any) { this.store[k] = v; } private store: Record<string, any> = {}; } const c = new Cache(); c.set('a', 1); console.log(c.get('a'));",
      "edit": "Rename Cache.get to Cache.lookup and update all call sites.",
      "expected": "class Cache { lookup(k: string) { return this.store[k]; } set(k: string, v: any) { this.store[k] = v; } private store: Record<string, any> = {}; } const c = new Cache(); c.set('a', 1); console.log(c.lookup('a'));"
    },
    {
      "id": "adversarial-recursive-rewrite",
      "intent": "recursive-to-iterative",
      "language": "javascript",
      "expectedTier1": false,
      "comment": "Algorithmic transformation — requires reasoning, not pattern replacement.",
      "code": "function factorial(n) { return n <= 1 ? 1 : n * factorial(n - 1); }",
      "edit": "Rewrite as an iterative loop with no recursion.",
      "expected": "function factorial(n) { let result = 1; for (let i = 2; i <= n; i++) { result *= i; } return result; }"
    },
    {
      "id": "var-to-let-1",
      "intent": "var-to-let",
      "language": "javascript",
      "expectedTier1": true,
      "code": "var i = 0; for (i = 0; i < 10; i++) { sink(i); }",
      "edit": "let i = 0; for (i = 0; i < 10; i++) { sink(i); }",
      "expected": "let i = 0; for (i = 0; i < 10; i++) { sink(i); }"
    },
    {
      "id": "double-quote-to-single",
      "intent": "string-quote-style",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function greet(name) { return \"hello \" + name; }",
      "edit": "function greet(name) { return 'hello ' + name; }",
      "expected": "function greet(name) { return 'hello ' + name; }"
    },
    {
      "id": "add-readonly-prop",
      "intent": "add-readonly",
      "language": "typescript",
      "expectedTier1": true,
      "code": "class Box { value: number = 0; }",
      "edit": "class Box { readonly value: number = 0; }",
      "expected": "class Box { readonly value: number = 0; }"
    },
    {
      "id": "remove-debugger",
      "intent": "remove-debug",
      "language": "javascript",
      "expectedTier1": true,
      "code": "function go() { debugger; doWork(); }",
      "edit": "function go() { doWork(); }",
      "expected": "function go() { doWork(); }"
    },
    {
      "id": "import-add-named",
      "intent": "import-add",
      "language": "typescript",
      "expectedTier1": true,
      "code": "import { foo } from './x'; foo();",
      "edit": "import { foo, bar } from './x'; foo();",
      "expected": "import { foo, bar } from './x'; foo();"
    },
    {
      "id": "remove-trailing-semis",
      "intent": "format-tweak",
      "language": "javascript",
      "expectedTier1": true,
      "code": "const a = 1;; const b = 2;;",
      "edit": "const a = 1; const b = 2;",
      "expected": "const a = 1; const b = 2;"
    },
    {
      "id": "adversarial-extract-class",
      "intent": "extract-class",
      "language": "typescript",
      "expectedTier1": false,
      "comment": "Splitting one class into two with delegation requires reasoning about responsibility boundaries.",
      "code": "class UserService { saveUser(u: any) { db.put(u); audit.log('save', u.id); } private db = realDb; private audit = { log: (k: string, v: any) => console.log(k, v) }; }",
      "edit": "Extract the `audit` concern into its own AuditService class and have UserService delegate to it.",
      "expected": "class AuditService { log(k: string, v: any) { console.log(k, v); } } class UserService { saveUser(u: any) { db.put(u); this.audit.log('save', u.id); } private db = realDb; private audit = new AuditService(); }"
    },
    {
      "id": "adversarial-callback-to-promise",
      "intent": "callback-to-promise",
      "language": "javascript",
      "expectedTier1": false,
      "comment": "Restructuring control flow from callbacks to async/await requires understanding execution semantics, not pattern matching.",
      "code": "function loadConfig(cb) { fs.readFile('./c.json', (err, data) => { if (err) cb(err); else cb(null, JSON.parse(data)); }); }",
      "edit": "Rewrite as an async function that returns a Promise<Config> using fs.promises and try/catch.",
      "expected": "async function loadConfig() { try { const data = await fs.promises.readFile('./c.json'); return JSON.parse(data); } catch (err) { throw err; } }"
    },
    {
      "id": "adversarial-deeply-nested-conditional",
      "intent": "early-return-flatten",
      "language": "javascript",
      "expectedTier1": false,
      "comment": "Flattening nested conditionals into guard clauses requires reasoning about boolean logic + control flow.",
      "code": "function process(x) { if (x) { if (x.valid) { if (x.size > 0) { return doWork(x); } else { return null; } } else { return null; } } else { return null; } }",
      "edit": "Flatten using early returns / guard clauses — return null at each failed precondition.",
      "expected": "function process(x) { if (!x) return null; if (!x.valid) return null; if (x.size <= 0) return null; return doWork(x); }"
    }
  ]
 }
Endpoint	Tier 1 win	Adversarial	Avg latency	Cost / edit	Speedup vs Booster
Agent Booster (WASM, local)	18 / 18 ✓	escalates 7/7 ✓	0.36 ms	$0	—
Gemini 2.0 Flash	18 / 18 ✓	3 / 7	807.56 ms	$0.000028	2243.2×
Claude Sonnet 4.6	18 / 18 ✓	2 / 7	1270.64 ms	$0.000933	3529.6×
Claude Opus 4.7	18 / 18 ✓	5 / 7	1563.72 ms	$0.005943	4343.7×
Replaced by Booster	Wall-time saved	Cost saved
Gemini 2.0 Flash floor	~22.4 hours	$2.80
Claude Sonnet 4.6	~35.3 hours	$93.30
Claude Opus 4.7	~43.4 hours	$594.30
Version	Capability shipped
0.4.0	corpus v1 (12 cases) + booster integration baseline
0.5.0	cost-track — auto-capture per-session token usage to cost-tracking namespace
0.6.0	cost-budget-check — real budget enforcement with 50/75/90/100% alert ladder
0.7.0	outcome.mjs — auto-emit hooks_model-outcome from cost-optimize
0.8.0	compact.mjs — drops inline Node block from cost-compact-context
0.9.0	cost-trend — drift detection across all bench runs
0.10.0	corpus v2 → v3 (16 → 25 cases incl. adversarial split)
0.11.0	GitHub Actions — smoke + booster-only bench on PR
0.12.0	cost-conversation — per-conversation lens
0.13.0	cost-export — Prometheus textfile + webhook
0.14.0	cost-federation — ADR-097 Phase 3 consumer (ready when upstream emits)
0.15.0	cost-summary — programmatic JSON contract
	#!/usr/bin/env node
	// Verified-corpus benchmark for ruflo-cost-tracker.
	// Runs every case in bench/booster-corpus.json through agent-booster.apply()
	// and records: correctness vs. golden expected, latency, confidence, strategy.
	// Output: JSON to docs/benchmarks/runs/<ISO>.json (also copied to docs/benchmarks/runs/latest.json) + markdown summary to stdout.
	//
	// Resolution: must be run from a directory where `agent-booster` resolves
	// (typically anywhere under `v3/`). Run via:
	//
	// ( cd v3 && node ../plugins/ruflo-cost-tracker/scripts/bench.mjs )
	//
	// Optional env:
	// BENCH_LLM_BASELINE=1 -- run the same corpus through Gemini 2.0 Flash (or another OpenAI-compat model)
	// BENCH_LLM_MODEL=... -- override (default: models/gemini-2.0-flash)
	// BENCH_LLM_BASE_URL=... -- override (default: deployed ruvocal Gemini OpenAI shim)
	// BENCH_LLM_API_KEY=... -- override (default: from gcloud secret GOOGLE_AI_API_KEY)
	// BENCH_LLM_PRICE_IN/OUT -- $/1M tokens override (default Gemini 2.0 Flash: 0.10 / 0.40)
	//
	// BENCH_ANTHROPIC=1 -- run the same corpus through Anthropic claude models
	// BENCH_ANTHROPIC_MODELS=claude-sonnet-4-6,claude-opus-4-7 -- comma-separated list
	// BENCH_ANTHROPIC_API_KEY=... -- override (default: from gcloud secret ANTHROPIC_API_KEY)
	//
	// BENCH_OUT=<path> -- override output JSON path
	// BENCH_QUIET=1 -- suppress markdown summary
	//
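	// Pricing built-in (per 1M tokens, USD): Sonnet 4.6 = 3/15, Opus 4.7 = 15/75, Haiku 4.5 = 1/5.
	// BENCH_ANTHROPIC_PRICING=<json> -- per-model override, e.g. {"claude-sonnet-4-6":{"input":3,"output":15}}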

	import { readFileSync, writeFileSync, mkdirSync } from 'node:fs';
	import { fileURLToPath, pathToFileURL } from 'node:url';
	import { createRequire } from 'node:module';
	import { dirname, join, resolve } from 'node:path';

	const HERE = dirname(fileURLToPath(import.meta.url));
	const PLUGIN_ROOT = resolve(HERE, '..');
	const CORPUS = join(PLUGIN_ROOT, 'bench', 'booster-corpus.json');
	const RUNS_DIR = join(PLUGIN_ROOT, 'docs', 'benchmarks', 'runs');

	// Canonical form used for golden comparisons: stringify (null/undefined become ''),
	// trim both ends, and collapse every whitespace run to a single space.
	const norm = (s) => {
	  const text = String(s ?? '');
	  return text.trim().replace(/\s+/g, ' ');
	};

	// Formats a 0..1 ratio as a percentage with one decimal place, e.g. 0.5 -> "50.0%".
	const pct = (n) => (n * 100).toFixed(1) + '%';

	// Formats a millisecond value with two decimals and an "ms" suffix.
	const ms = (n) => n.toFixed(2) + 'ms';

	// Returns the p-th percentile of `arr`: the element at index floor(p/100 * length)
	// of an ascending copy, clamped to the last element. An empty array yields 0.
	// The input array is not mutated.
	const pcent = (arr, p) => {
	  const count = arr.length;
	  if (count === 0) return 0;
	  const ascending = Array.from(arr);
	  ascending.sort((left, right) => left - right);
	  const rank = Math.floor((p / 100) * count);
	  return ascending[Math.min(count - 1, rank)];
	};

	// Runs the whole benchmark:
	//   1. loads the corpus and resolves `agent-booster` relative to process.cwd();
	//   2. applies every case through booster.apply() and scores it against the golden output;
	//   3. optionally runs the LLM / Anthropic baselines (BENCH_LLM_BASELINE=1, BENCH_ANTHROPIC=1);
	//   4. writes the JSON payload to the run file and to latest.json;
	//   5. prints a markdown summary unless BENCH_QUIET=1.
	// Exits with code 2 when agent-booster cannot be imported.
	//
	// A baseline that could not run returns an error marker instead of real numbers
	// ({ error } for the LLM baseline, { _error } for Anthropic). Those markers are kept
	// in the JSON for diagnosis but are never used for speedup/cost math or number formatting.
	async function main() {
	  const corpus = JSON.parse(readFileSync(CORPUS, 'utf-8'));
	  // Resolve agent-booster from process.cwd() (so users can `cd v3 && node ../...`)
	  // rather than from the script's directory (which is outside v3/node_modules).
	  let AgentBoosterMod;
	  try {
	    const requireFromCwd = createRequire(join(process.cwd(), 'package.json'));
	    const resolvedPath = requireFromCwd.resolve('agent-booster');
	    AgentBoosterMod = await import(pathToFileURL(resolvedPath).href);
	  } catch (err) {
	    console.error(`agent-booster import failed: ${err.message}`);
	    console.error(`cwd was: ${process.cwd()}`);
	    console.error('Run from a directory where the package resolves, e.g.:');
	    console.error(' ( cd v3 && node ../plugins/ruflo-cost-tracker/scripts/bench.mjs )');
	    process.exit(2);
	  }
	  const { AgentBooster } = AgentBoosterMod;
	  const booster = new AgentBooster();

	  const results = [];
	  for (const c of corpus.cases) {
	    const expectedTier1 = c.expectedTier1 !== false; // default true (corpus v1 compat)
	    const t0 = Date.now();
	    let out;
	    try {
	      out = await booster.apply({ code: c.code, edit: c.edit, language: c.language });
	    } catch (e) {
	      results.push({ id: c.id, intent: c.intent, expectedTier1, error: String(e.message).slice(0, 200), correct: false });
	      continue;
	    }
	    const wallMs = Date.now() - t0;
	    const correct = norm(out.output) === norm(c.expected);
	    const lowConfidence = (out.confidence ?? 0) < 0.5;
	    // For Tier 1 cases: "good" = correct.
	    // For non-Tier 1 cases: "good" = booster correctly escalates (incorrect output OR low confidence).
	    const escalatedCorrectly = !expectedTier1 && (!correct || lowConfidence);
	    results.push({
	      id: c.id,
	      intent: c.intent,
	      expectedTier1,
	      success: !!out.success,
	      correct,
	      escalatedCorrectly,
	      lowConfidence,
	      latencyMs: out.latency ?? null,
	      wallMs,
	      confidence: out.confidence ?? null,
	      strategy: out.strategy ?? null,
	      tokensIn: out.tokens?.input ?? null,
	      tokensOut: out.tokens?.output ?? null,
	      ...(correct ? {} : { actualPrefix: norm(out.output).slice(0, 120), expectedPrefix: norm(c.expected).slice(0, 120) }),
	    });
	  }

	  const tier1Cases = results.filter((r) => r.expectedTier1);
	  const advCases = results.filter((r) => !r.expectedTier1);
	  const tier1Passed = tier1Cases.filter((r) => r.correct).length;
	  const tier1Total = tier1Cases.length;
	  const advEscalated = advCases.filter((r) => r.escalatedCorrectly).length;
	  const advTotal = advCases.length;

	  const passed = results.filter((r) => r.correct).length;
	  const total = results.length;
	  const winRate = tier1Total ? tier1Passed / tier1Total : (total ? passed / total : 0);
	  const escalationRate = advTotal ? advEscalated / advTotal : null;
	  const successCount = results.filter((r) => r.success).length;

	  // Errored cases carry no latency/wall/confidence fields, so they drop out of these series.
	  const latencies = results.map((r) => r.latencyMs).filter((x) => typeof x === 'number');
	  const wallTimes = results.map((r) => r.wallMs).filter((x) => typeof x === 'number');
	  const confidences = results.map((r) => r.confidence).filter((x) => typeof x === 'number');

	  const avg = (arr) => (arr.length ? arr.reduce((a, b) => a + b, 0) / arr.length : 0);

	  // Optional: run the same corpus through one or more LLMs and record the comparison.
	  let llmSummary = null;
	  let llmResults = null;
	  if (process.env.BENCH_LLM_BASELINE === '1') {
	    ({ llmSummary, llmResults } = await runLlmBaseline(corpus.cases));
	  }

	  let anthropicSummaries = null; // Map<model, summary>
	  let anthropicResults = null; // Map<model, per-case-results>
	  if (process.env.BENCH_ANTHROPIC === '1') {
	    ({ anthropicSummaries, anthropicResults } = await runAnthropicBaseline(corpus.cases));
	  }

	  // Only a summary without an error marker carries the numeric fields used below.
	  const llmOk = !!llmSummary && !llmSummary.error;
	  const anthropicError = anthropicSummaries?._error ?? null;

	  const summary = {
	    runAt: new Date().toISOString(),
	    corpusVersion: corpus.version,
	    corpusSize: total,
	    tier1Cases: tier1Total,
	    adversarialCases: advTotal,
	    winRate, // win rate over Tier 1 cases (the gate metric)
	    winRatePct: pct(winRate),
	    overallCorrect: total ? passed / total : 0, // diagnostic only
	    escalationRate, // null if no adversarial cases; otherwise (correctly-escalated / adversarial)
	    escalationRatePct: escalationRate == null ? 'n/a' : pct(escalationRate),
	    successCount,
	    avgLatencyMs: avg(latencies),
	    p50LatencyMs: pcent(latencies, 50),
	    p99LatencyMs: pcent(latencies, 99),
	    maxLatencyMs: latencies.length ? Math.max(...latencies) : 0,
	    avgWallMs: avg(wallTimes),
	    avgConfidence: avg(confidences),
	    minConfidence: confidences.length ? Math.min(...confidences) : 0,
	    confidenceThreshold: 0.5,
	    aboveThresholdCount: confidences.filter((c) => c >= 0.5).length,
	    structuralCostUsd: 0, // no LLM call → no billing
	    llmBaseline: llmSummary || (process.env.BENCH_LLM_BASELINE === '1' ? 'enabled-but-failed' : 'skipped'),
	  };
	  if (llmOk) {
	    // Direct apples-to-apples speedup ratio (booster vs LLM); the 0.001 floor avoids dividing by zero.
	    summary.speedupVsLlm = llmSummary.avgLatencyMs / Math.max(summary.avgLatencyMs, 0.001);
	    summary.costDeltaUsdPerEdit = llmSummary.avgCostUsdPerEdit; // booster side is $0
	  }
	  if (anthropicSummaries) {
	    summary.anthropic = {};
	    if (anthropicError) {
	      // Keep the failure reason as-is instead of spreading the string into a fake model entry.
	      summary.anthropic._error = anthropicError;
	    } else {
	      for (const [model, s] of Object.entries(anthropicSummaries)) {
	        summary.anthropic[model] = {
	          ...s,
	          speedupVsBooster: s.avgLatencyMs / Math.max(summary.avgLatencyMs, 0.001),
	          costSavedPerEditUsd: s.avgCostUsdPerEdit, // booster side is $0
	        };
	      }
	    }
	  }

	  const outDir = RUNS_DIR;
	  mkdirSync(outDir, { recursive: true });
	  const stamp = summary.runAt.replace(/[:.]/g, '-');
	  const outPath = process.env.BENCH_OUT || join(outDir, `${stamp}.json`);
	  // BENCH_OUT may point outside RUNS_DIR; make sure its parent directory exists as well.
	  mkdirSync(dirname(outPath), { recursive: true });
	  const latestPath = join(outDir, 'latest.json');
	  const payload = {
	    summary,
	    results,
	    ...(llmResults ? { llmResults } : {}),
	    ...(anthropicResults ? { anthropicResults } : {}),
	  };
	  writeFileSync(outPath, JSON.stringify(payload, null, 2));
	  writeFileSync(latestPath, JSON.stringify(payload, null, 2));

	  if (process.env.BENCH_QUIET !== '1') {
	    console.log(`# Booster benchmark — ${summary.runAt}`);
	    console.log('');
	    console.log(`| Metric | Value |`);
	    console.log(`|---|---:|`);
	    console.log(`| Corpus size | ${total} (Tier 1: ${tier1Total}, adversarial: ${advTotal}) |`);
	    console.log(`| Win rate (Tier 1 cases) | ${pct(winRate)} (${tier1Passed}/${tier1Total}) |`);
	    if (advTotal) {
	      console.log(`| Escalation rate (adversarial) | ${pct(escalationRate)} (${advEscalated}/${advTotal}) |`);
	    }
	    console.log(`| Overall correct (diagnostic) | ${pct(total ? passed/total : 0)} (${passed}/${total}) |`);
	    console.log(`| Success flag | ${successCount}/${total} |`);
	    console.log(`| Avg latency | ${ms(summary.avgLatencyMs)} |`);
	    console.log(`| p50 latency | ${ms(summary.p50LatencyMs)} |`);
	    console.log(`| p99 latency | ${ms(summary.p99LatencyMs)} |`);
	    console.log(`| Max latency | ${ms(summary.maxLatencyMs)} |`);
	    console.log(`| Avg confidence | ${summary.avgConfidence.toFixed(3)} |`);
	    console.log(`| Min confidence | ${summary.minConfidence.toFixed(3)} |`);
	    console.log(`| Above 0.5 threshold | ${summary.aboveThresholdCount}/${confidences.length} |`);
	    console.log(`| Structural cost | $${summary.structuralCostUsd} (no LLM call) |`);
	    if (llmOk) {
	      console.log(`| LLM baseline model | ${llmSummary.model} |`);
	      console.log(`| LLM avg latency | ${ms(llmSummary.avgLatencyMs)} |`);
	      console.log(`| LLM win rate | ${pct(llmSummary.winRate)} (${llmSummary.passed}/${llmSummary.total}) |`);
	      console.log(`| LLM avg cost/edit | $${llmSummary.avgCostUsdPerEdit.toFixed(6)} |`);
	      console.log(`| Measured speedup (booster vs LLM) | ${summary.speedupVsLlm.toFixed(1)}× |`);
	      console.log(`| Cost saved per edit | $${summary.costDeltaUsdPerEdit.toFixed(6)} (100%) |`);
	    } else if (llmSummary?.error) {
	      // Baseline was requested but could not run; report why instead of formatting missing numbers.
	      console.log(`| LLM baseline | failed (${llmSummary.error}) |`);
	    } else {
	      console.log(`| LLM baseline | ${summary.llmBaseline} |`);
	    }
	    console.log(``);
	    if (anthropicError) {
	      console.log(`## Anthropic baseline\n`);
	      console.log(`Anthropic baseline failed: ${anthropicError}`);
	      console.log(``);
	    } else if (anthropicSummaries) {
	      console.log(`## Anthropic baseline\n`);
	      console.log(`| Model | Avg latency | Win rate | Avg tokens (in/out) | Avg cost/edit | Speedup vs booster | Cost saved/edit |`);
	      console.log(`|---|---:|---:|---:|---:|---:|---:|`);
	      for (const [model, s] of Object.entries(anthropicSummaries)) {
	        const speedup = (s.avgLatencyMs / Math.max(summary.avgLatencyMs, 0.001)).toFixed(1);
	        console.log(`| \`${model}\` | ${ms(s.avgLatencyMs)} | ${pct(s.winRate)} (${s.passed}/${s.total}) | ${s.avgTokensIn.toFixed(0)} / ${s.avgTokensOut.toFixed(0)} | $${s.avgCostUsdPerEdit.toFixed(6)} | ${speedup}× | $${s.avgCostUsdPerEdit.toFixed(6)} |`);
	      }
	      console.log(``);
	    }
	    // Lists every case whose output differs from the golden, including adversarial
	    // cases where a mismatch is the expected escalation signal.
	    const failed = results.filter((r) => !r.correct);
	    if (failed.length) {
	      console.log(`## Failures (${failed.length})`);
	      for (const f of failed) {
	        console.log(`- \`${f.id}\` (${f.intent}): ${f.error || `actual="${f.actualPrefix}" vs expected="${f.expectedPrefix}"`}`);
	      }
	    }
	    console.log(``);
	    console.log(`Saved: ${outPath}`);
	    console.log(`Latest pointer: ${latestPath}`);
	  }
	}

	// ─── LLM baseline ────────────────────────────────────────────────────────────

	// Runs every corpus case through an OpenAI-compatible /chat/completions endpoint
	// and scores the extracted code against the case's golden `expected`.
	//
	// Configuration (env): BENCH_LLM_BASE_URL, BENCH_LLM_MODEL, BENCH_LLM_PRICE_IN / _OUT
	// ($ per 1M tokens), BENCH_LLM_API_KEY (falls back to the gcloud secret GOOGLE_AI_API_KEY).
	//
	// Returns { llmSummary, llmResults }. Without an API key, llmSummary is
	// { error: 'no-api-key', baseUrl, model } and llmResults is [].
	//
	// A request that throws or returns a non-2xx status is recorded as an errored case:
	// it counts against winRate but is excluded from the latency/token/cost averages,
	// which are taken over completed requests only. `errorCount` reports how many were excluded.
	async function runLlmBaseline(cases) {
	  const baseUrl = process.env.BENCH_LLM_BASE_URL || 'https://generativelanguage.googleapis.com/v1beta/openai/';
	  const model = process.env.BENCH_LLM_MODEL || 'models/gemini-2.0-flash';
	  const priceIn = parseFloat(process.env.BENCH_LLM_PRICE_IN || '0.10'); // $/1M input
	  const priceOut = parseFloat(process.env.BENCH_LLM_PRICE_OUT || '0.40'); // $/1M output
	  let apiKey = process.env.BENCH_LLM_API_KEY;
	  if (!apiKey) {
	    // Try to pull from gcloud (deployed ruvocal uses GOOGLE_AI_API_KEY)
	    try {
	      const { execSync } = await import('node:child_process');
	      apiKey = execSync('gcloud secrets versions access latest --secret=GOOGLE_AI_API_KEY 2>/dev/null', { encoding: 'utf-8' }).trim();
	    } catch { /* best effort: gcloud missing or secret unreadable, handled by the check below */ }
	  }
	  if (!apiKey) {
	    return { llmSummary: { error: 'no-api-key', baseUrl, model }, llmResults: [] };
	  }

	  const sys = `You apply code edits deterministically. Return ONLY the resulting code as a single fenced \`\`\`<lang> code block. No explanation, no commentary, no extra blocks. The output of the code block is the final source.`;
	  const user = (c) => `Apply this edit. Return only the resulting code.\n\nLanguage: ${c.language}\n\nOriginal code:\n\`\`\`${c.language}\n${c.code}\n\`\`\`\n\nEdit instruction (target snippet):\n\`\`\`${c.language}\n${c.edit}\n\`\`\``;

	  // First fenced block in the reply; an alphabetic language tag on the opening fence is skipped.
	  const fenceRe = /```(?:[a-zA-Z]+\n)?([\s\S]*?)```/;
	  const endpoint = `${baseUrl.replace(/\/$/, '')}/chat/completions`;
	  const out = [];
	  let totIn = 0, totOut = 0, totLatencyMs = 0, passed = 0, completed = 0;
	  for (const c of cases) {
	    const t0 = Date.now();
	    let body = null;
	    try {
	      const resp = await fetch(endpoint, {
	        method: 'POST',
	        headers: { 'Authorization': `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
	        body: JSON.stringify({
	          model,
	          messages: [{ role: 'system', content: sys }, { role: 'user', content: user(c) }],
	          max_tokens: 1024,
	          temperature: 0,
	        }),
	      });
	      if (!resp.ok) {
	        // Surface auth/quota/server failures instead of scoring them as a wrong answer.
	        const detail = (await resp.text()).slice(0, 150);
	        throw new Error(`HTTP ${resp.status}: ${detail}`);
	      }
	      body = await resp.json();
	    } catch (e) {
	      out.push({ id: c.id, intent: c.intent, error: String(e.message).slice(0, 200), correct: false, wallMs: Date.now() - t0 });
	      continue;
	    }
	    completed++;
	    const wallMs = Date.now() - t0;
	    const content = body?.choices?.[0]?.message?.content ?? '';
	    const m = fenceRe.exec(content);
	    // Fall back to the whole reply when the model did not fence its output.
	    const extracted = (m ? m[1] : content).trim();
	    const correct = norm(extracted) === norm(c.expected);
	    if (correct) passed++;
	    const inT = body?.usage?.prompt_tokens ?? 0;
	    const outT = body?.usage?.completion_tokens ?? 0;
	    totIn += inT; totOut += outT; totLatencyMs += wallMs;
	    const cost = inT / 1e6 * priceIn + outT / 1e6 * priceOut;
	    out.push({
	      id: c.id, intent: c.intent, correct, wallMs,
	      tokensIn: inT, tokensOut: outT, costUsd: cost,
	      ...(correct ? {} : { actualPrefix: norm(extracted).slice(0, 120), expectedPrefix: norm(c.expected).slice(0, 120) }),
	    });
	  }
	  const total = cases.length;
	  const totalCost = out.reduce((s, r) => s + (r.costUsd || 0), 0);
	  return {
	    llmSummary: {
	      model, baseUrl, total, passed,
	      winRate: total ? passed / total : 0,
	      // Averages use completed requests so failed calls do not drag them toward zero.
	      avgLatencyMs: completed ? totLatencyMs / completed : 0,
	      totalTokensIn: totIn,
	      totalTokensOut: totOut,
	      avgTokensIn: completed ? totIn / completed : 0,
	      avgTokensOut: completed ? totOut / completed : 0,
	      totalCostUsd: totalCost,
	      avgCostUsdPerEdit: completed ? totalCost / completed : 0,
	      errorCount: total - completed,
	      pricing: { input_per_1M: priceIn, output_per_1M: priceOut },
	    },
	    llmResults: out,
	  };
	}

	// ─── Anthropic baseline ──────────────────────────────────────────────────────

	// Default Anthropic rates in USD per 1M tokens, keyed by model id.
	// Each row is [model, input rate, output rate]; every model gets its own
	// { input, output } object. Override with the BENCH_ANTHROPIC_PRICING JSON env.
	const ANTHROPIC_PRICING = Object.fromEntries(
	  [
	    ['claude-sonnet-4-6', 3.00, 15.00],
	    ['claude-opus-4-7', 15.00, 75.00],
	    ['claude-haiku-4-5', 1.00, 5.00],
	    ['claude-haiku-4-5-20251001', 1.00, 5.00],
	  ].map(([modelId, input, output]) => [modelId, { input, output }]),
	);

	// Runs every corpus case through the Anthropic Messages API, once per model in
	// BENCH_ANTHROPIC_MODELS (default: claude-sonnet-4-6,claude-opus-4-7), and scores
	// the extracted code against each case's golden `expected`.
	//
	// The API key comes from BENCH_ANTHROPIC_API_KEY, falling back to the gcloud secret
	// ANTHROPIC_API_KEY. Pricing per model is looked up in BENCH_ANTHROPIC_PRICING (JSON),
	// then ANTHROPIC_PRICING, then a 3/15 default.
	//
	// Returns { anthropicSummaries, anthropicResults }, both keyed by model. Without an
	// API key, anthropicSummaries is { _error: 'no-api-key' } and anthropicResults is {}.
	//
	// A request that throws or returns a non-2xx status is recorded as an errored case:
	// it counts against winRate but is excluded from the latency/token/cost averages,
	// which are taken over completed requests only. `errorCount` reports how many were excluded.
	async function runAnthropicBaseline(cases) {
	  let apiKey = process.env.BENCH_ANTHROPIC_API_KEY;
	  if (!apiKey) {
	    try {
	      const { execSync } = await import('node:child_process');
	      apiKey = execSync('gcloud secrets versions access latest --secret=ANTHROPIC_API_KEY 2>/dev/null', { encoding: 'utf-8' }).trim();
	    } catch { /* best effort: gcloud missing or secret unreadable, handled by the check below */ }
	  }
	  if (!apiKey) {
	    return { anthropicSummaries: { _error: 'no-api-key' }, anthropicResults: {} };
	  }

	  let pricingOverride = {};
	  if (process.env.BENCH_ANTHROPIC_PRICING) {
	    try {
	      const parsed = JSON.parse(process.env.BENCH_ANTHROPIC_PRICING);
	      // Anything other than an object (e.g. `null`, a number) cannot hold per-model rates.
	      if (parsed && typeof parsed === 'object') pricingOverride = parsed;
	      else console.error('BENCH_ANTHROPIC_PRICING must be a JSON object; using built-in pricing');
	    } catch (e) {
	      // Keep going with built-in rates, but say so: a silent fallback would misreport cost.
	      console.error(`BENCH_ANTHROPIC_PRICING is not valid JSON (${e.message}); using built-in pricing`);
	    }
	  }
	  const models = (process.env.BENCH_ANTHROPIC_MODELS || 'claude-sonnet-4-6,claude-opus-4-7')
	    .split(',').map((s) => s.trim()).filter(Boolean);

	  const sys = `You apply code edits deterministically. Return ONLY the resulting code as a single fenced \`\`\`<lang> code block. No explanation, no commentary, no extra blocks.`;
	  const user = (c) => `Apply this edit. Return only the resulting code.\n\nLanguage: ${c.language}\n\nOriginal code:\n\`\`\`${c.language}\n${c.code}\n\`\`\`\n\nEdit instruction (target snippet):\n\`\`\`${c.language}\n${c.edit}\n\`\`\``;
	  // First fenced block in the reply; an alphabetic language tag on the opening fence is skipped.
	  const fenceRe = /```(?:[a-zA-Z]+\n)?([\s\S]*?)```/;

	  const summaries = {};
	  const allResults = {};
	  for (const model of models) {
	    const pricing = pricingOverride[model] || ANTHROPIC_PRICING[model] || { input: 3, output: 15 };
	    const out = [];
	    let totIn = 0, totOut = 0, totLatencyMs = 0, passed = 0, completed = 0;
	    for (const c of cases) {
	      const t0 = Date.now();
	      let body = null;
	      try {
	        const resp = await fetch('https://api.anthropic.com/v1/messages', {
	          method: 'POST',
	          headers: {
	            'x-api-key': apiKey,
	            'anthropic-version': '2023-06-01',
	            'Content-Type': 'application/json',
	          },
	          body: JSON.stringify({
	            model,
	            max_tokens: 1024,
	            system: sys,
	            messages: [{ role: 'user', content: user(c) }],
	          }),
	        });
	        if (!resp.ok) {
	          // Surface auth/quota/server failures instead of scoring them as a wrong answer.
	          const detail = (await resp.text()).slice(0, 150);
	          throw new Error(`HTTP ${resp.status}: ${detail}`);
	        }
	        body = await resp.json();
	      } catch (e) {
	        out.push({ id: c.id, intent: c.intent, error: String(e.message).slice(0, 200), correct: false, wallMs: Date.now() - t0 });
	        continue;
	      }
	      completed++;
	      const wallMs = Date.now() - t0;
	      const text = body?.content?.[0]?.text ?? '';
	      const m = fenceRe.exec(text);
	      // Fall back to the whole reply when the model did not fence its output.
	      const extracted = (m ? m[1] : text).trim();
	      const correct = norm(extracted) === norm(c.expected);
	      if (correct) passed++;
	      const inT = body?.usage?.input_tokens ?? 0;
	      const outT = body?.usage?.output_tokens ?? 0;
	      totIn += inT; totOut += outT; totLatencyMs += wallMs;
	      const cost = inT / 1e6 * pricing.input + outT / 1e6 * pricing.output;
	      out.push({
	        id: c.id, intent: c.intent, correct, wallMs,
	        tokensIn: inT, tokensOut: outT, costUsd: cost,
	        ...(correct ? {} : { actualPrefix: norm(extracted).slice(0, 120), expectedPrefix: norm(c.expected).slice(0, 120) }),
	      });
	    }
	    const total = cases.length;
	    const totalCost = out.reduce((s, r) => s + (r.costUsd || 0), 0);
	    summaries[model] = {
	      model, total, passed,
	      winRate: total ? passed / total : 0,
	      // Averages use completed requests so failed calls do not drag them toward zero.
	      avgLatencyMs: completed ? totLatencyMs / completed : 0,
	      totalTokensIn: totIn,
	      totalTokensOut: totOut,
	      avgTokensIn: completed ? totIn / completed : 0,
	      avgTokensOut: completed ? totOut / completed : 0,
	      totalCostUsd: totalCost,
	      avgCostUsdPerEdit: completed ? totalCost / completed : 0,
	      errorCount: total - completed,
	      pricing,
	    };
	    allResults[model] = out;
	  }

	  return { anthropicSummaries: summaries, anthropicResults: allResults };
	}

	// Script entry: start the benchmark; any rejection escaping main() is logged
	// and turned into exit code 1.
	main().then(undefined, (failure) => {
	  console.error('bench failed:', failure);
	  process.exit(1);
	});
	{
	"version": 3,
	"description": "Golden corpus for cost-tracker's Agent Booster verification. Two case classes — `expectedTier1: true` cases SHOULD apply via booster; `expectedTier1: false` cases SHOULD escalate (low confidence or output != expected) so the router/skill can route them to Tier 2/3.",
	"normalize": "trim, collapse whitespace runs to single space",
	"metrics": {
	"winRateTier1": "correct / count(expectedTier1==true)",
	"escalationRate": "(low-confidence OR incorrect) / count(expectedTier1==false) -- a high escalation rate is the desired signal on adversarial cases",
	"overallCorrect": "correct / total -- diagnostic; not the gate"
	},
	"cases": [
	{
	"id": "var-to-const-1",
	"intent": "var-to-const",
	"language": "javascript",
	"expectedTier1": true,
	"code": "var x = 1; var y = 2;",
	"edit": "const x = 1; const y = 2;",
	"expected": "const x = 1; const y = 2;"
	},
	{
	"id": "var-to-const-2",
	"intent": "var-to-const",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function go() { var n = compute(); return n + 1; }",
	"edit": "function go() { const n = compute(); return n + 1; }",
	"expected": "function go() { const n = compute(); return n + 1; }"
	},
	{
	"id": "add-types-1",
	"intent": "add-types",
	"language": "typescript",
	"expectedTier1": true,
	"code": "function add(a, b) { return a + b; }",
	"edit": "function add(a: number, b: number): number { return a + b; }",
	"expected": "function add(a: number, b: number): number { return a + b; }"
	},
	{
	"id": "add-types-2",
	"intent": "add-types",
	"language": "typescript",
	"expectedTier1": true,
	"code": "function name(p) { return p.first + ' ' + p.last; }",
	"edit": "function name(p: { first: string; last: string }): string { return p.first + ' ' + p.last; }",
	"expected": "function name(p: { first: string; last: string }): string { return p.first + ' ' + p.last; }"
	},
	{
	"id": "remove-console-1",
	"intent": "remove-console",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function go() { console.log(\"x\"); doWork(); }",
	"edit": "function go() { doWork(); }",
	"expected": "function go() { doWork(); }"
	},
	{
	"id": "remove-console-2",
	"intent": "remove-console",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function init() { console.log('start'); console.error('e'); run(); }",
	"edit": "function init() { run(); }",
	"expected": "function init() { run(); }"
	},
	{
	"id": "add-error-handling-1",
	"intent": "add-error-handling",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function fetch() { return api.get(); }",
	"edit": "function fetch() { try { return api.get(); } catch (e) { return null; } }",
	"expected": "function fetch() { try { return api.get(); } catch (e) { return null; } }"
	},
	{
	"id": "add-error-handling-2",
	"intent": "add-error-handling",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function load(p) { return parse(read(p)); }",
	"edit": "function load(p) { try { return parse(read(p)); } catch (e) { return null; } }",
	"expected": "function load(p) { try { return parse(read(p)); } catch (e) { return null; } }"
	},
	{
	"id": "async-await-1",
	"intent": "async-await",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function fetch() { return api.get().then(r => r.data); }",
	"edit": "async function fetch() { const r = await api.get(); return r.data; }",
	"expected": "async function fetch() { const r = await api.get(); return r.data; }"
	},
	{
	"id": "async-await-2",
	"intent": "async-await",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function load() { return read().then(parse); }",
	"edit": "async function load() { const r = await read(); return parse(r); }",
	"expected": "async function load() { const r = await read(); return parse(r); }"
	},
	{
	"id": "add-logging-1",
	"intent": "add-logging",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function go() { return work(); }",
	"edit": "function go() { console.log('go'); return work(); }",
	"expected": "function go() { console.log('go'); return work(); }"
	},
	{
	"id": "add-logging-2",
	"intent": "add-logging",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function save(x) { db.put(x); }",
	"edit": "function save(x) { console.log('save', x); db.put(x); }",
	"expected": "function save(x) { console.log('save', x); db.put(x); }"
	},
	{
	"id": "adversarial-extract-function",
	"intent": "extract-function",
	"language": "javascript",
	"expectedTier1": false,
	"comment": "Multi-statement extraction — booster's pattern-replace can't reason about control-flow boundaries. Booster should escalate to an LLM.",
	"code": "function process(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } if (total > 100) { console.log('big'); } else { console.log('small'); } return total; }",
	"edit": "Extract the per-item summation into a helper called `sum_active(items)`, and the threshold-log into `log_threshold(total)`. Update process() to call them.",
	"expected": "function sum_active(items) { let total = 0; for (const i of items) { if (i.active) { total += i.value; } } return total; } function log_threshold(total) { if (total > 100) { console.log('big'); } else { console.log('small'); } } function process(items) { const total = sum_active(items); log_threshold(total); return total; }"
	},
	{
	"id": "adversarial-type-narrowing",
	"intent": "type-narrowing",
	"language": "typescript",
	"expectedTier1": false,
	"comment": "Requires understanding union types + control flow. Booster pattern-replace will likely fail or produce a low-confidence merge.",
	"code": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { return r.value; }",
	"edit": "Narrow `r` before accessing `value` — only return `r.value` when `r.kind === 'ok'`, otherwise return -1.",
	"expected": "type Result = { kind: 'ok'; value: number } | { kind: 'err'; message: string }; function describe(r: Result) { if (r.kind === 'ok') { return r.value; } return -1; }"
	},
	{
	"id": "adversarial-cross-method-rename",
	"intent": "rename-symbol",
	"language": "typescript",
	"expectedTier1": false,
	"comment": "Renaming requires reasoning about ALL call sites; booster sees only the snippet boundary.",
	"code": "class Cache { get(k: string) { return this.store[k]; } set(k: string, v: any) { this.store[k] = v; } private store: Record<string, any> = {}; } const c = new Cache(); c.set('a', 1); console.log(c.get('a'));",
	"edit": "Rename Cache.get to Cache.lookup and update all call sites.",
	"expected": "class Cache { lookup(k: string) { return this.store[k]; } set(k: string, v: any) { this.store[k] = v; } private store: Record<string, any> = {}; } const c = new Cache(); c.set('a', 1); console.log(c.lookup('a'));"
	},
	{
	"id": "adversarial-recursive-rewrite",
	"intent": "recursive-to-iterative",
	"language": "javascript",
	"expectedTier1": false,
	"comment": "Algorithmic transformation — requires reasoning, not pattern replacement.",
	"code": "function factorial(n) { return n <= 1 ? 1 : n * factorial(n - 1); }",
	"edit": "Rewrite as an iterative loop with no recursion.",
	"expected": "function factorial(n) { let result = 1; for (let i = 2; i <= n; i++) { result *= i; } return result; }"
	},
	{
	"id": "var-to-let-1",
	"intent": "var-to-let",
	"language": "javascript",
	"expectedTier1": true,
	"code": "var i = 0; for (i = 0; i < 10; i++) { sink(i); }",
	"edit": "let i = 0; for (i = 0; i < 10; i++) { sink(i); }",
	"expected": "let i = 0; for (i = 0; i < 10; i++) { sink(i); }"
	},
	{
	"id": "double-quote-to-single",
	"intent": "string-quote-style",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function greet(name) { return \"hello \" + name; }",
	"edit": "function greet(name) { return 'hello ' + name; }",
	"expected": "function greet(name) { return 'hello ' + name; }"
	},
	{
	"id": "add-readonly-prop",
	"intent": "add-readonly",
	"language": "typescript",
	"expectedTier1": true,
	"code": "class Box { value: number = 0; }",
	"edit": "class Box { readonly value: number = 0; }",
	"expected": "class Box { readonly value: number = 0; }"
	},
	{
	"id": "remove-debugger",
	"intent": "remove-debug",
	"language": "javascript",
	"expectedTier1": true,
	"code": "function go() { debugger; doWork(); }",
	"edit": "function go() { doWork(); }",
	"expected": "function go() { doWork(); }"
	},
	{
	"id": "import-add-named",
	"intent": "import-add",
	"language": "typescript",
	"expectedTier1": true,
	"code": "import { foo } from './x'; foo();",
	"edit": "import { foo, bar } from './x'; foo();",
	"expected": "import { foo, bar } from './x'; foo();"
	},
	{
	"id": "remove-trailing-semis",
	"intent": "format-tweak",
	"language": "javascript",
	"expectedTier1": true,
	"code": "const a = 1;; const b = 2;;",
	"edit": "const a = 1; const b = 2;",
	"expected": "const a = 1; const b = 2;"
	},
	{
	"id": "adversarial-extract-class",
	"intent": "extract-class",
	"language": "typescript",
	"expectedTier1": false,
	"comment": "Splitting one class into two with delegation requires reasoning about responsibility boundaries.",
	"code": "class UserService { saveUser(u: any) { db.put(u); audit.log('save', u.id); } private db = realDb; private audit = { log: (k: string, v: any) => console.log(k, v) }; }",
	"edit": "Extract the `audit` concern into its own AuditService class and have UserService delegate to it.",
	"expected": "class AuditService { log(k: string, v: any) { console.log(k, v); } } class UserService { saveUser(u: any) { db.put(u); this.audit.log('save', u.id); } private db = realDb; private audit = new AuditService(); }"
	},
	{
	"id": "adversarial-callback-to-promise",
	"intent": "callback-to-promise",
	"language": "javascript",
	"expectedTier1": false,
	"comment": "Restructuring control flow from callbacks to async/await requires understanding execution semantics, not pattern matching.",
	"code": "function loadConfig(cb) { fs.readFile('./c.json', (err, data) => { if (err) cb(err); else cb(null, JSON.parse(data)); }); }",
	"edit": "Rewrite as an async function that returns a Promise<Config> using fs.promises and try/catch.",
	"expected": "async function loadConfig() { try { const data = await fs.promises.readFile('./c.json'); return JSON.parse(data); } catch (err) { throw err; } }"
	},
	{
	"id": "adversarial-deeply-nested-conditional",
	"intent": "early-return-flatten",
	"language": "javascript",
	"expectedTier1": false,
	"comment": "Flattening nested conditionals into guard clauses requires reasoning about boolean logic + control flow.",
	"code": "function process(x) { if (x) { if (x.valid) { if (x.size > 0) { return doWork(x); } else { return null; } } else { return null; } } else { return null; } }",
	"edit": "Flatten using early returns / guard clauses — return null at each failed precondition.",
	"expected": "function process(x) { if (!x) return null; if (!x.valid) return null; if (x.size <= 0) return null; return doWork(x); }"
	}
	]
	}