Skip to content

Instantly share code, notes, and snippets.

@mikasenghaas
Last active May 28, 2026 19:53
Show Gist options
  • Select an option

  • Save mikasenghaas/4d470f0537f5fc8d3c8117ea80d6e01a to your computer and use it in GitHub Desktop.

Select an option

Save mikasenghaas/4d470f0537f5fc8d3c8117ea80d6e01a to your computer and use it in GitHub Desktop.
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = ["renderers", "transformers>=4.46", "jinja2", "rich"]
# ///
"""
Replay the TITO failure modes from tito_failure_modes.py using
PrimeIntellect's `renderers` package (hand-coded per-model bridges)
instead of the naive `apply_chat_template`-delta algorithm.
Verdict per case:
✓ bridge succeeded — the failure mode is handled
⏭ bridge returned None — graceful MITO fallback (intended)
❌ bridge crashed or buffer broken
Comparing to HF's `apply_chat_template` is not the right baseline anymore:
the renderer's bridge intentionally preserves engine bytes that canonical
re-render would strip (sampled tokens, stop markers, past `reasoning_content`).
Each case prints the buffer the renderer produced so you can see it.
"""
from transformers import AutoTokenizer
from renderers import create_renderer
from rich.console import Console
from rich.text import Text
console = Console()
def tito_with_bridge(r, messages, tools=None, truncate_assistant=None):
    """Rebuild a token buffer for ``messages`` using the renderer's bridge.

    The first message is rendered with a generation prompt to seed the
    buffer. Every later message is then folded in, in order:

    * an assistant message is treated as sampled tokens: the part of the
      full render that follows the generation-prompt render of the
      preceding messages is appended to the buffer;
    * any other message is passed to ``r.bridge_to_next_turn`` together
      with the last prompt ids and the last completion ids, and the
      returned ``token_ids`` replace the buffer.

    Args:
        r: Renderer object providing ``render_ids`` and
            ``bridge_to_next_turn`` as they are called below.
        messages: Chat messages (dicts with at least a ``"role"`` key).
        tools: Optional tool definitions forwarded to every renderer call.
        truncate_assistant: Optional mapping from a message index to the
            number of trailing sampled tokens to drop for that assistant
            message.

    Returns:
        The final list of token ids, or ``None`` as soon as one bridge call
        returns ``None``.
    """
    cut_by_index = truncate_assistant if truncate_assistant else {}

    seed = r.render_ids([messages[0]], tools=tools, add_generation_prompt=True)
    token_buffer = list(seed)
    prompt_ids = list(token_buffer)
    completion_ids: list[int] = []

    for position, message in enumerate(messages[1:], start=1):
        if message["role"] != "assistant":
            # Non-assistant turn: let the renderer extend prompt + completion.
            bridged = r.bridge_to_next_turn(
                prompt_ids, completion_ids, [message], tools=tools
            )
            if bridged is None:
                return None
            token_buffer = list(bridged.token_ids)
            prompt_ids = list(token_buffer)
            completion_ids = []
            continue

        # Assistant turn: the "sampled" tokens are whatever the full render
        # adds on top of the generation-prompt render of the earlier messages.
        rendered = r.render_ids(messages[: position + 1], tools=tools)
        head = r.render_ids(
            messages[:position], tools=tools, add_generation_prompt=True
        )
        tail = rendered[len(head):]
        drop = cut_by_index.get(position, 0)
        if drop:
            tail = tail[:-drop]
        token_buffer.extend(tail)
        completion_ids = list(tail)

    return token_buffer
def report(tok, label, buffer):
    """Print the verdict header for one case, followed by the decoded buffer.

    Args:
        tok: Tokenizer whose ``decode`` turns the token ids back into text.
        label: Case title shown inside the horizontal rule.
        buffer: Token ids produced by the bridge, or ``None`` when the bridge
            returned ``None``.
    """
    console.print()
    mark = "[yellow]⏭[/yellow]" if buffer is None else "[green]✓[/green]"
    console.rule(f"{mark} [bold]{label}[/bold]")
    console.print()
    if buffer is None:
        console.print("[yellow]→ bridge returned None — graceful MITO fallback[/yellow]")
        return
    console.print("[bold]BRIDGE BUFFER[/bold]")
    # The decoded text is data, not console markup. Printed as a plain str,
    # Rich would parse bracketed tokens such as "[gMASK]" as style tags and
    # drop them from the output (and raise on a stray "[/...]"), so the
    # displayed buffer would not match the real one. A Text renderable is
    # printed verbatim.
    console.print(Text(tok.decode(buffer)))
# Tool definition for a single "calc" function that takes one required
# string argument, ``expr``. Passed as ``tools=[CALC_TOOL]`` to the renderer.
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calc",
        "description": "Compute an arithmetic expression.",
        "parameters": {
            "type": "object",
            "properties": {
                "expr": {"type": "string"},
            },
            "required": ["expr"],
        },
    },
}
# Four-message conversation: user question, assistant tool call to "calc",
# the tool's result, and the assistant's final answer.
TOOL_TRACE = [
    {"role": "user", "content": "What's 2+2?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "type": "function",
                "function": {"name": "calc", "arguments": {"expr": "2+2"}},
            },
        ],
    },
    {"role": "tool", "name": "calc", "content": "4"},
    {"role": "assistant", "content": "It's 4."},
]
def make_renderer(model_id, **kwargs):
    """Load the tokenizer for ``model_id`` and build a renderer on top of it.

    Args:
        model_id: Model identifier passed to ``AutoTokenizer.from_pretrained``.
        **kwargs: Extra options forwarded unchanged to ``create_renderer``.

    Returns:
        A ``(tokenizer, renderer)`` tuple.
    """
    # NOTE(review): trust_remote_code=True is passed to the tokenizer loader;
    # confirm that this is acceptable for every model id used with this helper.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    renderer = create_renderer(tokenizer, **kwargs)
    return tokenizer, renderer
def case_glm45_reasoning_preserved():
    """Run a two-round GLM-4.5-Air chat whose assistant turns carry reasoning.

    The renderer is created with ``preserve_all_thinking=True``; the resulting
    buffer (or the ``None`` fallback) is handed to ``report``.
    """
    tok, r = make_renderer("zai-org/GLM-4.5-Air", preserve_all_thinking=True)

    first_round = [
        {"role": "user", "content": "What's 2+2?"},
        {
            "role": "assistant",
            "content": "It's 4.",
            "reasoning_content": "Two plus two is four.",
        },
    ]
    second_round = [
        {"role": "user", "content": "And 3+3?"},
        {
            "role": "assistant",
            "content": "It's 6.",
            "reasoning_content": "Three plus three is six.",
        },
    ]
    conversation = first_round + second_round

    title = "GLM-4.5-Air — reasoning_content preserved across user turns"
    report(tok, title, tito_with_bridge(r, conversation))
def case_truncation():
    """Run a Qwen3 chat whose first assistant turn is cut off before its close.

    The first assistant message ("It's") has its trailing closing tokens
    removed through ``truncate_assistant``, so the simulated completion does
    not end in a stop token. The resulting buffer (or the ``None`` fallback)
    is handed to ``report``.
    """
    tok, r = make_renderer("Qwen/Qwen3-4B")  # use a model with hand-coded renderer
    trace = [
        {"role": "user", "content": "What's 2+2?"},
        {"role": "assistant", "content": "It's"},
        {"role": "user", "content": "Are you sure?"},
        {"role": "assistant", "content": "Yes, 4."},
    ]
    # Chop the trailing close so the sampled assistant doesn't end in a stop token.
    full = r.render_ids(trace[:2])
    prefix = r.render_ids(trace[:1], add_generation_prompt=True)
    sampled = list(full[len(prefix):])
    content_ids = tok.encode("It's", add_special_tokens=False)
    width = len(content_ids)

    # Fallback: plain length arithmetic, exact only when the sampled span is
    # precisely "content + close".
    close_len = len(sampled) - width
    # Prefer locating the content inside the sampled span and counting what
    # follows it. If the renderer emits tokens *before* the content
    # (presumably possible, e.g. an empty think block — confirm against the
    # Qwen3 renderer), the arithmetic above would over-count and the
    # truncation would eat into the content itself.
    for start in range(len(sampled) - width, -1, -1):
        if sampled[start:start + width] == content_ids:
            close_len = len(sampled) - (start + width)
            break

    buf = tito_with_bridge(r, trace, truncate_assistant={1: max(close_len, 1)})
    report(tok, "Qwen3 — assistant truncated mid-turn (no EOS)", buf)
def case_observation_duplication():
    """Bridge a GLM-5.1 tool result after a completion ending in <|observation|>.

    The assistant completion is built from the rendered tool-call turn with
    the ``<|observation|>`` token id appended, simulating an engine that
    stopped on that token. After bridging to the tool message, the buffer is
    reported and the number of ``<|observation|>`` ids in it is printed.
    """
    tok, r = make_renderer("zai-org/GLM-5.1")
    obs_id = tok.convert_tokens_to_ids("<|observation|>")
    tool_list = [CALC_TOOL]

    user_turn = {"role": "user", "content": "What's 2+2?"}
    call_turn = {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "type": "function",
                "function": {"name": "calc", "arguments": {"expr": "2+2"}},
            },
        ],
    }
    tool_result = {"role": "tool", "name": "calc", "content": "4"}

    prompt_ids = r.render_ids([user_turn], tools=tool_list, add_generation_prompt=True)
    rendered_pre = r.render_ids([user_turn, call_turn], tools=tool_list)
    # Simulate engine stopping on <|observation|>: append it to the completion.
    completion_ids = list(rendered_pre[len(prompt_ids):])
    completion_ids.append(obs_id)

    bridged = r.bridge_to_next_turn(
        previous_prompt_ids=prompt_ids,
        previous_completion_ids=completion_ids,
        new_messages=[tool_result],
        tools=tool_list,
    )
    if bridged is None:
        buf = None
    else:
        buf = list(bridged.token_ids)

    report(
        tok,
        "GLM-5.1 — engine stops on <|observation|> (would duplicate naively)",
        buf,
    )
    if buf is None:
        return

    # Exactly one occurrence means the bridge did not add a second marker.
    hits = buf.count(obs_id)
    if hits == 1:
        verdict = "[green]no duplication[/green]"
    else:
        verdict = f"[red]{hits}× <|observation|>[/red]"
    console.print(f"\n[bold]<|observation|> count in buffer:[/bold] {verdict}")
if __name__ == "__main__":
    # Run every case independently: a crash in one case (the "❌" verdict)
    # is reported and must not prevent the remaining cases from running.
    failed_cases = []
    for case in (
        case_observation_duplication,
        case_glm45_reasoning_preserved,
        case_truncation,
    ):
        try:
            case()
        except Exception:  # top-level boundary: report, then continue
            failed_cases.append(case.__name__)
            console.print()
            console.rule(f"[red]❌[/red] [bold]{case.__name__}[/bold]")
            console.print()
            console.print(Text("→ bridge crashed or buffer broken", style="red"))
            console.print_exception()
    if failed_cases:
        # Keep a non-zero exit status, as an uncaught exception would give.
        raise SystemExit(1)
@mikasenghaas
Copy link
Copy Markdown
Author

mikasenghaas commented May 28, 2026

Run with

curl -sSL https://gist.githubusercontent.com/mikasenghaas/4d470f0537f5fc8d3c8117ea80d6e01a/raw/c8083be297bb356349771a4231d906a5d915250b/tito_with_renderers.py | uv run --script -

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment