Last active
May 28, 2026 19:53
-
-
Save mikasenghaas/4d470f0537f5fc8d3c8117ea80d6e01a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script | |
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = ["renderers", "transformers>=4.46", "jinja2", "rich"] | |
| # /// | |
| """ | |
| Replay the TITO failure modes from tito_failure_modes.py using | |
| PrimeIntellect's `renderers` package (hand-coded per-model bridges) | |
| instead of the naive `apply_chat_template`-delta algorithm. | |
| Verdict per case: | |
| ✓ bridge succeeded — the failure mode is handled | |
| ⏭ bridge returned None — graceful MITO fallback (intended) | |
| ❌ bridge crashed or buffer broken (this script does not print ❌ itself: a crash surfaces as a Python traceback) | |
| Comparing to HF's `apply_chat_template` is not the right baseline anymore: | |
| the renderer's bridge intentionally preserves engine bytes that canonical | |
| re-render would strip (sampled tokens, stop markers, past `reasoning_content`). | |
| Each case prints the buffer the renderer produced so you can see it. | |
| """ | |
| from transformers import AutoTokenizer | |
| from renderers import create_renderer | |
| from rich.console import Console | |
| from rich.text import Text | |
# Shared rich console; the reporting helpers in this script print through it.
console = Console()
def tito_with_bridge(r, messages, tools=None, truncate_assistant=None):
    """Build a token buffer for ``messages`` turn by turn via the renderer's bridge.

    The first message is rendered with a generation prompt to seed the buffer.
    Each later assistant message contributes the tokens its full render adds
    on top of the preceding prompt (optionally with trailing tokens removed);
    every other message is appended through ``r.bridge_to_next_turn``.

    Args:
        r: Renderer exposing ``render_ids`` and ``bridge_to_next_turn``.
        messages: Chat messages, each a dict with at least a ``"role"`` key.
        tools: Optional tool definitions forwarded to every renderer call.
        truncate_assistant: Optional mapping from message index to the number
            of trailing tokens to drop from that assistant turn.

    Returns:
        The final token buffer as a list, or ``None`` as soon as the bridge
        returns ``None`` for a turn.
    """
    cuts = truncate_assistant or {}

    prompt_ids = list(
        r.render_ids([messages[0]], tools=tools, add_generation_prompt=True)
    )
    tokens = list(prompt_ids)
    completion_ids: list[int] = []

    for idx, turn in enumerate(messages[1:], start=1):
        if turn["role"] != "assistant":
            # Non-assistant turn: let the renderer extend the previous
            # prompt + completion with the new message.
            bridged = r.bridge_to_next_turn(
                prompt_ids, completion_ids, [turn], tools=tools
            )
            if bridged is None:
                return None
            tokens = list(bridged.token_ids)
            prompt_ids = list(tokens)
            completion_ids = []
            continue

        # Assistant turn: the "sampled" tokens are whatever the full render
        # adds beyond the prompt that ends in a generation prompt.
        rendered = r.render_ids(messages[: idx + 1], tools=tools)
        prompt_len = len(
            r.render_ids(messages[:idx], tools=tools, add_generation_prompt=True)
        )
        new_ids = rendered[prompt_len:]

        drop = cuts.get(idx, 0)
        if drop:
            new_ids = new_ids[:-drop]

        tokens.extend(new_ids)
        completion_ids = list(new_ids)

    return tokens
def report(tok, label, buffer):
    """Print the verdict header for one case and, when present, its buffer.

    Args:
        tok: Tokenizer whose ``decode`` turns the buffer back into text.
        label: Human-readable case title shown in the rule.
        buffer: Token ids produced for the case, or ``None`` when the bridge
            returned ``None``.
    """
    fell_back = buffer is None
    if fell_back:
        mark = "[yellow]⏭[/yellow]"
    else:
        mark = "[green]✓[/green]"

    console.print()
    console.rule(f"{mark} [bold]{label}[/bold]")
    console.print()

    if not fell_back:
        console.print("[bold]BRIDGE BUFFER[/bold]")
        console.print(tok.decode(buffer))
        return

    console.print("[yellow]→ bridge returned None — graceful MITO fallback[/yellow]")
# Tool definition for a calculator function taking one required string
# argument, ``expr``.
CALC_TOOL = dict(
    type="function",
    function=dict(
        name="calc",
        description="Compute an arithmetic expression.",
        parameters=dict(
            type="object",
            properties=dict(expr=dict(type="string")),
            required=["expr"],
        ),
    ),
)
# Four-message conversation: user question, assistant tool call to ``calc``,
# the tool's result, and the assistant's final answer.
TOOL_TRACE = [
    dict(role="user", content="What's 2+2?"),
    dict(
        role="assistant",
        content="",
        tool_calls=[
            dict(
                type="function",
                function=dict(name="calc", arguments=dict(expr="2+2")),
            ),
        ],
    ),
    dict(role="tool", name="calc", content="4"),
    dict(role="assistant", content="It's 4."),
]
def make_renderer(model_id, **kwargs):
    """Load the tokenizer for ``model_id`` and wrap it in a renderer.

    Args:
        model_id: Model identifier passed to ``AutoTokenizer.from_pretrained``.
        **kwargs: Extra options forwarded unchanged to ``create_renderer``.

    Returns:
        A ``(tokenizer, renderer)`` tuple.
    """
    # NOTE(review): trust_remote_code=True allows the model repository to run
    # its own Python code while loading — confirm it is required for the
    # models used in this script.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    renderer = create_renderer(tokenizer, **kwargs)
    return tokenizer, renderer
def case_glm45_reasoning_preserved():
    """Run a two-exchange GLM-4.5-Air chat whose assistant turns carry reasoning.

    The renderer is created with ``preserve_all_thinking=True``; the resulting
    buffer (or fallback) is printed through ``report``.
    """
    tok, r = make_renderer("zai-org/GLM-4.5-Air", preserve_all_thinking=True)

    # (user question, assistant answer, assistant reasoning) per exchange.
    exchanges = (
        ("What's 2+2?", "It's 4.", "Two plus two is four."),
        ("And 3+3?", "It's 6.", "Three plus three is six."),
    )
    trace = []
    for question, answer, reasoning in exchanges:
        trace.append({"role": "user", "content": question})
        trace.append(
            {"role": "assistant", "content": answer, "reasoning_content": reasoning}
        )

    report(
        tok,
        "GLM-4.5-Air — reasoning_content preserved across user turns",
        tito_with_bridge(r, trace),
    )
def case_truncation():
    """Run a Qwen3 chat whose first assistant turn is cut off before its close.

    The number of trailing tokens to drop is derived from the renderer's own
    output, then passed to ``tito_with_bridge`` as the truncation for message
    index 1.
    """
    tok, r = make_renderer("Qwen/Qwen3-4B")  # use a model with hand-coded renderer

    partial_answer = "It's"
    trace = [
        {"role": "user", "content": "What's 2+2?"},
        {"role": "assistant", "content": partial_answer},
        {"role": "user", "content": "Are you sure?"},
        {"role": "assistant", "content": "Yes, 4."},
    ]

    # Chop the trailing close so the sampled assistant doesn't end in a stop
    # token: close length = (assistant span) - (content tokens).
    # NOTE(review): this assumes the assistant span is exactly content + close;
    # if the renderer emits extra tokens before the content (e.g. an empty
    # thinking block), the cut would also remove content — confirm.
    rendered_len = len(r.render_ids(trace[:2]))
    prompt_len = len(r.render_ids(trace[:1], add_generation_prompt=True))
    content_len = len(tok.encode(partial_answer, add_special_tokens=False))
    close_len = rendered_len - prompt_len - content_len

    # Always drop at least one token so the turn is genuinely truncated.
    cut = max(close_len, 1)
    buf = tito_with_bridge(r, trace, truncate_assistant={1: cut})
    report(tok, "Qwen3 — assistant truncated mid-turn (no EOS)", buf)
def case_observation_duplication():
    """Check that bridging after a completion ending in <|observation|> keeps one copy.

    Renders a user question plus an assistant tool call for GLM-5.1, appends
    the ``<|observation|>`` token id to the completion, bridges to the tool
    result, and reports how many times that id occurs in the bridged buffer.
    """
    tok, r = make_renderer("zai-org/GLM-5.1")
    obs_id = tok.convert_tokens_to_ids("<|observation|>")

    user_msg = {"role": "user", "content": "What's 2+2?"}
    call_msg = {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "type": "function",
                "function": {"name": "calc", "arguments": {"expr": "2+2"}},
            }
        ],
    }
    tool_msg = {"role": "tool", "name": "calc", "content": "4"}

    prompt_ids = r.render_ids(
        [user_msg], tools=[CALC_TOOL], add_generation_prompt=True
    )
    rendered = r.render_ids([user_msg, call_msg], tools=[CALC_TOOL])

    # Simulate engine stopping on <|observation|>: append it to the completion.
    completion_ids = [*rendered[len(prompt_ids):], obs_id]

    bridge = r.bridge_to_next_turn(
        previous_prompt_ids=prompt_ids,
        previous_completion_ids=completion_ids,
        new_messages=[tool_msg],
        tools=[CALC_TOOL],
    )
    buf = None if bridge is None else list(bridge.token_ids)
    report(
        tok,
        "GLM-5.1 — engine stops on <|observation|> (would duplicate naively)",
        buf,
    )
    if buf is None:
        return

    count = buf.count(obs_id)
    if count == 1:
        verdict = "[green]no duplication[/green]"
    else:
        verdict = f"[red]{count}× <|observation|>[/red]"
    console.print(f"\n[bold]<|observation|> count in buffer:[/bold] {verdict}")
if __name__ == "__main__":
    # Run every case in a fixed order; each one prints its own report.
    for run_case in (
        case_observation_duplication,
        case_glm45_reasoning_preserved,
        case_truncation,
    ):
        run_case()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run with
curl -sSL https://gist.githubusercontent.com/mikasenghaas/4d470f0537f5fc8d3c8117ea80d6e01a/raw/c8083be297bb356349771a4231d906a5d915250b/tito_with_renderers.py | uv run --script -