Skip to content

Instantly share code, notes, and snippets.

@mikasenghaas
Last active May 28, 2026 19:54
Show Gist options
  • Select an option

  • Save mikasenghaas/e336d15761cf49af5e4eb662356a5d78 to your computer and use it in GitHub Desktop.

Select an option

Save mikasenghaas/e336d15761cf49af5e4eb662356a5d78 to your computer and use it in GitHub Desktop.
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = ["transformers", "jinja2", "rich"]
# ///
from transformers import AutoTokenizer
from rich.console import Console
from rich.text import Text
# Module-wide rich console; report() sends all of its output through it.
console = Console()
def render_factory(tok, tools=None):
    """Build a ``render(msgs, **kw)`` closure that tokenizes a chat with ``tok``.

    Args:
        tok: Object exposing ``apply_chat_template`` (the callers in this file
            pass a Hugging Face tokenizer).
        tools: Optional tool schemas. When not ``None`` they are sent as
            ``tools=`` on every call and replace any ``tools`` keyword the
            caller passed to ``render``.

    Returns:
        A function ``render(msgs, **kw)`` returning the result of
        ``tok.apply_chat_template(msgs, tokenize=True, return_dict=False, **kw)``.
    """

    def render(msgs, **kw):
        # ``kw`` is a fresh dict per call, so injecting ``tools`` here does not
        # leak between calls.
        if tools is not None:
            kw["tools"] = tools
        return tok.apply_chat_template(msgs, tokenize=True, return_dict=False, **kw)

    return render
def tito_simulation(tok, messages, tools=None, truncate_assistant=None):
    """Simulate the TITO algorithm turn-by-turn.

    Assistant turns: append "sampled" tokens (here approximated by re-render).
    User/tool turns: append delta =
    render(prefix + msg, add_generation_prompt=True) - render(prefix).

    Args:
        tok: Tokenizer handed to ``render_factory``.
        messages: Non-empty list of chat messages (dicts with a ``"role"``
            key). The first message seeds the buffer and is rendered with the
            generation prompt.
        tools: Optional tool schemas forwarded to every render.
        truncate_assistant: Optional mapping from message index to the number
            of trailing tokens to drop from that assistant turn's "sampled"
            tokens. Missing or falsy entries mean no truncation.

    Returns:
        ``(incremental, expected)``: the token buffer accumulated turn by turn
        and the one-shot render of the whole conversation.

    Raises:
        ValueError: If a truncation count is negative.
    """
    truncate_assistant = truncate_assistant or {}
    render = render_factory(tok, tools)

    # Copy the first render so the in-place extends below never modify a list
    # that the tokenizer handed back.
    incremental = list(render([messages[0]], add_generation_prompt=True))

    for i in range(1, len(messages)):
        history = messages[:i]
        through_current = messages[: i + 1]
        if messages[i]["role"] == "assistant":
            # "Sampled" tokens = everything the full render adds after the
            # generation prompt that preceded this assistant turn.
            with_msg = render(through_current)
            without_msg = render(history, add_generation_prompt=True)
            sampled = with_msg[len(without_msg):]
            n = truncate_assistant.get(i, 0)
            if n:
                if n < 0:
                    # sampled[:-n] with a negative n would silently keep the
                    # first |n| tokens instead of dropping a tail.
                    raise ValueError(
                        f"truncate_assistant[{i}] must be non-negative, got {n}"
                    )
                sampled = sampled[:-n]
            incremental.extend(sampled)
        else:
            # Bridge tokens = render up to and including this message (plus the
            # generation prompt) minus the render of everything before it.
            prefix = render(history)
            with_gen = render(through_current, add_generation_prompt=True)
            incremental.extend(with_gen[len(prefix):])

    return incremental, render(messages)
def report(
    tok,
    label,
    incremental,
    expected,
    description=None,
    also_affects=None,
):
    """Print a pass/fail banner for one case and, on mismatch, both buffers.

    The case passes when ``incremental == expected``. On failure the two token
    sequences are decoded with ``tok.decode`` and printed one after the other;
    the text from the first differing position onward is styled red.

    Args:
        tok: Tokenizer used to decode token ids for display.
        label: Title shown in the rule line.
        incremental: Token sequence produced by the simulated TITO buffer.
        expected: Token sequence from the one-shot render.
        description: Optional explanatory text printed dimmed under the rule.
        also_affects: Optional note appended after ``description``.

    Returns:
        None. All output goes to the module-level ``console``.
    """
    # Imported locally so this function does not depend on the file's import
    # block being extended.
    from rich.markup import escape

    console.print()
    ok = incremental == expected
    mark = "[green]✓[/green]" if ok else "[red]❌[/red]"
    # Caller-provided text is escaped so bracketed fragments (e.g. token names
    # such as "[gMASK]") are printed literally instead of being parsed as
    # markup tags.
    console.rule(f"{mark} [bold]{escape(label)}[/bold]")
    console.print()

    msg = ""
    if description:
        msg += f"[dim]{escape(description)}[/dim]"
    if also_affects:
        msg += f"[dim] also affects: {escape(also_affects)}[/dim]"
    if msg:
        console.print(msg)
    if ok:
        return

    # Index of the first differing token; falls back to the shorter length
    # when one sequence is a strict prefix of the other.
    n = min(len(incremental), len(expected))
    div = next((j for j in range(n) if incremental[j] != expected[j]), n)

    head = tok.decode(expected[:div])
    tail_expected = tok.decode(expected[div:])
    tail_incr = tok.decode(incremental[div:])

    # Text objects are used for the decoded output so that template text is
    # never interpreted as markup.
    console.print("\n[bold]EXPECTED[/bold]")
    t = Text()
    t.append(head)
    t.append(tail_expected, style="red")
    console.print(t)

    console.print("\n[bold]TITO BUFFER[/bold]")
    t = Text()
    t.append(head)
    t.append(tail_incr, style="red")
    console.print(t)
# Function-tool schema for a calculator taking one required string argument,
# ``expr``.
CALC_TOOL = dict(
    type="function",
    function=dict(
        name="calc",
        description="Compute an arithmetic expression.",
        parameters=dict(
            type="object",
            properties=dict(expr=dict(type="string")),
            required=["expr"],
        ),
    ),
)
# Four-message tool-calling conversation: user question, assistant ``calc``
# tool call, tool result, assistant final answer.
TOOL_TRACE = [
    dict(role="user", content="What's 2+2?"),
    dict(
        role="assistant",
        content="",
        tool_calls=[
            dict(
                type="function",
                function=dict(name="calc", arguments=dict(expr="2+2")),
            )
        ],
    ),
    dict(role="tool", name="calc", content="4"),
    dict(role="assistant", content="It's 4."),
]
def case_non_preserve_thinking():
    """Run the TITO simulation on a two-round reasoning chat and report it.

    Loads the ``zai-org/GLM-4.5`` tokenizer (with ``trust_remote_code=True``),
    builds a user/assistant/user/assistant trace in which both assistant
    messages carry ``reasoning_content``, and hands the incremental and
    one-shot token sequences to ``report``.
    """
    tok = AutoTokenizer.from_pretrained("zai-org/GLM-4.5", trust_remote_code=True)
    trace = [
        {"role": "user", "content": "What's 2+2?"},
        {
            "role": "assistant",
            "content": "It's 4.",
            "reasoning_content": "Two plus two is four.",
        },
        {"role": "user", "content": "And 3+3?"},
        {
            "role": "assistant",
            "content": "It's 6.",
            "reasoning_content": "Three plus three is six.",
        },
    ]
    incr, exp = tito_simulation(tok, trace)
    report(
        tok,
        "GLM/Minimax/... - reasoning_content stripped on past assistant turns",
        incr,
        exp,
        description=(
            "Past assistant turns get their <think>{reasoning_content}</think> "
            "stripped to empty <think></think>; the current/last assistant "
            "keeps the reasoning. As a new user shifts an assistant from "
            "current to past, the reasoning vanishes — the bridge slice "
            "misaligns by exactly that length and eats into the next user "
            "message."
        ),
        also_affects="Any model that does drop or interleaved thinking. Proposed TITO forces preserve thinking.",
    )
def case_truncation():
    """Run the TITO simulation with a truncated first assistant turn.

    Loads the ``Qwen/Qwen2.5-0.5B-Instruct`` tokenizer, measures how many
    tokens the ``<|im_end|>`` + newline closing sequence encodes to, and drops
    that many trailing tokens from the assistant message at index 1 before
    comparing the incremental buffer with the one-shot render via ``report``.
    """
    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    trace = [
        {"role": "user", "content": "What's 2+2?"},
        {"role": "assistant", "content": "It's"},
        {"role": "user", "content": "Are you sure?"},
        {"role": "assistant", "content": "Yes, 4."},
    ]
    # Token length of the turn-closing sequence that the truncation removes.
    eos_tail = len(tok.encode("<|im_end|>\n", add_special_tokens=False))
    incr, exp = tito_simulation(tok, trace, truncate_assistant={1: eos_tail})
    report(
        tok,
        "Any model — truncation drops EOS token",
        incr,
        exp,
        description=(
            "When the sampler hits max_tokens the assistant turn ends "
            "without <|im_end|>. TITO's delta computes "
            "render([U,A,U]) - render([U,A]); the template always closes A "
            "with <|im_end|>\\n, so that closing sequence lands in the "
            "prefix and never in the delta. Buffer has truncated content "
            "butted directly against the next role marker — model trains "
            "to skip EOS at turn boundaries."
        ),
        also_affects="every model — truncation is a runtime event independent of template",
    )
def case_glm_eos():
    """Show the doubled ``<|observation|>`` produced by a naive bridge stitch.

    Loads the ``zai-org/GLM-5.1`` tokenizer (with ``trust_remote_code=True``),
    renders the tool-call prefix and the prefix plus tool result, appends the
    ``<|observation|>`` token id to the prefix to stand in for the engine's
    completion, concatenates the bridge tokens after it, and reports the
    result against the one-shot render.
    """
    tok = AutoTokenizer.from_pretrained("zai-org/GLM-5.1", trust_remote_code=True)
    render = render_factory(tok, tools=[CALC_TOOL])
    obs_id = tok.convert_tokens_to_ids("<|observation|>")

    # The conversation is the start of the shared TOOL_TRACE: user question
    # and assistant tool call, then the tool result as the third message.
    pre = TOOL_TRACE[:2]
    full_msgs = TOOL_TRACE[:3]

    prefix_ids = render(pre)
    full_ids = render(full_msgs, add_generation_prompt=True)
    # Tokens the next turn adds on top of the already-rendered prefix.
    bridge = full_ids[len(prefix_ids):]
    engine_output = prefix_ids + [obs_id]
    naive_stitch = engine_output + list(bridge)
    report(
        tok,
        "GLM — naive bridge stitch duplicates <|observation|>",
        naive_stitch,
        full_ids,
        description=(
            "vLLM stops tool-calling generation on <|observation|> (it's "
            "an EOS-like token in GLM's generation config), so the engine's "
            "completion ends with it. The next-turn bridge then prepends "
            "<|observation|> as the tool-message role marker. Naive stitch "
            "yields two of them at the boundary."
        ),
        also_affects=(
            "entire GLM family — GLM-4.5, GLM-4.5-Air, GLM-4.6, GLM-5, GLM-5.1 "
            "all use <|observation|> as the tool-turn role marker"
        ),
    )
if __name__ == "__main__":
    # Run every failure-mode case in turn; each one loads its own tokenizer
    # and prints its own report.
    case_glm_eos()
    case_non_preserve_thinking()
    case_truncation()
@mikasenghaas
Copy link
Copy Markdown
Author

mikasenghaas commented May 28, 2026

Run with

curl -sSL https://gist.githubusercontent.com/mikasenghaas/e336d15761cf49af5e4eb662356a5d78/raw/44635917bb6a363fad64e9506785cd9338f29ab2/tito_failure_modes.py | uv run --script -

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment