mikasenghaas · May 28, 2026 19:54 · mikasenghaas · May 28, 2026
diff --git a/tito_failure_modes.py b/tito_failure_modes.py
 #!/usr/bin/env -S uv run --script
 # /// script
 # requires-python = ">=3.10"
 # dependencies = ["transformers", "jinja2", "rich"]
 # ///
 from transformers import AutoTokenizer
 from rich.console import Console
 from rich.text import Text

 # Shared Rich console; report() sends all of its terminal output through it.
 console = Console()


 def render_factory(tok, tools=None):
    """Return a closure that renders chat messages to token ids with ``tok``.

    The closure calls ``tok.apply_chat_template`` with ``tokenize=True`` and
    ``return_dict=False`` and forwards any extra keyword arguments. When
    ``tools`` is not ``None`` it is always sent as the ``tools`` argument,
    replacing a ``tools`` value supplied by the caller of the closure.
    """
    # Keyword arguments pinned at factory time; empty when no tools were given.
    pinned = {} if tools is None else {"tools": tools}

    def render(msgs, **kw):
        options = {**kw, **pinned}
        return tok.apply_chat_template(
            msgs, tokenize=True, return_dict=False, **options
        )

    return render


 def tito_simulation(tok, messages, tools=None, truncate_assistant=None):
    """Replay ``messages`` one turn at a time the way TITO grows its buffer.

    The buffer starts as the first message rendered with a generation prompt.
    Each later assistant message contributes the tokens its full render adds
    on top of the generation-prompted render of the preceding messages (a
    stand-in for sampled tokens). Every other message contributes the tokens
    its generation-prompted render adds on top of the plain render of the
    preceding messages.

    ``truncate_assistant`` optionally maps a message index to a number of
    trailing tokens to drop from that assistant turn's contribution.

    Returns ``(buffer, reference)`` where ``reference`` is the render of the
    complete ``messages`` list in a single call.
    """
    drop_counts = truncate_assistant or {}
    render = render_factory(tok, tools)
    buffer = render([messages[0]], add_generation_prompt=True)
    for turn, message in enumerate(messages[1:], start=1):
        history = messages[:turn]
        through_turn = messages[: turn + 1]
        if message["role"] != "assistant":
            # Bridge: whatever the template emits after the rendered history.
            rendered_history = render(history)
            with_prompt = render(through_turn, add_generation_prompt=True)
            buffer += with_prompt[len(rendered_history) :]
            continue
        rendered_turn = render(through_turn)
        prompt_only = render(history, add_generation_prompt=True)
        contribution = rendered_turn[len(prompt_only) :]
        dropped = drop_counts.get(turn, 0)
        if dropped:
            # Emulate a cut-off generation by discarding the trailing tokens.
            contribution = contribution[:-dropped]
        buffer += contribution
    return buffer, render(messages)


 def report(
    tok,
    label,
    incremental,
    expected,
    description=None,
    also_affects=None,
 ):
    """Print a pass/fail banner for one case and, on mismatch, both sequences.

    ``incremental`` and ``expected`` are compared for equality. A rule line
    with a check or cross mark and ``label`` is always printed, followed by
    the optional ``description`` / ``also_affects`` notes. When the sequences
    differ, both are decoded with ``tok`` and printed with the part up to the
    first differing position unstyled and the remainder in red.
    """
    console.print()
    matches = incremental == expected
    if matches:
        badge = "[green]✓[/green]"
    else:
        badge = "[red]❌[/red]"
    console.rule(f"{badge} [bold]{label}[/bold]")
    console.print()

    notes = []
    if description:
        notes.append(f"[dim]{description}[/dim]")
    if also_affects:
        notes.append(f"[dim] also affects: {also_affects}[/dim]")
    if notes:
        console.print("".join(notes))
    if matches:
        return

    # Index of the first differing token; the shorter length when one
    # sequence is a prefix of the other.
    split = min(len(incremental), len(expected))
    for position, (got, want) in enumerate(zip(incremental, expected)):
        if got != want:
            split = position
            break

    shared_text = tok.decode(expected[:split])
    expected_rest = tok.decode(expected[split:])
    buffer_rest = tok.decode(incremental[split:])

    for heading, rest in (
        ("\n[bold]EXPECTED[/bold]", expected_rest),
        ("\n[bold]TITO BUFFER[/bold]", buffer_rest),
    ):
        console.print(heading)
        rendered = Text()
        rendered.append(shared_text)
        rendered.append(rest, style="red")
        console.print(rendered)


 # Function-tool definition named "calc" taking one required string field
 # "expr"; case_glm_eos passes it to render_factory as the tools list.
 CALC_TOOL = dict(
    type="function",
    function=dict(
        name="calc",
        description="Compute an arithmetic expression.",
        parameters=dict(
            type="object",
            properties=dict(expr=dict(type="string")),
            required=["expr"],
        ),
    ),
 )

 # Four-message sample conversation: user question, assistant tool call to
 # "calc", tool result, final assistant answer.
 # NOTE(review): no case in the visible part of this file references it.
 TOOL_TRACE = [
    dict(role="user", content="What's 2+2?"),
    dict(
        role="assistant",
        content="",
        tool_calls=[
            dict(
                type="function",
                function=dict(name="calc", arguments=dict(expr="2+2")),
            ),
        ],
    ),
    dict(role="tool", name="calc", content="4"),
    dict(role="assistant", content="It's 4."),
 ]


 def case_non_preserve_thinking():
    """Run the simulation on a two-round reasoning trace with GLM-4.5's tokenizer.

    Loads ``zai-org/GLM-4.5``, replays a user/assistant/user/assistant trace
    whose assistant messages carry ``reasoning_content``, and hands the TITO
    buffer and the full render to ``report``.
    """
    tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5", trust_remote_code=True)

    def assistant_turn(answer, reasoning):
        return {
            "role": "assistant",
            "content": answer,
            "reasoning_content": reasoning,
        }

    conversation = [
        {"role": "user", "content": "What's 2+2?"},
        assistant_turn("It's 4.", "Two plus two is four."),
        {"role": "user", "content": "And 3+3?"},
        assistant_turn("It's 6.", "Three plus three is six."),
    ]
    buffer_ids, reference_ids = tito_simulation(tokenizer, conversation)
    explanation = (
        "Past assistant turns get their <think>{reasoning_content}</think> "
        "stripped to empty <think></think>; the current/last assistant "
        "keeps the reasoning. As a new user shifts an assistant from "
        "current to past, the reasoning vanishes — the bridge slice "
        "misaligns by exactly that length and eats into the next user "
        "message."
    )
    report(
        tokenizer,
        "GLM/Minimax/... - reasoning_content stripped on past assistant turns",
        buffer_ids,
        reference_ids,
        description=explanation,
        also_affects="Any model that does drop or interleaved thinking. Proposed TITO forces preserve thinking.",
    )


 def case_truncation():
    """Run the simulation with the first assistant turn cut short.

    Loads ``Qwen/Qwen2.5-0.5B-Instruct``, measures how many tokens
    ``"<|im_end|>\\n"`` encodes to, and asks ``tito_simulation`` to drop that
    many trailing tokens from the assistant message at index 1 before
    reporting the buffer against the full render.
    """
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    conversation = [
        {"role": "user", "content": "What's 2+2?"},
        {"role": "assistant", "content": "It's"},
        {"role": "user", "content": "Are you sure?"},
        {"role": "assistant", "content": "Yes, 4."},
    ]
    closing_ids = tokenizer.encode("<|im_end|>\n", add_special_tokens=False)
    tokens_to_drop = len(closing_ids)
    buffer_ids, reference_ids = tito_simulation(
        tokenizer, conversation, truncate_assistant={1: tokens_to_drop}
    )
    explanation = (
        "When the sampler hits max_tokens the assistant turn ends "
        "without <|im_end|>. TITO's delta computes "
        "render([U,A,U]) - render([U,A]); the template always closes A "
        "with <|im_end|>\\n, so that closing sequence lands in the "
        "prefix and never in the delta. Buffer has truncated content "
        "butted directly against the next role marker — model trains "
        "to skip EOS at turn boundaries."
    )
    report(
        tokenizer,
        "Any model — truncation drops EOS token",
        buffer_ids,
        reference_ids,
        description=explanation,
        also_affects="every model — truncation is a runtime event independent of template",
    )


 def case_glm_eos():
    """Compare a hand-stitched GLM tool-call sequence with the full render.

    Loads ``zai-org/GLM-5.1``, renders a user message plus an assistant tool
    call, appends the id of ``<|observation|>`` to that render, then appends
    the slice the generation-prompted render of the same messages plus a tool
    result adds beyond the first render. The stitched list is reported
    against that generation-prompted render.
    """
    tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-5.1", trust_remote_code=True)
    render = render_factory(tokenizer, tools=[CALC_TOOL])
    observation_id = tokenizer.convert_tokens_to_ids("<|observation|>")

    tool_call = {
        "type": "function",
        "function": {"name": "calc", "arguments": {"expr": "2+2"}},
    }
    before_tool = [
        {"role": "user", "content": "What's 2+2?"},
        {"role": "assistant", "content": "", "tool_calls": [tool_call]},
    ]
    after_tool = before_tool + [{"role": "tool", "name": "calc", "content": "4"}]

    before_ids = render(before_tool)
    after_ids = render(after_tool, add_generation_prompt=True)
    bridge_ids = after_ids[len(before_ids) :]
    # Engine output is modelled as the rendered prefix plus the stop token.
    stitched = before_ids + [observation_id] + list(bridge_ids)

    explanation = (
        "vLLM stops tool-calling generation on <|observation|> (it's "
        "an EOS-like token in GLM's generation config), so the engine's "
        "completion ends with it. The next-turn bridge then prepends "
        "<|observation|> as the tool-message role marker. Naive stitch "
        "yields two of them at the boundary."
    )
    affected = (
        "entire GLM family — GLM-4.5, GLM-4.5-Air, GLM-4.6, GLM-5, GLM-5.1 "
        "all use <|observation|> as the tool-turn role marker"
    )
    report(
        tokenizer,
        "GLM — naive bridge stitch duplicates <|observation|>",
        stitched,
        after_ids,
        description=explanation,
        also_affects=affected,
    )


 if __name__ == "__main__":
    # Run each demonstration once, in this fixed order.
    for run_case in (case_glm_eos, case_non_preserve_thinking, case_truncation):
        run_case()
	#!/usr/bin/env -S uv run --script
	# /// script
	# requires-python = ">=3.10"
	# dependencies = ["transformers", "jinja2", "rich"]
	# ///
	from transformers import AutoTokenizer
	from rich.console import Console
	from rich.text import Text

	# Shared Rich console; report() sends all of its terminal output through it.
	console = Console()


	def render_factory(tok, tools=None):
	    """Return a closure that renders chat messages to token ids with ``tok``.

	    The closure calls ``tok.apply_chat_template`` with ``tokenize=True`` and
	    ``return_dict=False`` and forwards any extra keyword arguments. When
	    ``tools`` is not ``None`` it overrides any ``tools`` keyword passed to
	    the closure.
	    """

	    def render(msgs, **kw):
	        if tools is not None:
	            kw["tools"] = tools
	        return tok.apply_chat_template(msgs, tokenize=True, return_dict=False, **kw)

	    return render


	def tito_simulation(tok, messages, tools=None, truncate_assistant=None):
	    """Simulate the TITO algorithm turn-by-turn.

	    Assistant turns: append "sampled" tokens (here approximated by re-render),
	    i.e. render(prefix+msg) minus render(prefix, add_generation_prompt=True).
	    User/tool turns: append delta = render(prefix+msg, add_generation_prompt=True)
	    minus render(prefix).

	    ``truncate_assistant`` optionally maps a message index to the number of
	    trailing tokens to drop from that assistant turn's sampled tokens.

	    Returns ``(incremental, expected)`` where ``expected`` is the render of
	    the whole ``messages`` list in one call.
	    """
	    truncate_assistant = truncate_assistant or {}
	    render = render_factory(tok, tools)
	    incremental = render([messages[0]], add_generation_prompt=True)
	    for i in range(1, len(messages)):
	        if messages[i]["role"] == "assistant":
	            with_msg = render(messages[: i + 1])
	            without_msg = render(messages[:i], add_generation_prompt=True)
	            sampled = with_msg[len(without_msg) :]
	            n = truncate_assistant.get(i, 0)
	            if n:
	                # Emulate a cut-off generation by dropping trailing tokens.
	                sampled = sampled[:-n]
	            incremental += sampled
	        else:
	            prefix = render(messages[:i])
	            with_gen = render(messages[: i + 1], add_generation_prompt=True)
	            incremental += with_gen[len(prefix) :]
	    return incremental, render(messages)


	def report(
	    tok,
	    label,
	    incremental,
	    expected,
	    description=None,
	    also_affects=None,
	):
	    """Print a pass/fail banner for one case and, on mismatch, both sequences.

	    A rule line with a check or cross mark and ``label`` is always printed,
	    followed by the optional ``description`` / ``also_affects`` notes. When
	    ``incremental`` differs from ``expected``, both are decoded with ``tok``
	    and printed with the part before the first differing position unstyled
	    and the remainder in red.
	    """
	    console.print()
	    ok = incremental == expected
	    mark = "[green]✓[/green]" if ok else "[red]❌[/red]"
	    console.rule(f"{mark} [bold]{label}[/bold]")
	    console.print()
	    msg = ""
	    if description:
	        msg += f"[dim]{description}[/dim]"
	    if also_affects:
	        msg += f"[dim] also affects: {also_affects}[/dim]"
	    if msg:
	        console.print(msg)
	    if ok:
	        return
	    # First index where the sequences differ; the shorter length when one
	    # is a prefix of the other.
	    n = min(len(incremental), len(expected))
	    div = next((j for j in range(n) if incremental[j] != expected[j]), n)
	    head = tok.decode(expected[:div])
	    tail_expected = tok.decode(expected[div:])
	    tail_incr = tok.decode(incremental[div:])

	    console.print("\n[bold]EXPECTED[/bold]")
	    t = Text()
	    t.append(head)
	    t.append(tail_expected, style="red")
	    console.print(t)

	    console.print("\n[bold]TITO BUFFER[/bold]")
	    t = Text()
	    t.append(head)
	    t.append(tail_incr, style="red")
	    console.print(t)


	# Function-tool definition named "calc" taking one required string field
	# "expr"; case_glm_eos passes it to render_factory as the tools list.
	CALC_TOOL = dict(
	    type="function",
	    function=dict(
	        name="calc",
	        description="Compute an arithmetic expression.",
	        parameters=dict(
	            type="object",
	            properties=dict(expr=dict(type="string")),
	            required=["expr"],
	        ),
	    ),
	)

	# Four-message sample conversation: user question, assistant tool call to
	# "calc", tool result, final assistant answer.
	# NOTE(review): no case in the visible part of this file references it.
	TOOL_TRACE = [
	    dict(role="user", content="What's 2+2?"),
	    dict(
	        role="assistant",
	        content="",
	        tool_calls=[
	            dict(
	                type="function",
	                function=dict(name="calc", arguments=dict(expr="2+2")),
	            ),
	        ],
	    ),
	    dict(role="tool", name="calc", content="4"),
	    dict(role="assistant", content="It's 4."),
	]


	def case_non_preserve_thinking():
	    """Run the simulation on a two-round reasoning trace with GLM-4.5's tokenizer.

	    Loads ``zai-org/GLM-4.5``, replays a user/assistant/user/assistant trace
	    whose assistant messages carry ``reasoning_content``, and hands the TITO
	    buffer and the full render to ``report``.
	    """
	    tok = AutoTokenizer.from_pretrained("zai-org/GLM-4.5", trust_remote_code=True)
	    trace = [
	        {"role": "user", "content": "What's 2+2?"},
	        {
	            "role": "assistant",
	            "content": "It's 4.",
	            "reasoning_content": "Two plus two is four.",
	        },
	        {"role": "user", "content": "And 3+3?"},
	        {
	            "role": "assistant",
	            "content": "It's 6.",
	            "reasoning_content": "Three plus three is six.",
	        },
	    ]
	    incr, exp = tito_simulation(tok, trace)
	    report(
	        tok,
	        "GLM/Minimax/... - reasoning_content stripped on past assistant turns",
	        incr,
	        exp,
	        description=(
	            "Past assistant turns get their <think>{reasoning_content}</think> "
	            "stripped to empty <think></think>; the current/last assistant "
	            "keeps the reasoning. As a new user shifts an assistant from "
	            "current to past, the reasoning vanishes — the bridge slice "
	            "misaligns by exactly that length and eats into the next user "
	            "message."
	        ),
	        also_affects="Any model that does drop or interleaved thinking. Proposed TITO forces preserve thinking.",
	    )


	def case_truncation():
	    """Run the simulation with the first assistant turn cut short.

	    Loads ``Qwen/Qwen2.5-0.5B-Instruct``, measures how many tokens
	    ``"<|im_end|>\\n"`` encodes to, and asks ``tito_simulation`` to drop that
	    many trailing tokens from the assistant message at index 1 before
	    reporting the buffer against the full render.
	    """
	    tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
	    trace = [
	        {"role": "user", "content": "What's 2+2?"},
	        {"role": "assistant", "content": "It's"},
	        {"role": "user", "content": "Are you sure?"},
	        {"role": "assistant", "content": "Yes, 4."},
	    ]
	    # The token text must be the literal "<|im_end|>"; backslash-escaped
	    # pipes would encode a different string and change the count.
	    eos_tail = len(tok.encode("<|im_end|>\n", add_special_tokens=False))
	    incr, exp = tito_simulation(tok, trace, truncate_assistant={1: eos_tail})
	    report(
	        tok,
	        "Any model — truncation drops EOS token",
	        incr,
	        exp,
	        description=(
	            "When the sampler hits max_tokens the assistant turn ends "
	            "without <|im_end|>. TITO's delta computes "
	            "render([U,A,U]) - render([U,A]); the template always closes A "
	            "with <|im_end|>\\n, so that closing sequence lands in the "
	            "prefix and never in the delta. Buffer has truncated content "
	            "butted directly against the next role marker — model trains "
	            "to skip EOS at turn boundaries."
	        ),
	        also_affects="every model — truncation is a runtime event independent of template",
	    )


	def case_glm_eos():
	    """Compare a hand-stitched GLM tool-call sequence with the full render.

	    Loads ``zai-org/GLM-5.1``, renders a user message plus an assistant tool
	    call, appends the id of ``<|observation|>`` to that render, then appends
	    the slice the generation-prompted render of the same messages plus a tool
	    result adds beyond the first render. The stitched list is reported
	    against that generation-prompted render.
	    """
	    tok = AutoTokenizer.from_pretrained("zai-org/GLM-5.1", trust_remote_code=True)
	    render = render_factory(tok, tools=[CALC_TOOL])
	    # Look up the literal token "<|observation|>"; backslash-escaped pipes
	    # would name a different (non-existent) token.
	    obs_id = tok.convert_tokens_to_ids("<|observation|>")
	    pre = [
	        {"role": "user", "content": "What's 2+2?"},
	        {
	            "role": "assistant",
	            "content": "",
	            "tool_calls": [
	                {
	                    "type": "function",
	                    "function": {"name": "calc", "arguments": {"expr": "2+2"}},
	                }
	            ],
	        },
	    ]
	    full_msgs = pre + [{"role": "tool", "name": "calc", "content": "4"}]
	    prefix_ids = render(pre)
	    full_ids = render(full_msgs, add_generation_prompt=True)
	    bridge = full_ids[len(prefix_ids) :]
	    # Engine output is modelled as the rendered prefix plus the stop token.
	    engine_output = prefix_ids + [obs_id]
	    naive_stitch = engine_output + list(bridge)
	    report(
	        tok,
	        "GLM — naive bridge stitch duplicates <|observation|>",
	        naive_stitch,
	        full_ids,
	        description=(
	            "vLLM stops tool-calling generation on <|observation|> (it's "
	            "an EOS-like token in GLM's generation config), so the engine's "
	            "completion ends with it. The next-turn bridge then prepends "
	            "<|observation|> as the tool-message role marker. Naive stitch "
	            "yields two of them at the boundary."
	        ),
	        also_affects="entire GLM family — GLM-4.5, GLM-4.5-Air, GLM-4.6, GLM-5, GLM-5.1 "
	        "all use <|observation|> as the tool-turn role marker",
	    )


	if __name__ == "__main__":
	    # Run each demonstration once, in this fixed order.
	    case_glm_eos()
	    case_non_preserve_thinking()
	    case_truncation()
No results found