Last active
May 28, 2026 19:54
-
-
Save mikasenghaas/e336d15761cf49af5e4eb662356a5d78 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S uv run --script | |
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = ["transformers", "jinja2", "rich"] | |
| # /// | |
| from transformers import AutoTokenizer | |
| from rich.console import Console | |
| from rich.text import Text | |
# Module-wide rich console; report() sends all of its output through it.
console = Console()
def render_factory(tok, tools=None):
    """Build a token-id renderer bound to *tok* and an optional tool list.

    Args:
        tok: Tokenizer object exposing ``apply_chat_template``.
        tools: Optional tool schemas. When not ``None`` they are sent with
            every render and replace any ``tools`` keyword given per call.

    Returns:
        A ``render(msgs, **kw)`` callable that forwards to
        ``tok.apply_chat_template`` with ``tokenize=True`` and
        ``return_dict=False`` plus the per-call keywords.
    """
    # Resolved once: the factory-level tools never change between calls.
    forced = {} if tools is None else {"tools": tools}

    def render(msgs, **kw):
        # ``forced`` is merged last so factory-level tools take precedence
        # over a ``tools`` keyword supplied by the caller.
        options = {**kw, **forced}
        return tok.apply_chat_template(
            msgs, tokenize=True, return_dict=False, **options
        )

    return render
def tito_simulation(tok, messages, tools=None, truncate_assistant=None):
    """Replay *messages* through the TITO buffer-building procedure.

    The buffer is seeded with the first message rendered with a generation
    prompt (regardless of the role at index 1), then extended one message
    at a time:

    * assistant message: the tokens the full render adds on top of the
      generation-prompted history stand in for the sampled completion;
    * any other role: the tokens the generation-prompted render adds on
      top of the plain render of the history.

    Args:
        tok: Tokenizer handed to ``render_factory``.
        messages: Chat messages as dicts with a ``"role"`` key. Must be
            non-empty because index 0 is read unconditionally.
        tools: Optional tool schemas forwarded to every render.
        truncate_assistant: Optional ``{message index: n}`` mapping. A
            non-zero ``n`` is applied to that assistant turn's stand-in
            completion as a ``[:-n]`` slice (drops the trailing tokens).

    Returns:
        ``(incremental, expected)``: the stitched buffer and a single
        render of the whole conversation.
    """
    cuts = truncate_assistant or {}
    render = render_factory(tok, tools)

    buffer = render([messages[0]], add_generation_prompt=True)
    for idx, message in enumerate(messages[1:], start=1):
        history = messages[:idx]
        through_turn = messages[: idx + 1]

        if message["role"] != "assistant":
            # User/tool turn: append whatever the template adds between the
            # closed history and the next generation prompt.
            base = render(history)
            prompted = render(through_turn, add_generation_prompt=True)
            buffer += prompted[len(base):]
            continue

        # Assistant turn: approximate the sampled tokens by re-rendering.
        rendered_turn = render(through_turn)
        prompted_history = render(history, add_generation_prompt=True)
        completion = rendered_turn[len(prompted_history):]
        drop = cuts.get(idx, 0)
        if drop:
            completion = completion[:-drop]
        buffer += completion

    return buffer, render(messages)
def report(
    tok,
    label,
    incremental,
    expected,
    description=None,
    also_affects=None,
):
    """Print a pass/fail section comparing a TITO buffer with its reference.

    A rule carrying a check or cross mark and *label* is always printed,
    followed by the optional dimmed notes. When the sequences differ, both
    are decoded and printed with everything from the first differing
    position onwards styled red.

    Args:
        tok: Tokenizer whose ``decode`` turns token ids back into text.
        label: Title shown in the rule.
        incremental: Token ids of the stitched TITO buffer.
        expected: Token ids of the reference render.
        description: Optional explanation printed dimmed under the rule.
        also_affects: Optional note appended after the description.

    Returns:
        None. All output goes to the module-level ``console``.
    """
    console.print()
    matches = incremental == expected
    badge = "[green]✓[/green]" if matches else "[red]❌[/red]"
    console.rule(f"{badge} [bold]{label}[/bold]")
    console.print()

    notes = []
    if description:
        notes.append(f"[dim]{description}[/dim]")
    if also_affects:
        notes.append(f"[dim] also affects: {also_affects}[/dim]")
    if notes:
        console.print("".join(notes))

    if matches:
        return

    # First position where the two sequences disagree; when one is a prefix
    # of the other this stays at the shorter length.
    split = min(len(incremental), len(expected))
    for pos, (got, want) in enumerate(zip(incremental, expected)):
        if got != want:
            split = pos
            break

    shared = tok.decode(expected[:split])
    expected_rest = tok.decode(expected[split:])
    buffer_rest = tok.decode(incremental[split:])

    sections = (
        ("\n[bold]EXPECTED[/bold]", expected_rest),
        ("\n[bold]TITO BUFFER[/bold]", buffer_rest),
    )
    for heading, rest in sections:
        console.print(heading)
        rendering = Text()
        rendering.append(shared)
        rendering.append(rest, style="red")
        console.print(rendering)
# Function-tool schema for a single "calc" tool that takes one required
# string argument, "expr"; passed as the tool list in case_glm_eos.
CALC_TOOL = {
    "type": "function",
    "function": {
        "name": "calc",
        "description": "Compute an arithmetic expression.",
        "parameters": {
            "type": "object",
            "properties": {"expr": {"type": "string"}},
            "required": ["expr"],
        },
    },
}
# Four-message tool-use conversation: user question, assistant turn with
# empty content and one "calc" tool call, the tool result, and the final
# assistant answer.
TOOL_TRACE = [
    {"role": "user", "content": "What's 2+2?"},
    {
        "role": "assistant",
        "content": "",
        "tool_calls": [
            {
                "type": "function",
                "function": {"name": "calc", "arguments": {"expr": "2+2"}},
            }
        ],
    },
    {"role": "tool", "name": "calc", "content": "4"},
    {"role": "assistant", "content": "It's 4."},
]
def case_non_preserve_thinking():
    """Run the TITO simulation on a two-round reasoning chat with GLM-4.5.

    Loads the ``zai-org/GLM-4.5`` tokenizer, builds a user/assistant trace
    in which each assistant message carries ``reasoning_content``, and
    reports whether the stitched buffer equals the one-shot render. The
    printed description states the failure this case is meant to expose.
    """
    tokenizer = AutoTokenizer.from_pretrained("zai-org/GLM-4.5", trust_remote_code=True)

    # (user question, assistant answer, assistant reasoning) per round.
    rounds = (
        ("What's 2+2?", "It's 4.", "Two plus two is four."),
        ("And 3+3?", "It's 6.", "Three plus three is six."),
    )
    conversation = []
    for question, answer, reasoning in rounds:
        conversation.append({"role": "user", "content": question})
        conversation.append(
            {
                "role": "assistant",
                "content": answer,
                "reasoning_content": reasoning,
            }
        )

    buffer_ids, reference_ids = tito_simulation(tokenizer, conversation)

    explanation = (
        "Past assistant turns get their <think>{reasoning_content}</think> "
        "stripped to empty <think></think>; the current/last assistant "
        "keeps the reasoning. As a new user shifts an assistant from "
        "current to past, the reasoning vanishes — the bridge slice "
        "misaligns by exactly that length and eats into the next user "
        "message."
    )
    scope = "Any model that does drop or interleaved thinking. Proposed TITO forces preserve thinking."
    report(
        tokenizer,
        "GLM/Minimax/... - reasoning_content stripped on past assistant turns",
        buffer_ids,
        reference_ids,
        description=explanation,
        also_affects=scope,
    )
def case_truncation():
    """Run the TITO simulation with the first assistant turn cut short.

    Loads the ``Qwen/Qwen2.5-0.5B-Instruct`` tokenizer, counts the tokens
    in ``"<|im_end|>\\n"``, and removes that many trailing tokens from the
    assistant message at index 1 before comparing the stitched buffer with
    the one-shot render. The printed description states the failure this
    case is meant to expose.
    """
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

    conversation = [
        {"role": role, "content": text}
        for role, text in (
            ("user", "What's 2+2?"),
            ("assistant", "It's"),
            ("user", "Are you sure?"),
            ("assistant", "Yes, 4."),
        )
    ]

    # Number of tokens the turn-closing sequence occupies for this tokenizer.
    closing_ids = tokenizer.encode("<|im_end|>\n", add_special_tokens=False)
    buffer_ids, reference_ids = tito_simulation(
        tokenizer,
        conversation,
        truncate_assistant={1: len(closing_ids)},
    )

    explanation = (
        "When the sampler hits max_tokens the assistant turn ends "
        "without <|im_end|>. TITO's delta computes "
        "render([U,A,U]) - render([U,A]); the template always closes A "
        "with <|im_end|>\\n, so that closing sequence lands in the "
        "prefix and never in the delta. Buffer has truncated content "
        "butted directly against the next role marker — model trains "
        "to skip EOS at turn boundaries."
    )
    scope = "every model — truncation is a runtime event independent of template"
    report(
        tokenizer,
        "Any model — truncation drops EOS token",
        buffer_ids,
        reference_ids,
        description=explanation,
        also_affects=scope,
    )
def case_glm_eos():
    """Show a naive bridge stitch after an engine output ending in ``<|observation|>``.

    Loads the ``zai-org/GLM-5.1`` tokenizer, renders the user question plus
    the assistant tool call as the prefix, appends the ``<|observation|>``
    token id to imitate an engine completion that ends on it, then appends
    the bridge (the generation-prompted render with the tool result, minus
    the prefix). The result is compared with the one-shot render via
    ``report``.

    Raises:
        RuntimeError: if the tokenizer cannot resolve ``<|observation|>``
            to a real token id, since the comparison would then be
            meaningless.
    """
    tok = AutoTokenizer.from_pretrained("zai-org/GLM-5.1", trust_remote_code=True)
    render = render_factory(tok, tools=[CALC_TOOL])

    obs_id = tok.convert_tokens_to_ids("<|observation|>")
    # An unresolved token comes back as None or the unknown-token id;
    # stitching that in would report a mismatch for the wrong reason.
    if obs_id is None or obs_id == tok.unk_token_id:
        raise RuntimeError(
            "tokenizer has no <|observation|> token; cannot run case_glm_eos"
        )

    # Slice the shared fixture instead of re-typing the same messages:
    # user question + assistant tool call, then the same plus the tool result.
    pre = TOOL_TRACE[:2]
    full_msgs = TOOL_TRACE[:3]

    prefix_ids = render(pre)
    full_ids = render(full_msgs, add_generation_prompt=True)
    bridge = full_ids[len(prefix_ids):]

    # Imitate an engine completion that stops on <|observation|>.
    engine_output = prefix_ids + [obs_id]
    naive_stitch = engine_output + list(bridge)

    report(
        tok,
        "GLM — naive bridge stitch duplicates <|observation|>",
        naive_stitch,
        full_ids,
        description=(
            "vLLM stops tool-calling generation on <|observation|> (it's "
            "an EOS-like token in GLM's generation config), so the engine's "
            "completion ends with it. The next-turn bridge then prepends "
            "<|observation|> as the tool-message role marker. Naive stitch "
            "yields two of them at the boundary."
        ),
        also_affects="entire GLM family — GLM-4.5, GLM-4.5-Air, GLM-4.6, GLM-5, GLM-5.1 "
        "all use <|observation|> as the tool-turn role marker",
    )
# Script entry point: run each demonstration in turn. Every case loads its
# own tokenizer through AutoTokenizer.from_pretrained.
if __name__ == "__main__":
    case_glm_eos()
    case_non_preserve_thinking()
    case_truncation()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run with
curl -sSL https://gist.githubusercontent.com/mikasenghaas/e336d15761cf49af5e4eb662356a5d78/raw/44635917bb6a363fad64e9506785cd9338f29ab2/tito_failure_modes.py | uv run --script -