Skip to content

Instantly share code, notes, and snippets.

@odashi
Last active March 19, 2026 01:10
Show Gist options
  • Select an option

  • Save odashi/e3e4811487d573b095bc30e552204e56 to your computer and use it in GitHub Desktop.

Select an option

Save odashi/e3e4811487d573b095bc30e552204e56 to your computer and use it in GitHub Desktop.
A simple lexical parser for OpenAI Harmony-like response format.
# A simple lexical parser for OpenAI Harmony-like response format.
# Author: Yusuke Oda
# Date: 2026-03-19
# License: MIT
#
# Usage:
#
# ```python
# tokenizer = transformers.AutoTokenizer.from_pretrained("model-with-harmony-like-tokenizer")
# tokens = tokenizer("""\
# <|start|>assistant<|channel|>analysis<|message|>Let's say hello.<|end|>\
# <|start|>assistant<|channel|>final<|message|>Hello!<|end|>\
# """)
# for message in iter_messages(tokenizer, tokens["input_ids"]):
#     print(message)
# ```
#
# This snippet will output:
#
# ```
# {'role': 'assistant', 'channel': 'analysis', 'message': "Let's say hello.", 'end': 'end'}
# {'role': 'assistant', 'channel': 'final', 'message': 'Hello!', 'end': 'end'}
# ```
from transformers import PreTrainedTokenizerBase
from typing import Iterator
def iter_messages(tokenizer: PreTrainedTokenizerBase, tokens: list[int]) -> Iterator[dict[str, str]]:
    """Lexically split a Harmony-like token sequence into messages.

    The token stream is scanned once. Section markers (``<|start|>``,
    ``<|channel|>``, ``<|constrain|>``, ``<|message|>``) open a section, and
    the ordinary tokens that follow are decoded as that section's text.
    A terminator (``<|end|>``, ``<|return|>``, ``<|call|>``) closes the
    current message and causes it to be yielded.

    Args:
        tokenizer: Tokenizer whose vocabulary contains all seven special
            tokens listed above. Only ``get_vocab()`` and ``decode()`` are
            called on it.
        tokens: Token ids to analyse.

    Yields:
        One ``dict`` per terminated message. Keys are the section names that
        occurred (``"role"``, ``"channel"``, ``"constrain"``, ``"message"``)
        mapped to their decoded text, plus ``"end"``, which is ``"end"`` for
        ``<|end|>`` / ``<|return|>`` and ``"call"`` for ``<|call|>``.
        Every yielded dict is a fresh object that is never modified again.

    Raises:
        KeyError: If one of the special tokens is missing from the mapping
            returned by ``tokenizer.get_vocab()``.

    Notes:
        * A message may begin without ``<|start|>`` (e.g. a completion whose
          header was part of the prompt); it then simply has no ``"role"``.
        * Ordinary tokens that appear outside any section are ignored.
        * A trailing message that is not closed by a terminator is not
          yielded.
    """
    vocab = tokenizer.get_vocab()
    start_id = vocab["<|start|>"]

    # Marker id -> key under which the text following the marker is stored.
    section_names = {
        start_id: "role",
        vocab["<|message|>"]: "message",
        vocab["<|channel|>"]: "channel",
        vocab["<|constrain|>"]: "constrain",
    }
    # Terminator id -> value reported under the "end" key.
    # NOTE(review): <|return|> is reported as "end", same as <|end|>;
    # presumably an intentional normalisation -- confirm.
    end_names = {
        vocab["<|end|>"]: "end",
        vocab["<|return|>"]: "end",
        vocab["<|call|>"]: "call",
    }

    message: dict[str, str] = {}
    section: str | None = None
    text_tokens: list[int] = []

    for token in tokens:
        if token in section_names:
            if token == start_id:
                # A new message begins: drop any unterminated leftovers.
                message = {}
            elif section is not None:
                # Close the section that was open until this marker.
                message[section] = tokenizer.decode(text_tokens)
            section = section_names[token]
            text_tokens = []
        elif token in end_names:
            if section is not None:
                message[section] = tokenizer.decode(text_tokens)
            message["end"] = end_names[token]
            yield message
            # Start from a clean state so the dict just handed to the caller
            # is never mutated or re-yielded, and stray tokens between
            # messages cannot leak into the next one.
            message = {}
            section = None
            text_tokens = []
        else:
            text_tokens.append(token)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment