Last active
March 19, 2026 01:10
-
-
Save odashi/e3e4811487d573b095bc30e552204e56 to your computer and use it in GitHub Desktop.
A simple lexical parser for OpenAI Harmony-like response format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # A simple lexical parser for OpenAI Harmony-like response format. | |
| # Author: Yusuke Oda | |
| # Date: 2026-03-19 | |
| # License: MIT | |
| # | |
| # Usage: | |
| # | |
| # ```python | |
| # tokenizer = transformers.AutoTokenizer.from_pretrained("model-with-harmony-like-tokenizer") | |
| # tokens = tokenizer("""\ | |
| # <|start|>assistant<|channel|>analysis<|message|>Let's say hello.<|end|>\ | |
| # <|start|>assistant<|channel|>final<|message|>Hello!<|end|>\ | |
| # """) | |
| # for message in iter_messages(tokenizer, tokens["input_ids"]): | |
| # print(message) | |
| # ``` | |
| # | |
| # This snippet outputs: | |
| # | |
| # ``` | |
| # {'role': 'assistant', 'channel': 'analysis', 'message': "Let's say hello.", 'end': 'end'} | |
| # {'role': 'assistant', 'channel': 'final', 'message': 'Hello!', 'end': 'end'} | |
| # ``` | |
| from transformers import PreTrainedTokenizerBase | |
| from typing import Iterator | |
def iter_messages(tokenizer: PreTrainedTokenizerBase, tokens: list[int]) -> Iterator[dict[str, str]]:
    """Split a Harmony-like token sequence into messages.

    The token stream is scanned once. Special tokens delimit sections
    (``<|start|>`` opens the ``role`` section, ``<|channel|>``,
    ``<|constrain|>`` and ``<|message|>`` open the sections of the same
    name) and terminators (``<|end|>``, ``<|return|>``, ``<|call|>``) close
    the current message. The ordinary tokens collected for each section are
    decoded with ``tokenizer.decode``.

    Args:
        tokenizer: Tokenizer whose ``get_vocab()`` mapping contains all of
            the special tokens listed above and whose ``decode()`` turns a
            list of token ids into text.
        tokens: Token ids to analyse.

    Yields:
        One dict per terminated message. Keys are the section names seen in
        that message, plus ``"end"``, which is ``"call"`` for ``<|call|>``
        and ``"end"`` for both ``<|end|>`` and ``<|return|>``. Each yielded
        dict is a fresh object that is not modified afterwards.

    Raises:
        KeyError: If the vocabulary lacks one of the special tokens.

    Note:
        A message that is still open when ``tokens`` runs out (no
        terminator seen) is not yielded. A message may omit ``<|start|>``;
        it then simply has no ``"role"`` key.
    """
    vocab = tokenizer.get_vocab()
    start_id = vocab["<|start|>"]

    # Special token id -> name of the section that the token opens.
    section_ids = {
        start_id: "role",
        vocab["<|message|>"]: "message",
        vocab["<|channel|>"]: "channel",
        vocab["<|constrain|>"]: "constrain",
    }
    # Special token id -> value stored under "end" when the message closes.
    # NOTE: "<|return|>" is reported as "end", exactly like "<|end|>".
    end_ids = {
        vocab["<|end|>"]: "end",
        vocab["<|return|>"]: "end",
        vocab["<|call|>"]: "call",
    }

    message: dict[str, str] = {}
    section: str | None = None
    text_tokens: list[int] = []

    # Perform lexical analysis on tokens
    for token in tokens:
        if token in section_ids:
            if token == start_id:
                # A new message begins: discard anything collected for a
                # previous message that was never terminated.
                message = {}
            elif section is not None:
                # Close the section that was open until now.
                message[section] = tokenizer.decode(text_tokens)
            section = section_ids[token]
            text_tokens = []
        elif token in end_ids:
            if section is not None:
                message[section] = tokenizer.decode(text_tokens)
            message["end"] = end_ids[token]
            yield message
            # Start from a clean state so that a following message without a
            # leading "<|start|>" neither mutates the dict just yielded nor
            # inherits its fields or leftover text tokens.
            message = {}
            section = None
            text_tokens = []
        else:
            text_tokens.append(token)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment