odashi · March 19, 2026 01:10
diff --git a/parser.py b/parser.py
 # A simple lexical parser for OpenAI Harmony-like response format.
 # Author: Yusuke Oda
 # Date: 2026-03-19
 # License: MIT
 #
 # Usage:
 #
 # ```python
 # tokenizer = transformers.AutoTokenizer.from_pretrained("model-with-harmony-like-tokenizer")
 # tokens = tokenizer("""\
 # <|start|>assistant<|channel|>analysis<|message|>Let's say hello.<|end|>\
 # <|start|>assistant<|channel|>final<|message|>Hello!<|end|>\
 # """)
 # for message in iter_messages(tokenizer, tokens["input_ids"]):
 #     print(message)
 # ```
 #
 # this snippet will output:
 #
 # ```
 # {'role': 'assistant', 'channel': 'analysis', 'message': "Let's say hello.", 'end': 'end'}
 # {'role': 'assistant', 'channel': 'final', 'message': 'Hello!', 'end': 'end'}
 # ```

 from transformers import PreTrainedTokenizerBase
 from typing import Iterator

 def iter_messages(tokenizer: PreTrainedTokenizerBase, tokens: list[int]) -> Iterator[dict[str, str]]:
    """Yield one dict per message found in a Harmony-like token sequence.

    The token stream is split on the special tokens looked up in the
    tokenizer vocabulary. Each section marker opens a named section whose
    text is every ordinary token up to the next special token:

    * ``<|start|>``     -> ``"role"`` (also begins a new message)
    * ``<|channel|>``   -> ``"channel"``
    * ``<|constrain|>`` -> ``"constrain"``
    * ``<|message|>``   -> ``"message"``

    A terminator closes the message and is reported under the ``"end"``
    key: ``<|end|>`` and ``<|return|>`` both give ``"end"``, ``<|call|>``
    gives ``"call"``.

    Args:
        tokenizer: Tokenizer providing ``get_vocab()`` and ``decode()``.
            Its vocabulary must contain all seven special tokens above.
        tokens: Token ids to scan.

    Yields:
        A fresh dict for every terminated message, mapping each section
        name that appeared to its decoded text, plus the ``"end"`` key.

    Raises:
        KeyError: If one of the special tokens is missing from the
            vocabulary.

    Notes:
        * A message that is still open when ``<|start|>`` is seen again,
          or when the input runs out, is dropped without being yielded.
        * Terminators seen while no message is open are ignored, so a
          doubled terminator never re-yields (or mutates) a dict that was
          already handed to the caller.
        * Ordinary tokens seen while no section is open are discarded.
    """
    v = tokenizer.get_vocab()
    start_id = v["<|start|>"]
    start_ids = {
        start_id: "role",
        v["<|message|>"]: "message",
        v["<|channel|>"]: "channel",
        v["<|constrain|>"]: "constrain",
    }
    end_ids = {
        v["<|end|>"]: "end",
        v["<|return|>"]: "end",
        v["<|call|>"]: "call",
    }

    message: dict[str, str] = {}
    section: str | None = None
    text_tokens: list[int] = []

    # Perform lexical analysis on tokens
    for token in tokens:
        if token in start_ids:
            if token == start_id:
                # A new message begins; whatever was open is abandoned.
                message = {}
            elif section is not None:
                # Close the section that was being collected.
                message[section] = tokenizer.decode(text_tokens)
            section = start_ids[token]
            text_tokens = []
        elif token in end_ids:
            if section is None:
                # Terminator with no open message: nothing to emit.
                continue
            message[section] = tokenizer.decode(text_tokens)
            message["end"] = end_ids[token]
            yield message
            # Start from a clean state so the yielded dict is never touched
            # again and leftover tokens cannot leak into a later section.
            message = {}
            section = None
            text_tokens = []
        else:
            text_tokens.append(token)
	# A simple lexical parser for OpenAI Harmony-like response format.
	# Author: Yusuke Oda
	# Date: 2026-03-19
	# License: MIT
	#
	# Usage:
	#
	# ```python
	# tokenizer = transformers.AutoTokenizer.from_pretrained("model-with-harmony-like-tokenizer")
	# tokens = tokenizer("""\
	# <|start|>assistant<|channel|>analysis<|message|>Let's say hello.<|end|>\
	# <|start|>assistant<|channel|>final<|message|>Hello!<|end|>\
	# """)
	# for message in iter_messages(tokenizer, tokens["input_ids"]):
	#     print(message)
	# ```
	#
	# this snippet will output:
	#
	# ```
	# {'role': 'assistant', 'channel': 'analysis', 'message': "Let's say hello.", 'end': 'end'}
	# {'role': 'assistant', 'channel': 'final', 'message': 'Hello!', 'end': 'end'}
	# ```

	from transformers import PreTrainedTokenizerBase
	from typing import Iterator

	def iter_messages(tokenizer: PreTrainedTokenizerBase, tokens: list[int]) -> Iterator[dict[str, str]]:
	    """Yield one dict per message found in a Harmony-like token sequence.

	    The token stream is split on the special tokens looked up in the
	    tokenizer vocabulary. Each section marker opens a named section whose
	    text is every ordinary token up to the next special token:

	    * ``<|start|>``     -> ``"role"`` (also begins a new message)
	    * ``<|channel|>``   -> ``"channel"``
	    * ``<|constrain|>`` -> ``"constrain"``
	    * ``<|message|>``   -> ``"message"``

	    A terminator closes the message and is reported under the ``"end"``
	    key: ``<|end|>`` and ``<|return|>`` both give ``"end"``, ``<|call|>``
	    gives ``"call"``.

	    Args:
	        tokenizer: Tokenizer providing ``get_vocab()`` and ``decode()``.
	            Its vocabulary must contain all seven special tokens above.
	        tokens: Token ids to scan.

	    Yields:
	        A fresh dict for every terminated message, mapping each section
	        name that appeared to its decoded text, plus the ``"end"`` key.

	    Raises:
	        KeyError: If one of the special tokens is missing from the
	            vocabulary.

	    Notes:
	        * A message that is still open when ``<|start|>`` is seen again,
	          or when the input runs out, is dropped without being yielded.
	        * Terminators seen while no message is open are ignored, so a
	          doubled terminator never re-yields (or mutates) a dict that was
	          already handed to the caller.
	        * Ordinary tokens seen while no section is open are discarded.
	    """
	    v = tokenizer.get_vocab()
	    start_id = v["<|start|>"]
	    start_ids = {
	        start_id: "role",
	        v["<|message|>"]: "message",
	        v["<|channel|>"]: "channel",
	        v["<|constrain|>"]: "constrain",
	    }
	    end_ids = {
	        v["<|end|>"]: "end",
	        v["<|return|>"]: "end",
	        v["<|call|>"]: "call",
	    }

	    message: dict[str, str] = {}
	    section: str | None = None
	    text_tokens: list[int] = []

	    # Perform lexical analysis on tokens
	    for token in tokens:
	        if token in start_ids:
	            if token == start_id:
	                # A new message begins; whatever was open is abandoned.
	                message = {}
	            elif section is not None:
	                # Close the section that was being collected.
	                message[section] = tokenizer.decode(text_tokens)
	            section = start_ids[token]
	            text_tokens = []
	        elif token in end_ids:
	            if section is None:
	                # Terminator with no open message: nothing to emit.
	                continue
	            message[section] = tokenizer.decode(text_tokens)
	            message["end"] = end_ids[token]
	            yield message
	            # Start from a clean state so the yielded dict is never touched
	            # again and leftover tokens cannot leak into a later section.
	            message = {}
	            section = None
	            text_tokens = []
	        else:
	            text_tokens.append(token)
No results found