Created
March 21, 2026 10:49
-
-
Save odashi/2112baaf0ce2ca9022c3d5aa213b4547 to your computer and use it in GitHub Desktop.
Pilot implementation of tokenizer for llm-jp-4 models for transformers.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Pilot implementation of tokenizer for llm-jp-4 models for transformers. | |
| # Author: Yusuke Oda | |
| # Date: 2026-03-21 | |
| # License: MIT | |
| # | |
| # Usage: | |
| # 1. Put this file into the llm-jp-4 model directory, saved as `llmjp4_tokenizer.py`. | |
| # 2. Add the following entry to `tokenizer_config.json`: | |
| # { | |
| # "auto_map": { | |
| # "AutoTokenizer": ["llmjp4_tokenizer.Llmjp4Tokenizer", "llmjp4_tokenizer.Llmjp4Tokenizer"] | |
| # }, | |
| # ... | |
| # } | |
| # 3. Pass the `trust_remote_code=True` option when loading the tokenizer. | |
| from transformers import LlamaTokenizerFast | |
class Llmjp4Tokenizer(LlamaTokenizerFast):
    """Fast Llama tokenizer that decodes text in Harmony-delimited segments.

    Decoding is performed piecewise: the id sequence is cut right after every
    Harmony control token, each piece is decoded independently by the parent
    class, and the decoded pieces are concatenated.
    """

    # Harmony control tokens; a decode segment ends right after each of them.
    _HARMONY_TOKENS: set[str] = {
        "<|start|>",
        "<|message|>",
        "<|channel|>",
        "<|constrain|>",
        "<|end|>",
        "<|return|>",
        "<|call|>",
    }

    def __init__(self, *args, **kwargs):
        """Initialize the parent tokenizer and cache the Harmony token ids.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        super().__init__(*args, **kwargs)
        # A Harmony token that is missing from the vocabulary may resolve to
        # the unknown-token id (or None). Keeping such a fallback id here
        # would make `_decode` split at every unknown token, so drop it.
        unk_id = self.unk_token_id
        candidate_ids = (
            self.convert_tokens_to_ids(token) for token in self._HARMONY_TOKENS
        )
        self._harmony_token_ids = {
            token_id
            for token_id in candidate_ids
            if token_id is not None and token_id != unk_id
        }

    def _decode(self, token_ids: int | list[int], *args, **kwargs):
        """Decode token ids segment by segment, splitting after Harmony tokens.

        Args:
            token_ids: A single token id or a list of token ids.
            *args: Forwarded unchanged to the parent `_decode` for each segment.
            **kwargs: Forwarded unchanged to the parent `_decode` for each
                segment.

        Returns:
            The concatenation of the decoded segments; an empty string when
            `token_ids` is empty.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        harmony_ids = self._harmony_token_ids
        total = len(token_ids)
        pieces: list[str] = []
        seg_start = 0

        # NOTE(odashi):
        # Ensure that text tokens are decoded without preceding Harmony tokens
        # to avoid incorrect addition of whitespaces.
        for seg_end, token_id in enumerate(token_ids, start=1):
            # Close the current segment right after a Harmony token, or at the
            # end of the input so that trailing text tokens are flushed.
            if token_id in harmony_ids or seg_end == total:
                pieces.append(
                    super()._decode(token_ids[seg_start:seg_end], *args, **kwargs)
                )
                seg_start = seg_end

        return "".join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment