Skip to content

Instantly share code, notes, and snippets.

@odashi
Created March 21, 2026 10:49
Show Gist options
  • Select an option

  • Save odashi/2112baaf0ce2ca9022c3d5aa213b4547 to your computer and use it in GitHub Desktop.

Select an option

Save odashi/2112baaf0ce2ca9022c3d5aa213b4547 to your computer and use it in GitHub Desktop.
Pilot implementation of tokenizer for llm-jp-4 models for transformers.
# Pilot implementation of tokenizer for llm-jp-4 models for transformers.
# Author: Yusuke Oda
# Date: 2026-03-21
# License: MIT
#
# Usage:
#   1. Put this file into the llm-jp-4 model directory as `llmjp4_tokenizer.py`
#      (the file name must match the module name used in `auto_map` below).
#   2. Add the following entry to `tokenizer_config.json`:
#        {
#          "auto_map": {
#            "AutoTokenizer": ["llmjp4_tokenizer.Llmjp4Tokenizer", "llmjp4_tokenizer.Llmjp4Tokenizer"]
#          },
#          ...
#        }
#   3. Pass the `trust_remote_code=True` option when loading the tokenizer.
from transformers import LlamaTokenizerFast
class Llmjp4Tokenizer(LlamaTokenizerFast):
    """Pilot tokenizer for llm-jp-4 models.

    Behaves like ``LlamaTokenizerFast`` except for ``_decode``, which decodes
    the input piecewise so that every Harmony token closes a piece and the
    text tokens after it are decoded in a separate call to the parent class.
    """

    # Token strings treated as Harmony tokens; their ids are resolved once in
    # ``__init__``.
    _HARMONY_TOKENS: set[str] = {
        "<|start|>",
        "<|message|>",
        "<|channel|>",
        "<|constrain|>",
        "<|end|>",
        "<|return|>",
        "<|call|>",
    }

    def __init__(self, *args, **kwargs):
        """Initialize the parent tokenizer and cache the Harmony token ids.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        super().__init__(*args, **kwargs)
        # Resolve the ids only after the parent is initialized, because the
        # lookup goes through this tokenizer's own ``convert_tokens_to_ids``.
        self._harmony_token_ids = set(
            map(self.convert_tokens_to_ids, self._HARMONY_TOKENS)
        )

    def _decode(self, token_ids: int | list[int], *args, **kwargs):
        """Decode ``token_ids`` piece by piece, splitting after Harmony tokens.

        Args:
            token_ids: A single token id or a list of token ids.
            *args: Forwarded unchanged to the parent ``_decode`` for each piece.
            **kwargs: Forwarded unchanged to the parent ``_decode`` for each
                piece.

        Returns:
            The concatenation of the decoded pieces, in input order.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]

        decode_piece = super()._decode
        pieces: list[str] = []
        start = 0

        # NOTE(odashi):
        # Ensure that text tokens are decoded without preceding Harmony tokens
        # to avoid incorrect addition of whitespaces.
        for index, token_id in enumerate(token_ids):
            end = index + 1
            # Keep extending the current piece until it is closed by a Harmony
            # token or by the end of the input.
            if token_id not in self._harmony_token_ids and end != len(token_ids):
                continue
            pieces.append(decode_piece(token_ids[start:end], *args, **kwargs))
            start = end

        return "".join(pieces)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment