Files
openharmony-mlx/gpt_oss/tokenizer.py
Dominik Kundel 243a1b0276 Initial commit
Co-authored-by: Zhuohan Li <zhuohan@openai.com>
Co-authored-by: Maratyszcza <marat@openai.com>
Co-authored-by: Volodymyr Kyrylov <vol@wilab.org.ua>
2025-08-05 08:19:49 -07:00

31 lines
1002 B
Python

import tiktoken
def get_tokenizer():
    """Build the ``o200k_harmony`` tokenizer.

    Starts from tiktoken's ``o200k_base`` encoding and extends it with the
    Harmony chat special tokens occupying ids 199998-201087. Returns a
    ``tiktoken.Encoding`` instance.
    """
    base = tiktoken.get_encoding("o200k_base")

    # Named Harmony control tokens in the slots directly after o200k_base.
    harmony_specials = {
        "<|startoftext|>": 199998,
        "<|endoftext|>": 199999,
        "<|reserved_200000|>": 200000,
        "<|reserved_200001|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reserved_200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reserved_200009|>": 200009,
        "<|reserved_200010|>": 200010,
        "<|reserved_200011|>": 200011,
        "<|call|>": 200012,
    }
    # Fill the rest of the special-token range with reserved placeholders.
    for token_id in range(200013, 201088):
        harmony_specials[f"<|reserved_{token_id}|>"] = token_id

    # Base specials come first so the Harmony entries take precedence on
    # any overlapping name (same merge order as the original expression).
    return tiktoken.Encoding(
        name="o200k_harmony",
        pat_str=base._pat_str,
        mergeable_ranks=base._mergeable_ranks,
        special_tokens={**base._special_tokens, **harmony_specials},
    )