Initial commit
Co-authored-by: Zhuohan Li <zhuohan@openai.com> Co-authored-by: Maratyszcza <marat@openai.com> Co-authored-by: Volodymyr Kyrylov <vol@wilab.org.ua>
This commit is contained in:
30
gpt_oss/tokenizer.py
Normal file
30
gpt_oss/tokenizer.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import tiktoken
|
||||
|
||||
def get_tokenizer():
    """Build and return the ``o200k_harmony`` tiktoken encoding.

    The encoding reuses the split pattern and mergeable ranks of the
    ``o200k_base`` encoding and extends its special-token table with the
    named control tokens listed below plus ``<|reserved_N|>`` placeholders
    for every id from 200013 through 201087.

    Returns:
        A ``tiktoken.Encoding`` named ``"o200k_harmony"``. A new instance
        is constructed on every call.
    """
    base = tiktoken.get_encoding("o200k_base")

    # Start from a copy of the base special tokens so the base encoding's
    # own table is never mutated.
    special_tokens = dict(base._special_tokens)

    # Named control tokens. An entry whose name already exists in the
    # inherited table replaces that entry's id.
    special_tokens.update(
        {
            "<|startoftext|>": 199998,
            "<|endoftext|>": 199999,
            "<|reserved_200000|>": 200000,
            "<|reserved_200001|>": 200001,
            "<|return|>": 200002,
            "<|constrain|>": 200003,
            "<|reserved_200004|>": 200004,
            "<|channel|>": 200005,
            "<|start|>": 200006,
            "<|end|>": 200007,
            "<|message|>": 200008,
            "<|reserved_200009|>": 200009,
            "<|reserved_200010|>": 200010,
            "<|reserved_200011|>": 200011,
            "<|call|>": 200012,
        }
    )

    # Placeholder names for the remaining ids (upper bound is exclusive).
    # NOTE(review): if the inherited table already assigns an id inside this
    # range, two different names will map to that id -- confirm intended.
    for token_id in range(200013, 201088):
        special_tokens[f"<|reserved_{token_id}|>"] = token_id

    return tiktoken.Encoding(
        name="o200k_harmony",
        pat_str=base._pat_str,
        mergeable_ranks=base._mergeable_ranks,
        special_tokens=special_tokens,
    )
|
||||
Reference in New Issue
Block a user