# Files
# openharmony-mlx/tests/bin_header_dump.py
#
# 56 lines
# 1.8 KiB
# Python
# tests/token_uuid_slot.py
import struct, uuid, sys, tiktoken
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
# Little-endian layout of the packed per-model header record:
# 6 uint32, 1 float, 4 uint32, 6 floats (spaces in the format are ignored
# by `struct`); calcsize(FMT_MODEL) == 68 bytes.
FMT_MODEL = "<IIIIII f IIII f f f f f f"

# Special-token string -> token id for the o200k_harmony vocabulary.
# These ids (199998-200013) sit above the ordinary text-token range.
SPECIAL = {
    "<|reversed199998|>": 199998,
    # NOTE(review): the empty-string key looks like a garbled "<|endoftext|>"
    # -- confirm against the writer before relying on it.
    "": 199999,
    "<|untrusted|>": 200000,
    "<|endofuntrusted|>": 200001,
    "<|return|>": 200002,
    "<|constrain|>": 200003,
    "<|reversed200004|>": 200004,
    "<|channel|>": 200005,
    "<|start|>": 200006,
    "<|end|>": 200007,
    "<|message|>": 200008,
    # Removed duplicate entry "<|reversed200008|>": 200008 -- id 200008 is
    # already <|message|>; the reserved slots 200009-200011 follow below.
    "<|reversed200009|>": 200009,
    "<|reversed200010|>": 200010,
    "<|reversed200011|>": 200011,
    "<|call|>": 200012,
    "<|refusal|>": 200013,
}
def table_start_offset(f):
    """Parse the fixed bin header from file object *f*.

    Returns a 6-tuple: (tokenizer uuid, ns, nt, rs, ts, table_off) where the
    four ints are the uint32 counters stored after the tokenizer uuid and
    table_off is the byte offset at which the uuid table begins.
    """
    # Fixed prefix to skip: 16-byte magic + 16-byte model uuid
    # + packed model record + 16-byte apple uuid.
    prefix = 16 + 16 + struct.calcsize(FMT_MODEL) + 16
    f.seek(prefix)
    tok_uuid = uuid.UUID(bytes=f.read(16))
    ns, nt, rs, ts = struct.unpack("<IIII", f.read(16))
    return tok_uuid, ns, nt, rs, ts, f.tell()
def show(path, token):
    """Print the uuid stored in *path*'s table slot for special token *token*."""
    tok_id = SPECIAL[token]
    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    o200k = tiktoken.get_encoding("o200k_base")
    # Count "text" tokens using harmony's own special-token predicate so the
    # slot arithmetic here matches the convention used when the bin was written.
    num_text = sum(not enc.is_special_token(t) for t in range(o200k.n_vocab))
    slot = tok_id - num_text
    with open(path, "rb") as f:
        tok_uuid, ns, nt, rs, ts, table_off = table_start_offset(f)
        # Each table entry is a 16-byte uuid; jump straight to this token's slot.
        f.seek(table_off + 16 * slot)
        u = uuid.UUID(bytes=f.read(16))
        print(f"{path}\n tokenizer_uuid: {tok_uuid}\n header_nt={nt}, header_ns={ns}\n"
              f" token={token} (id={tok_id}) -> slot={slot}, uuid={u}\n")
if __name__ == "__main__":
    # Give a usage message instead of a bare IndexError on missing arguments.
    if len(sys.argv) != 3:
        sys.exit(f"usage: {sys.argv[0]} <bin_path> <special_token>")
    show(sys.argv[1], sys.argv[2])