增加权重转化时候的检查脚本
This commit is contained in:
62
tests/token_uuid_slot.py
Normal file
62
tests/token_uuid_slot.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# token_uuid_slot.py
|
||||
import struct, uuid, sys, pathlib, tiktoken
|
||||
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
|
||||
|
||||
# struct layout of the model header that follows the magic and the model UUID:
# little-endian, six uint32, one float32, four uint32, five float32.
# header_and_table_off() only uses its size, to skip over that header.
FMT_MODEL = "<IIIIII f IIII f f f f f"

# Special-token name (as typed on the command line) -> token id.
# NOTE(review): the empty-string key for 199999 looks like a token name that
# was lost somewhere (id 199999 is "<|endoftext|>" in tiktoken's o200k_base);
# confirm against the original script and restore the name.
# NOTE(review): "<|message|>" and "<|reversed200008|>" both map to 200008;
# confirm whether the second entry is intentional.
SPECIAL = {
    "<|reversed199998|>": 199998,
    "": 199999,
    "<|untrusted|>": 200000,
    "<|endofuntrusted|>": 200001,
    "<|return|>": 200002,
    "<|constrain|>": 200003,
    "<|reversed200004|>": 200004,
    "<|channel|>": 200005,
    "<|start|>": 200006,
    "<|end|>": 200007,
    "<|message|>": 200008,
    "<|reversed200008|>": 200008,
    "<|reversed200009|>": 200009,
    "<|reversed200010|>": 200010,
    "<|reversed200011|>": 200011,
    "<|call|>": 200012,
    "<|refusal|>": 200013,
}
|
||||
|
||||
def header_and_table_off(f):
    """Read the file header from *f* and locate the UUID table.

    Consumes, in order: 16 bytes of magic, a 16-byte model UUID, the model
    header described by ``FMT_MODEL``, a 16-byte "apple" UUID, the 16-byte
    tokenizer UUID and four little-endian uint32 values.

    Args:
        f: Binary file object positioned on the first byte of the magic;
            must support ``read`` and ``tell``.

    Returns:
        ``(tok_uuid, ns, nt, rs, ts, table_off)``: the tokenizer
        ``uuid.UUID``, the four uint32 fields in file order, and the file
        offset right after them, where the UUID table starts. ``show``
        reports ``ns`` as the special-token count and ``nt`` as the
        text-token count; ``rs`` and ``ts`` are presumably the regex and
        token-data sizes (TODO confirm against the writer).

    Raises:
        ValueError: If the file ends before the header is complete.
    """

    def take(size, what):
        # Fail on the first short read so a truncated file is reported by
        # field name instead of surfacing later as an opaque unpack error.
        data = f.read(size)
        if len(data) != size:
            raise ValueError(
                f"truncated header: expected {size} bytes for {what}, "
                f"got {len(data)}"
            )
        return data

    take(16, "magic")
    take(16, "model uuid")
    take(struct.calcsize(FMT_MODEL), "model header")
    take(16, "apple uuid")
    tok_uuid = uuid.UUID(bytes=take(16, "tokenizer uuid"))
    ns, nt, rs, ts = struct.unpack("<IIII", take(16, "tokenizer header"))
    table_off = f.tell()  # start of the UUID table
    return tok_uuid, ns, nt, rs, ts, table_off
|
||||
|
||||
def show(path, token):
    """Print the UUID stored in the file at *path* for special token *token*.

    The token id comes from ``SPECIAL``. The slot index is the token id
    minus the number of ids in ``range(o200k_base.n_vocab)`` that the harmony
    encoding does not flag as special. The 16 bytes at
    ``table_off + 16 * slot`` are then read and printed as a UUID, together
    with the tokenizer UUID and the counts from the file header.

    Args:
        path: Path of the binary model file to inspect.
        token: Special-token name; must be a key of ``SPECIAL``.

    Raises:
        KeyError: If *token* is not a key of ``SPECIAL``.
        ValueError: If the computed slot is negative, or the file is too
            short to hold a UUID at that slot.
    """
    try:
        tok_id = SPECIAL[token]
    except KeyError:
        known = ", ".join(repr(name) for name in SPECIAL)
        raise KeyError(
            f"unknown special token {token!r}; known tokens: {known}"
        ) from None

    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    o200k = tiktoken.get_encoding("o200k_base")
    # Same text/special split as the one used when converting the weights.
    num_text = sum(1 for t in range(o200k.n_vocab) if not enc.is_special_token(t))
    slot = tok_id - num_text
    if slot < 0:
        # A negative slot would seek backwards into the header and print
        # header bytes as if they were a table entry.
        raise ValueError(
            f"token {token!r} (id={tok_id}) is below the first special id "
            f"{num_text}; it has no slot in the UUID table"
        )

    with open(path, "rb") as f:
        tok_uuid, ns, nt, _rs, _ts, table_off = header_and_table_off(f)
        if slot >= ns:
            # Assumes the table holds `ns` entries (TODO confirm). Only warn,
            # so the header values are still printed for diagnosis.
            print(
                f"warning: slot {slot} is not below num_special={ns} "
                "from the file header",
                file=sys.stderr,
            )
        f.seek(table_off + 16 * slot)
        raw = f.read(16)

    if len(raw) != 16:
        raise ValueError(
            f"{path}: file ends before slot {slot} of the UUID table "
            f"(read {len(raw)} of 16 bytes)"
        )
    u = uuid.UUID(bytes=raw)

    print(f"{path}\n tokenizer_uuid: {tok_uuid}\n num_text={nt}, num_special={ns}")
    print(f" token={token} (id={tok_id}) -> slot={slot}, uuid={u}\n")
|
||||
|
||||
if __name__ == "__main__":
    # Usage: python token_uuid_slot.py <bin> "<|channel|>"
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit(f'usage: python {sys.argv[0]} <bin> "<|channel|>"')
    show(sys.argv[1], sys.argv[2])
|
||||
|
||||
# python tests/token_uuid_slot.py /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin "<|channel|>"
|
||||
# python tests/token_uuid_slot.py /Volumes/long990max/project/openharmony-mlx/model.bin "<|channel|>"
|
||||
|
||||
# python tests/token_uuid_slot.py /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin "<|message|>"
|
||||
# python tests/token_uuid_slot.py /Volumes/long990max/project/openharmony-mlx/model.bin "<|message|>"
|
||||
|
||||
# python tests/token_uuid_slot.py /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin "<|return|>"
|
||||
# python tests/token_uuid_slot.py /Volumes/long990max/project/openharmony-mlx/model.bin "<|return|>"
|
||||
Reference in New Issue
Block a user