Files
openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py

500 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GPTOSS → Metal bin converter (safe, shardedaware)
- Supports single or sharded safetensors (index.json)
- Compatible with OpenHarmonyMLX metal backend
- Works with finetuned GPTOSS20B variants (Jinx naming fallback)
- Safer MXFP4 scales write (int16 + clamp + bias)
- Tokenizer header/UUID table aligned to Harmony GPTOSS encoding
Usage:
python create-local-modelnew.py \
-s /path/to/ckpt_dir \
-d /path/to/output/model.bin
"""
import argparse
import os
import math
import sys
import json
import struct
from uuid import UUID
import contextlib
from contextlib import ExitStack
import torch
from safetensors import safe_open
import tiktoken
from tqdm import tqdm
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
# ---------------- CLI ----------------
parser = argparse.ArgumentParser(
prog="create-local-modelnew.py",
description="Convert GPTOSS MXFP4 weights to Metal .bin (safe)"
)
parser.add_argument("-s", "--src", metavar="DIR", type=str, required=True, help="Path to safetensors dir (single or sharded)")
parser.add_argument("-d", "--dst", metavar="FILE", type=str, required=True, help="Path to output model.bin")
# --------------- Encoding (match GPTOSS) ---------------
o200k_base = tiktoken.get_encoding("o200k_base")
harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
# Compose the GPTOSS view over o200k with the expected special IDs
# NOTE: keep the official eot id as ""
o200k_gptoss = tiktoken.Encoding(
name="o200k_gptoss",
pat_str=o200k_base._pat_str,
mergeable_ranks=o200k_base._mergeable_ranks,
special_tokens={
"<|reversed199998|>": 199998,
"": 199999, # official eot id
"<|untrusted|>": 200000,
"<|endofuntrusted|>": 200001, # official spelling
"<|return|>": 200002,
"<|constrain|>": 200003,
"<|reversed200004|>": 200004,
"<|channel|>": 200005,
"<|start|>": 200006,
"<|end|>": 200007,
"<|message|>": 200008,
"<|reversed200008|>": 200008,
"<|reversed200009|>": 200009,
"<|reversed200010|>": 200010,
"<|reversed200011|>": 200011,
"<|call|>": 200012,
"<|refusal|>": 200013,
},
)
# --------------- File / token constants ---------------
FILE_MAGIC = struct.pack('ccccccccccccI', b'G', b'P', b'T', b'-', b'O', b'S', b'S', b' ', b'v', b'1', b'.', b'0', 0)
SPECIAL_TOKEN_UUID = {
'<|start|>': UUID('55a77c2f-8a01-4c54-8ac2-313bfc7e208d').bytes,
'<|message|>': UUID('16e40431-f47f-4b22-b59b-8b278fc30a54').bytes,
'<|end|>': UUID('fcac2f6d-4705-4f6b-b228-642accac7238').bytes,
'<|return|>': UUID('f799ff69-1992-43c4-a3d8-d831f475dc75').bytes,
'<|refusal|>': UUID('e15ba702-28c4-4292-ab8f-ffa434709128').bytes,
'<|constrain|>': UUID('c0bb14c7-6022-49da-ad08-792d67e8b470').bytes,
'<|channel|>': UUID('fd3dda11-c8ab-4033-876e-d93deb172c93').bytes,
'<|call|>': UUID('1220f796-e388-4de5-b487-fe2eb5fe03c0').bytes,
'<|untrusted|>': UUID('07d7da55-b346-4cff-8b37-7cefacf8a3e8').bytes,
'<|endofuntrusted|>': UUID('f265bd9c-c717-469e-a447-920687d65d90').bytes,
# NOTE: EOT ("") has no UUID
}
INCLUDE_SPECIAL_TOKENS = [
"<|start|>", "<|message|>", "<|end|>", "<|return|>",
"<|refusal|>", "<|constrain|>", "<|channel|>", "<|call|>",
"<|untrusted|>", "<|endofuntrusted|>",
]
GPTOSS_MODEL_UUID = UUID('df52dc86-1789-4ed0-a295-66f10508145b').bytes
APPLE_GPU_LAYOUT_UUID = UUID('229177a8-5775-4268-bfd8-588b351c56d'.replace('588','d588')).bytes # formatting guard
TIKTOKEN_TOKENIZER_UUID = UUID('7401aded-2a95-40cb-b782-9ccebaafe72b').bytes
UE8_OFFSET = 14 # bias for MXFP4 block scales (UE8)
# --------------- IO helpers ---------------
def write_file_header(f):
f.write(FILE_MAGIC)
def write_tokenizer_header(f, num_special_tokens:int, num_text_tokens:int, regex_size:int, tokens_size:int):
f.write(TIKTOKEN_TOKENIZER_UUID)
f.write(struct.pack('<I', num_special_tokens))
f.write(struct.pack('<I', num_text_tokens))
f.write(struct.pack('<I', regex_size))
f.write(struct.pack('<I', tokens_size))
# model header layout: 6*U32, 1*F32, 4*U32, 6*F32 (total 17 fields)
# Keep exactly in sync with the metal runtime reader.
def write_model_header(f,
context_length:int, num_blocks:int, num_experts:int, num_active_experts:int,
embedding_dim:int, mlp_dim:int, swiglu_limit:float, head_dim:int, num_heads:int,
num_kv_heads:int, attention_window:int, rope_theta:float, interpolation_scale:float,
yarn_offset:float, yarn_scale:float, yarn_multiplier:float, rmsnorm_epsilon:float,
):
f.write(GPTOSS_MODEL_UUID)
f.write(struct.pack('<I', context_length))
f.write(struct.pack('<I', num_blocks))
f.write(struct.pack('<I', num_experts))
f.write(struct.pack('<I', num_active_experts))
f.write(struct.pack('<I', embedding_dim))
f.write(struct.pack('<I', mlp_dim))
f.write(struct.pack('<f', swiglu_limit))
f.write(struct.pack('<I', head_dim))
f.write(struct.pack('<I', num_heads))
f.write(struct.pack('<I', num_kv_heads))
f.write(struct.pack('<I', attention_window))
f.write(struct.pack('<f', rope_theta))
f.write(struct.pack('<f', interpolation_scale))
f.write(struct.pack('<f', yarn_offset))
f.write(struct.pack('<f', yarn_scale))
f.write(struct.pack('<f', yarn_multiplier))
f.write(struct.pack('<f', rmsnorm_epsilon))
f.write(APPLE_GPU_LAYOUT_UUID)
def write_padding(out_file, alignment_multiple=16384):
off = out_file.tell()
pad = (-off) % alignment_multiple
if pad:
out_file.write(bytes(pad))
def write_bytes_from_tensor(out_file, t: torch.Tensor, align=16):
write_padding(out_file, align)
out_file.write(t.view(torch.uint8).numpy().tobytes())
def write_embedding_weight(out_file, weight: torch.Tensor):
assert weight.dtype in (torch.float8_e4m3fn, torch.bfloat16)
write_bytes_from_tensor(out_file, weight, 16)
def write_rmsnorm_gain(out_file, gain: torch.Tensor):
assert gain.dtype == torch.bfloat16
write_bytes_from_tensor(out_file, gain, 16)
def write_attn_sink(out_file, sink: torch.Tensor):
assert sink.dtype == torch.bfloat16
write_bytes_from_tensor(out_file, sink, 16)
def write_linear_weight(out_file, *args: torch.Tensor):
write_padding(out_file, 16)
for t in args:
out_file.write(t.view(torch.uint8).numpy().tobytes())
def write_ue8_with_bias(out_file, scales: torch.Tensor, align=16, offset: int = UE8_OFFSET):
# Avoid uint8 wraparound by accumulating in int16 then clamping to [0,255]
s = scales.to(torch.int16).add_(int(offset)).clamp_(0, 255).to(torch.uint8)
write_padding(out_file, align)
out_file.write(s.numpy().tobytes())
# ---- open single or sharded safetensors (index.json) ----
@contextlib.contextmanager
def open_st_reader(srcdir: str):
single = os.path.join(srcdir, "model.safetensors")
index = os.path.join(srcdir, "model.safetensors.index.json")
if os.path.exists(single):
with safe_open(single, framework="pt", device="cpu") as f:
class R:
def get_tensor(self, k): return f.get_tensor(k)
def keys(self): return list(f.keys())
yield R(); return
if os.path.exists(index):
idx = json.load(open(index, "r"))
wm = idx["weight_map"]
with ExitStack() as stack:
cache = {}
def get_handle(relpath):
if relpath not in cache:
cache[relpath] = stack.enter_context(
safe_open(os.path.join(srcdir, relpath), framework="pt", device="cpu")
)
return cache[relpath]
class R:
def get_tensor(self, k):
fp = wm[k]
return get_handle(fp).get_tensor(k)
def keys(self):
return list(wm.keys())
yield R(); return
raise FileNotFoundError("Neither model.safetensors nor model.safetensors.index.json found")
# --------------- main ---------------
def pick(config, *keys, default=None):
for k in keys:
v = config
ok = True
for part in k.split('.'):
if isinstance(v, dict) and part in v:
v = v[part]
else:
ok = False; break
if ok: return v
if default is not None: return default
raise KeyError(f"Missing any of keys: {keys}")
def get_tensor_first(src, cands):
last = None
for k in cands:
try:
return src.get_tensor(k), k
except KeyError as e:
last = e
sample = list(src.keys())[:20]
raise KeyError(f"none of {cands}; sample keys: {sample}") from last
def get_attn_qkv(src, n, num_q_heads, num_kv_heads, head_dim):
# fused qkv
try:
w = src.get_tensor(f"block.{n}.attn.qkv.weight")
b = src.get_tensor(f"block.{n}.attn.qkv.bias")
return w, b, "qkv"
except KeyError:
# split projections
q_w,_ = get_tensor_first(src, [f"block.{n}.attn.q_proj.weight", f"model.layers.{n}.self_attn.q_proj.weight"])
k_w,_ = get_tensor_first(src, [f"block.{n}.attn.k_proj.weight", f"model.layers.{n}.self_attn.k_proj.weight"])
v_w,_ = get_tensor_first(src, [f"block.{n}.attn.v_proj.weight", f"model.layers.{n}.self_attn.v_proj.weight"])
w = torch.cat((q_w, k_w, v_w), dim=0).contiguous()
try:
q_b,_ = get_tensor_first(src, [f"block.{n}.attn.q_proj.bias", f"model.layers.{n}.self_attn.q_proj.bias"])
k_b,_ = get_tensor_first(src, [f"block.{n}.attn.k_proj.bias", f"model.layers.{n}.self_attn.k_proj.bias"])
v_b,_ = get_tensor_first(src, [f"block.{n}.attn.v_proj.bias", f"model.layers.{n}.self_attn.v_proj.bias"])
b = torch.cat((q_b, k_b, v_b), dim=0).contiguous()
except KeyError:
b = torch.zeros(w.shape[0], dtype=w.dtype)
return w, b, "qkv_split"
def main(argv):
opt = parser.parse_args(argv)
config = json.load(open(os.path.join(opt.src, "config.json"), "r"))
# ---- hyperparams (with fallbacks for popular finetune configs) ----
num_blocks = int(pick(config, "num_hidden_layers"))
num_experts = int(pick(config, "num_experts", "num_local_experts"))
num_active_experts = int(pick(config, "num_active_experts", "experts_per_token", "num_experts_per_tok", default=4))
num_q_heads = int(pick(config, "num_attention_heads"))
num_kv_heads = int(pick(config, "num_key_value_heads"))
head_dim = int(pick(config, "head_dim"))
embedding_dim = int(pick(config, "hidden_size"))
mlp_dim = int(pick(config, "intermediate_size"))
swiglu_limit = float(config.get("swiglu_limit", 7.0))
rope_theta = float(pick(config, "rope_theta"))
attention_window = int(pick(config, "sliding_window"))
initial_context_length = int(pick(config, "initial_context_length"))
rope_scaling = config.get("rope_scaling")
if isinstance(rope_scaling, dict):
rope_scaling_factor = float(rope_scaling.get("factor", 1.0))
rope_ntk_alpha = float(rope_scaling.get("ntk_alpha", 1.0))
rope_ntk_beta = float(rope_scaling.get("ntk_beta", 32.0))
else:
rope_scaling_factor = float(pick(config, "rope_scaling_factor", default=1.0))
rope_ntk_alpha = float(pick(config, "rope_ntk_alpha", default=1.0))
rope_ntk_beta = float(pick(config, "rope_ntk_beta", default=32.0))
# ---- tokenizer sizes ----
tokens_size = 0
num_text_tokens = 0
for t in range(o200k_gptoss.n_vocab):
if not harmony_encoding.is_special_token(t):
tb = o200k_gptoss.decode_single_token_bytes(t)
assert len(tb) > 0
tokens_size += len(tb) + 2 # uint16 length + data
num_text_tokens += 1
num_included_tokens = o200k_gptoss.n_vocab
print(f"[tokenizer] vocab={num_included_tokens} text={num_text_tokens} special={num_included_tokens - num_text_tokens}")
with open(opt.dst, "wb") as dst:
with open_st_reader(opt.src) as src:
write_file_header(dst)
# YARN params derived from NTK bounds
yarn_low = (head_dim/2) * math.log(initial_context_length / (rope_ntk_beta * 2 * math.pi)) / math.log(rope_theta)
yarn_high = (head_dim/2) * math.log(initial_context_length / (rope_ntk_alpha * 2 * math.pi)) / math.log(rope_theta)
write_model_header(
dst,
context_length=int(initial_context_length * rope_scaling_factor),
num_blocks=num_blocks,
num_experts=num_experts,
num_active_experts=num_active_experts,
embedding_dim=embedding_dim,
mlp_dim=mlp_dim,
swiglu_limit=swiglu_limit,
head_dim=head_dim,
num_heads=num_q_heads,
num_kv_heads=num_kv_heads,
attention_window=attention_window,
rope_theta=rope_theta,
interpolation_scale=1.0/rope_scaling_factor,
yarn_offset=-yarn_low/(yarn_high-yarn_low),
yarn_scale=1.0/(yarn_high-yarn_low),
yarn_multiplier=0.1*math.log(rope_scaling_factor)+1.0,
rmsnorm_epsilon=1e-5,
)
write_tokenizer_header(
dst,
num_special_tokens=num_included_tokens - num_text_tokens,
num_text_tokens=num_text_tokens,
regex_size=len(o200k_gptoss._pat_str.encode("ascii")) + 1,
tokens_size=tokens_size,
)
# UUID table for special tokens [num_text .. vocab)
for token_idx in range(num_text_tokens, num_included_tokens):
token = o200k_gptoss.decode_single_token_bytes(token_idx).decode('ascii', errors='ignore')
if token in INCLUDE_SPECIAL_TOKENS:
dst.write(SPECIAL_TOKEN_UUID[token])
else:
dst.write(bytes(16))
# regex + NUL
dst.write(o200k_gptoss._pat_str.encode("ascii"))
dst.write(struct.pack('B', 0))
# text tokens
written = 0
for t in range(num_text_tokens):
tb = o200k_gptoss.decode_single_token_bytes(t)
dst.write(struct.pack('<H', len(tb)))
dst.write(tb)
written += len(tb) + 2
assert written == tokens_size
write_padding(dst)
# ---- embedding ----
emb, emb_key = get_tensor_first(src, [
"embedding.weight", "model.embed_tokens.weight", "embed_tokens.weight", "tok_embeddings.weight", "model.wte.weight"
])
print(f"[ok] embedding: {emb_key} shape={tuple(emb.shape)}")
emb = emb[:min(emb.shape[0], num_included_tokens), :]
write_embedding_weight(dst, emb)
# ---- blocks ----
for n in tqdm(range(num_blocks), desc="blocks"):
attn_norm, key = get_tensor_first(src, [
f"block.{n}.attn.norm.scale",
f"model.layers.{n}.input_layernorm.weight",
])
if n == 0: print(f"[ok] attn norm: {key}")
write_rmsnorm_gain(dst, attn_norm)
qkv_w, qkv_b, src_kind = get_attn_qkv(src, n, num_q_heads, num_kv_heads, head_dim)
if n == 0: print(f"[ok] attn qkv source: {src_kind}")
for qkv in (qkv_w, qkv_b):
qk = qkv[:head_dim*(num_q_heads+num_kv_heads), ...].contiguous()
v = qkv[head_dim*(num_q_heads+num_kv_heads):, ...].contiguous()
qk = qk.view(num_q_heads+num_kv_heads, 2, head_dim//2, -1).transpose(1,2).reshape(num_q_heads+num_kv_heads, head_dim, -1)
q = qk[:num_q_heads, ...]
k = qk[num_q_heads:, ...]
assert head_dim == 64, "assumes head_dim==64 for baked scale"
q *= 0.5; k *= 0.25
v = v.view(num_kv_heads, head_dim, -1)
qkv.copy_(torch.cat((q, k, v), dim=0).reshape(*qkv.shape))
write_linear_weight(dst, qkv_w, qkv_b)
sinks, sinks_key = get_tensor_first(src, [
f"block.{n}.attn.sinks",
f"model.layers.{n}.self_attn.sinks",
])
if n == 0: print(f"[ok] attn sinks: {sinks_key}")
write_attn_sink(dst, sinks)
attn_out_w,_ = get_tensor_first(src, [
f"block.{n}.attn.out.weight",
f"model.layers.{n}.self_attn.o_proj.weight",
])
try:
attn_out_b,_ = get_tensor_first(src, [
f"block.{n}.attn.out.bias",
f"model.layers.{n}.self_attn.o_proj.bias",
])
except KeyError:
attn_out_b = torch.zeros(attn_out_w.shape[0], dtype=attn_out_w.dtype)
write_linear_weight(dst, attn_out_w, attn_out_b)
mlp_norm,_ = get_tensor_first(src, [
f"block.{n}.mlp.norm.scale",
f"model.layers.{n}.post_attention_layernorm.weight",
])
if n == 0: print(f"[ok] mlp norm")
write_rmsnorm_gain(dst, mlp_norm)
router_w,_ = get_tensor_first(src, [
f"block.{n}.mlp.gate.weight", # router logits
f"model.layers.{n}.mlp.router.weight",
])
try:
router_b,_ = get_tensor_first(src, [
f"block.{n}.mlp.gate.bias",
f"model.layers.{n}.mlp.router.bias",
])
except KeyError:
router_b = torch.zeros(router_w.shape[0], dtype=router_w.dtype)
write_linear_weight(dst, router_w, router_b)
final_norm,_ = get_tensor_first(src, ["norm.scale", "model.norm.weight"])
print(f"[ok] final norm")
write_rmsnorm_gain(dst, final_norm)
unemb, unemb_key = get_tensor_first(src, ["unembedding.weight", "lm_head.weight", "model.lm_head.weight"])
print(f"[ok] unembedding: {unemb_key} shape={tuple(unemb.shape)}")
unemb = unemb[:min(unemb.shape[0], num_included_tokens), :]
write_linear_weight(dst, unemb)
# ---- MoE (per-expert grouped); support native + Jinx naming ----
for n in tqdm(range(num_blocks), desc="experts"):
try:
mlp1_blocks,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_weight.blocks"])
mlp1_scales,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_weight.scales"])
mlp1_bias, _ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_bias"])
mlp2_blocks,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_weight.blocks"])
mlp2_scales,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_weight.scales"])
mlp2_bias, _ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_bias"])
except KeyError:
# Jinx naming: fused gate+up becomes "mlp1"; down is "mlp2"
gate_up_blk,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_blocks"])
gate_up_scl,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_scales"])
gate_up_bia,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_bias"])
down_blk,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_blocks"])
down_scl,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_scales"])
down_bia,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_bias"])
mlp1_blocks = gate_up_blk
mlp1_scales = gate_up_scl
mlp1_bias = gate_up_bia
mlp2_blocks = down_blk
mlp2_scales = down_scl
mlp2_bias = down_bia
assert mlp1_blocks.shape[0] == mlp2_blocks.shape[0] == num_experts, \
f"experts dim mismatch at block {n}: {mlp1_blocks.shape} vs {mlp2_blocks.shape}"
write_padding(dst)
for e in range(num_experts):
write_bytes_from_tensor(dst, mlp1_blocks[e, ...], 16)
write_ue8_with_bias(dst, mlp1_scales[e, ...], 16)
write_bytes_from_tensor(dst, mlp1_bias[e, ...], 16)
write_bytes_from_tensor(dst, mlp2_blocks[e, ...], 16)
write_ue8_with_bias(dst, mlp2_scales[e, ...], 16)
write_bytes_from_tensor(dst, mlp2_bias[e, ...], 16)
print(f"[done] Wrote {opt.dst}")
if __name__ == "__main__":
main(sys.argv[1:])
"""
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py \
-s /Volumes/long990max/gpustack_data/huihui-ai/Huihui-gpt-oss-20b-mxfp4-abliterated \
-d /Volumes/long990max/project/openharmony-mlx/model.bin
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py \
-s /Volumes/long990max/gpustack_data/huizimao/gpt-oss-20b-uncensored-mxfp4 \
-d /Volumes/long990max/project/openharmony-mlx/model.bin
想要“拒绝更少”就选 huizimao/gpt-oss-20b-uncensored-mxfp4。
"""