Compare commits

..

3 Commits

Author SHA1 Message Date
5c5838605c 增加权重转化时候的检查脚本 2025-10-08 19:59:25 +08:00
9974fc7a00 uncensored 版本的权重转化 2025-10-08 19:57:47 +08:00
753b3f9dc8 add use description 2025-10-08 19:57:03 +08:00
7 changed files with 782 additions and 1 deletions

View File

@@ -0,0 +1,500 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GPT-OSS → Metal bin converter (safe, sharded-aware)
- Supports single or sharded safetensors (index.json)
- Compatible with the OpenHarmony MLX metal backend
- Works with fine-tuned GPT-OSS-20B variants (Jinx naming fallback)
- Safer MXFP4 scales write (int16 + clamp + bias)
- Tokenizer header/UUID table aligned to Harmony GPT-OSS encoding
Usage:
python create-local-modelnew.py \
-s /path/to/ckpt_dir \
-d /path/to/output/model.bin
"""
import argparse
import os
import math
import sys
import json
import struct
from uuid import UUID
import contextlib
from contextlib import ExitStack
import torch
from safetensors import safe_open
import tiktoken
from tqdm import tqdm
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
# ---------------- CLI ----------------
# Both arguments are mandatory: -s is the checkpoint directory (single
# model.safetensors or sharded with model.safetensors.index.json), -d is the
# output .bin path consumed by the metal runtime.
parser = argparse.ArgumentParser(
    prog="create-local-modelnew.py",
    description="Convert GPTOSS MXFP4 weights to Metal .bin (safe)"
)
parser.add_argument("-s", "--src", metavar="DIR", type=str, required=True, help="Path to safetensors dir (single or sharded)")
parser.add_argument("-d", "--dst", metavar="FILE", type=str, required=True, help="Path to output model.bin")
# --------------- Encoding (match GPT-OSS) ---------------
o200k_base = tiktoken.get_encoding("o200k_base")
harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
# Compose the GPT-OSS view over o200k with the expected special IDs.
# NOTE: keep the official eot id as ""
# NOTE(review): the "" name for 199999 looks like a stripped "<|endoftext|>"
# from a copy/paste -- confirm against the official harmony token table.
o200k_gptoss = tiktoken.Encoding(
    name="o200k_gptoss",
    pat_str=o200k_base._pat_str,
    mergeable_ranks=o200k_base._mergeable_ranks,
    special_tokens={
        "<|reversed199998|>": 199998,
        "": 199999, # official eot id
        "<|untrusted|>": 200000,
        "<|endofuntrusted|>": 200001, # official spelling
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reversed200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        # BUG FIX: the original also mapped "<|reversed200008|>" to 200008,
        # colliding with "<|message|>". The id->name reverse map then made
        # decode_single_token_bytes(200008) return the reserved name, so the
        # <|message|> slot in the UUID table was written as zeros. The
        # reserved tokens are 200009..200011 only.
        "<|reversed200009|>": 200009,
        "<|reversed200010|>": 200010,
        "<|reversed200011|>": 200011,
        "<|call|>": 200012,
        "<|refusal|>": 200013,
    },
)
# --------------- File / token constants ---------------
# 16-byte file magic: ASCII "GPT-OSS v1.0" followed by a 4-byte zero.
FILE_MAGIC = struct.pack('ccccccccccccI', b'G', b'P', b'T', b'-', b'O', b'S', b'S', b' ', b'v', b'1', b'.', b'0', 0)
# Fixed UUIDs identifying each special token in the bin's UUID table.
SPECIAL_TOKEN_UUID = {
    '<|start|>': UUID('55a77c2f-8a01-4c54-8ac2-313bfc7e208d').bytes,
    '<|message|>': UUID('16e40431-f47f-4b22-b59b-8b278fc30a54').bytes,
    '<|end|>': UUID('fcac2f6d-4705-4f6b-b228-642accac7238').bytes,
    '<|return|>': UUID('f799ff69-1992-43c4-a3d8-d831f475dc75').bytes,
    '<|refusal|>': UUID('e15ba702-28c4-4292-ab8f-ffa434709128').bytes,
    '<|constrain|>': UUID('c0bb14c7-6022-49da-ad08-792d67e8b470').bytes,
    '<|channel|>': UUID('fd3dda11-c8ab-4033-876e-d93deb172c93').bytes,
    '<|call|>': UUID('1220f796-e388-4de5-b487-fe2eb5fe03c0').bytes,
    '<|untrusted|>': UUID('07d7da55-b346-4cff-8b37-7cefacf8a3e8').bytes,
    '<|endofuntrusted|>': UUID('f265bd9c-c717-469e-a447-920687d65d90').bytes,
    # NOTE: EOT ("") has no UUID
}
INCLUDE_SPECIAL_TOKENS = [
    "<|start|>", "<|message|>", "<|end|>", "<|return|>",
    "<|refusal|>", "<|constrain|>", "<|channel|>", "<|call|>",
    "<|untrusted|>", "<|endofuntrusted|>",
]
GPTOSS_MODEL_UUID = UUID('df52dc86-1789-4ed0-a295-66f10508145b').bytes
# FIX: the original built this UUID with an obscure
# '...588b351c56d'.replace('588', 'd588') "formatting guard"; str.replace
# rewrites EVERY occurrence, so any future edit to the string could silently
# corrupt the UUID. Spell the literal out directly (same 16 bytes).
APPLE_GPU_LAYOUT_UUID = UUID('229177a8-5775-4268-bfd8-d588b351c56d').bytes
TIKTOKEN_TOKENIZER_UUID = UUID('7401aded-2a95-40cb-b782-9ccebaafe72b').bytes
UE8_OFFSET = 14 # bias for MXFP4 block scales (UE8)
# --------------- IO helpers ---------------
def write_file_header(f):
    # Emit the 16-byte magic ("GPT-OSS v1.0" + u32 zero) at the stream start.
    f.write(FILE_MAGIC)
def write_tokenizer_header(f, num_special_tokens:int, num_text_tokens:int, regex_size:int, tokens_size:int):
    """Write the tokenizer section header.

    Layout: 16-byte tokenizer UUID followed by four little-endian uint32
    counts (special tokens, text tokens, regex byte size incl. NUL, total
    serialized text-token bytes).
    """
    f.write(TIKTOKEN_TOKENIZER_UUID)
    f.write(struct.pack('<4I', num_special_tokens, num_text_tokens, regex_size, tokens_size))
# model header layout: 6*U32, 1*F32, 4*U32, 6*F32 (total 17 fields)
# Keep exactly in sync with the metal runtime reader.
def write_model_header(f,
    context_length:int, num_blocks:int, num_experts:int, num_active_experts:int,
    embedding_dim:int, mlp_dim:int, swiglu_limit:float, head_dim:int, num_heads:int,
    num_kv_heads:int, attention_window:int, rope_theta:float, interpolation_scale:float,
    yarn_offset:float, yarn_scale:float, yarn_multiplier:float, rmsnorm_epsilon:float,
):
    """Write the model header: model UUID, 17 packed fields, layout UUID.

    The single format string '<6If4I6f' produces byte-for-byte the same
    little-endian stream as seventeen individual packs.
    """
    f.write(GPTOSS_MODEL_UUID)
    f.write(struct.pack(
        '<6If4I6f',
        context_length, num_blocks, num_experts, num_active_experts,
        embedding_dim, mlp_dim,
        swiglu_limit,
        head_dim, num_heads, num_kv_heads, attention_window,
        rope_theta, interpolation_scale, yarn_offset, yarn_scale,
        yarn_multiplier, rmsnorm_epsilon,
    ))
    f.write(APPLE_GPU_LAYOUT_UUID)
def write_padding(out_file, alignment_multiple=16384):
    """Pad *out_file* with zero bytes up to the next multiple of
    *alignment_multiple*; a no-op when already aligned."""
    position = out_file.tell()
    remainder = position % alignment_multiple
    if remainder != 0:
        out_file.write(b"\x00" * (alignment_multiple - remainder))
def write_bytes_from_tensor(out_file, t: torch.Tensor, align=16):
    # Align the stream, then dump the tensor's raw bytes (reinterpreted as
    # uint8) without any conversion.
    write_padding(out_file, align)
    out_file.write(t.view(torch.uint8).numpy().tobytes())
def write_embedding_weight(out_file, weight: torch.Tensor):
    # Embedding table must be fp8-e4m3 or bf16; written verbatim, 16-aligned.
    assert weight.dtype in (torch.float8_e4m3fn, torch.bfloat16)
    write_bytes_from_tensor(out_file, weight, 16)
def write_rmsnorm_gain(out_file, gain: torch.Tensor):
    # RMSNorm gains must already be bf16; written verbatim, 16-aligned.
    assert gain.dtype == torch.bfloat16
    write_bytes_from_tensor(out_file, gain, 16)
def write_attn_sink(out_file, sink: torch.Tensor):
    # Attention sink logits must be bf16; written verbatim, 16-aligned.
    assert sink.dtype == torch.bfloat16
    write_bytes_from_tensor(out_file, sink, 16)
def write_linear_weight(out_file, *args: torch.Tensor):
    """Write one 16-byte alignment pad, then the raw bytes of every tensor
    back to back (weight followed by bias, typically) with no padding
    between them."""
    write_padding(out_file, 16)
    out_file.write(b"".join(t.view(torch.uint8).numpy().tobytes() for t in args))
def write_ue8_with_bias(out_file, scales: torch.Tensor, align=16, offset: int = UE8_OFFSET):
    """Write MXFP4 block scales as UE8 bytes with *offset* added.

    The addition is performed in int16 and clamped to [0, 255] before the
    uint8 cast, so the bias can never wrap around.
    """
    biased = (scales.to(torch.int16) + int(offset)).clamp(0, 255).to(torch.uint8)
    write_padding(out_file, align)
    out_file.write(biased.numpy().tobytes())
# ---- open single or sharded safetensors (index.json) ----
@contextlib.contextmanager
def open_st_reader(srcdir: str):
    """Yield a tensor reader for the checkpoint directory *srcdir*.

    The reader exposes ``get_tensor(key)`` and ``keys()`` and works with
    either a single ``model.safetensors`` file or a sharded checkpoint
    described by ``model.safetensors.index.json``. Shard files are opened
    lazily and stay open until the context exits.

    Raises FileNotFoundError when neither layout is present.
    """
    single = os.path.join(srcdir, "model.safetensors")
    index = os.path.join(srcdir, "model.safetensors.index.json")
    if os.path.exists(single):
        with safe_open(single, framework="pt", device="cpu") as f:
            class R:
                def get_tensor(self, k): return f.get_tensor(k)
                def keys(self): return list(f.keys())
            yield R()
            return
    if os.path.exists(index):
        # FIX: the original used json.load(open(index, "r")), leaking the
        # file handle; close it deterministically.
        with open(index, "r") as index_file:
            wm = json.load(index_file)["weight_map"]
        with ExitStack() as stack:
            cache = {}
            def get_handle(relpath):
                # Open each shard once, on first use; ExitStack closes all.
                if relpath not in cache:
                    cache[relpath] = stack.enter_context(
                        safe_open(os.path.join(srcdir, relpath), framework="pt", device="cpu")
                    )
                return cache[relpath]
            class R:
                def get_tensor(self, k):
                    fp = wm[k]
                    return get_handle(fp).get_tensor(k)
                def keys(self):
                    return list(wm.keys())
            yield R()
            return
    raise FileNotFoundError("Neither model.safetensors nor model.safetensors.index.json found")
# --------------- config helpers ---------------
# FIX: use a private sentinel instead of None so callers may legitimately
# pass default=None (the original raised KeyError in that case because it
# tested ``default is not None``).
_MISSING = object()
def pick(config, *keys, default=_MISSING):
    """Return the first value found in *config* for any of *keys*.

    Each key may be a dotted path (e.g. "rope_scaling.factor") resolved
    through nested dicts. If no key resolves, *default* is returned when
    supplied (any value, including None); otherwise KeyError is raised.
    """
    for k in keys:
        v = config
        ok = True
        for part in k.split('.'):
            if isinstance(v, dict) and part in v:
                v = v[part]
            else:
                ok = False
                break
        if ok:
            return v
    if default is not _MISSING:
        return default
    raise KeyError(f"Missing any of keys: {keys}")
def get_tensor_first(src, cands):
    """Return ``(tensor, key)`` for the first key in *cands* that *src* can
    load; raise KeyError (chained to the last miss) listing a sample of the
    available keys when none match."""
    last = None
    for candidate in cands:
        try:
            return src.get_tensor(candidate), candidate
        except KeyError as miss:
            last = miss
    sample = list(src.keys())[:20]
    raise KeyError(f"none of {cands}; sample keys: {sample}") from last
def get_attn_qkv(src, n, num_q_heads, num_kv_heads, head_dim):
    """Fetch block *n*'s attention QKV weight and bias.

    Prefers a fused ``block.N.attn.qkv`` tensor; otherwise concatenates the
    split q/k/v projections (native or HF-style names) along dim 0. A
    missing bias is replaced with zeros sized to the weight's output rows.
    Returns (weight, bias, kind) where kind is "qkv" or "qkv_split".

    NOTE(review): num_q_heads / num_kv_heads / head_dim are unused in this
    function -- confirm whether they are vestigial before removing them.
    """
    # fused qkv
    try:
        w = src.get_tensor(f"block.{n}.attn.qkv.weight")
        b = src.get_tensor(f"block.{n}.attn.qkv.bias")
        return w, b, "qkv"
    except KeyError:
        # split projections
        q_w,_ = get_tensor_first(src, [f"block.{n}.attn.q_proj.weight", f"model.layers.{n}.self_attn.q_proj.weight"])
        k_w,_ = get_tensor_first(src, [f"block.{n}.attn.k_proj.weight", f"model.layers.{n}.self_attn.k_proj.weight"])
        v_w,_ = get_tensor_first(src, [f"block.{n}.attn.v_proj.weight", f"model.layers.{n}.self_attn.v_proj.weight"])
        w = torch.cat((q_w, k_w, v_w), dim=0).contiguous()
        try:
            q_b,_ = get_tensor_first(src, [f"block.{n}.attn.q_proj.bias", f"model.layers.{n}.self_attn.q_proj.bias"])
            k_b,_ = get_tensor_first(src, [f"block.{n}.attn.k_proj.bias", f"model.layers.{n}.self_attn.k_proj.bias"])
            v_b,_ = get_tensor_first(src, [f"block.{n}.attn.v_proj.bias", f"model.layers.{n}.self_attn.v_proj.bias"])
            b = torch.cat((q_b, k_b, v_b), dim=0).contiguous()
        except KeyError:
            # Bias-free checkpoint: substitute zeros matching the fused rows.
            b = torch.zeros(w.shape[0], dtype=w.dtype)
        return w, b, "qkv_split"
def main(argv):
    """Convert the GPT-OSS checkpoint at --src into a Metal model.bin at --dst.

    Serialization order (must stay in sync with the metal runtime reader):
    file magic -> model header -> tokenizer header -> special-token UUID
    table -> split regex (+NUL) -> text tokens -> embedding -> per-block
    attention/router tensors -> final norm -> unembedding -> per-block
    per-expert MoE tensors.
    """
    opt = parser.parse_args(argv)
    config = json.load(open(os.path.join(opt.src, "config.json"), "r"))
    # ---- hyperparams (with fallbacks for popular finetune configs) ----
    num_blocks = int(pick(config, "num_hidden_layers"))
    num_experts = int(pick(config, "num_experts", "num_local_experts"))
    num_active_experts = int(pick(config, "num_active_experts", "experts_per_token", "num_experts_per_tok", default=4))
    num_q_heads = int(pick(config, "num_attention_heads"))
    num_kv_heads = int(pick(config, "num_key_value_heads"))
    head_dim = int(pick(config, "head_dim"))
    embedding_dim = int(pick(config, "hidden_size"))
    mlp_dim = int(pick(config, "intermediate_size"))
    swiglu_limit = float(config.get("swiglu_limit", 7.0))
    rope_theta = float(pick(config, "rope_theta"))
    attention_window = int(pick(config, "sliding_window"))
    initial_context_length = int(pick(config, "initial_context_length"))
    # RoPE scaling may appear as an HF-style nested dict or as flat keys.
    rope_scaling = config.get("rope_scaling")
    if isinstance(rope_scaling, dict):
        rope_scaling_factor = float(rope_scaling.get("factor", 1.0))
        rope_ntk_alpha = float(rope_scaling.get("ntk_alpha", 1.0))
        rope_ntk_beta = float(rope_scaling.get("ntk_beta", 32.0))
    else:
        rope_scaling_factor = float(pick(config, "rope_scaling_factor", default=1.0))
        rope_ntk_alpha = float(pick(config, "rope_ntk_alpha", default=1.0))
        rope_ntk_beta = float(pick(config, "rope_ntk_beta", default=32.0))
    # ---- tokenizer sizes ----
    # Text tokens are serialized as uint16 length + raw bytes; special tokens
    # occupy UUID-table slots instead and contribute nothing to tokens_size.
    tokens_size = 0
    num_text_tokens = 0
    for t in range(o200k_gptoss.n_vocab):
        if not harmony_encoding.is_special_token(t):
            tb = o200k_gptoss.decode_single_token_bytes(t)
            assert len(tb) > 0
            tokens_size += len(tb) + 2 # uint16 length + data
            num_text_tokens += 1
    num_included_tokens = o200k_gptoss.n_vocab
    print(f"[tokenizer] vocab={num_included_tokens} text={num_text_tokens} special={num_included_tokens - num_text_tokens}")
    with open(opt.dst, "wb") as dst:
        with open_st_reader(opt.src) as src:
            write_file_header(dst)
            # YARN params derived from NTK bounds
            yarn_low = (head_dim/2) * math.log(initial_context_length / (rope_ntk_beta * 2 * math.pi)) / math.log(rope_theta)
            yarn_high = (head_dim/2) * math.log(initial_context_length / (rope_ntk_alpha * 2 * math.pi)) / math.log(rope_theta)
            write_model_header(
                dst,
                context_length=int(initial_context_length * rope_scaling_factor),
                num_blocks=num_blocks,
                num_experts=num_experts,
                num_active_experts=num_active_experts,
                embedding_dim=embedding_dim,
                mlp_dim=mlp_dim,
                swiglu_limit=swiglu_limit,
                head_dim=head_dim,
                num_heads=num_q_heads,
                num_kv_heads=num_kv_heads,
                attention_window=attention_window,
                rope_theta=rope_theta,
                interpolation_scale=1.0/rope_scaling_factor,
                yarn_offset=-yarn_low/(yarn_high-yarn_low),
                yarn_scale=1.0/(yarn_high-yarn_low),
                yarn_multiplier=0.1*math.log(rope_scaling_factor)+1.0,
                rmsnorm_epsilon=1e-5,
            )
            write_tokenizer_header(
                dst,
                num_special_tokens=num_included_tokens - num_text_tokens,
                num_text_tokens=num_text_tokens,
                regex_size=len(o200k_gptoss._pat_str.encode("ascii")) + 1,
                tokens_size=tokens_size,
            )
            # UUID table for special tokens [num_text .. vocab)
            # Tokens without an assigned UUID get 16 zero bytes.
            for token_idx in range(num_text_tokens, num_included_tokens):
                token = o200k_gptoss.decode_single_token_bytes(token_idx).decode('ascii', errors='ignore')
                if token in INCLUDE_SPECIAL_TOKENS:
                    dst.write(SPECIAL_TOKEN_UUID[token])
                else:
                    dst.write(bytes(16))
            # regex + NUL
            dst.write(o200k_gptoss._pat_str.encode("ascii"))
            dst.write(struct.pack('B', 0))
            # text tokens (uint16 length prefix + raw bytes each); cross-check
            # the byte count against the size announced in the header above.
            written = 0
            for t in range(num_text_tokens):
                tb = o200k_gptoss.decode_single_token_bytes(t)
                dst.write(struct.pack('<H', len(tb)))
                dst.write(tb)
                written += len(tb) + 2
            assert written == tokens_size
            write_padding(dst)
            # ---- embedding ----
            emb, emb_key = get_tensor_first(src, [
                "embedding.weight", "model.embed_tokens.weight", "embed_tokens.weight", "tok_embeddings.weight", "model.wte.weight"
            ])
            print(f"[ok] embedding: {emb_key} shape={tuple(emb.shape)}")
            # Truncate padded vocab rows down to the included token count.
            emb = emb[:min(emb.shape[0], num_included_tokens), :]
            write_embedding_weight(dst, emb)
            # ---- blocks ----
            for n in tqdm(range(num_blocks), desc="blocks"):
                attn_norm, key = get_tensor_first(src, [
                    f"block.{n}.attn.norm.scale",
                    f"model.layers.{n}.input_layernorm.weight",
                ])
                if n == 0: print(f"[ok] attn norm: {key}")
                write_rmsnorm_gain(dst, attn_norm)
                qkv_w, qkv_b, src_kind = get_attn_qkv(src, n, num_q_heads, num_kv_heads, head_dim)
                if n == 0: print(f"[ok] attn qkv source: {src_kind}")
                # Re-interleave the rotary halves of Q/K and bake scales into
                # the weights; 0.5 * 0.25 == 1/8 == 64**-0.5, presumably the
                # 1/sqrt(head_dim) attention scale -- hence the assert below.
                for qkv in (qkv_w, qkv_b):
                    qk = qkv[:head_dim*(num_q_heads+num_kv_heads), ...].contiguous()
                    v = qkv[head_dim*(num_q_heads+num_kv_heads):, ...].contiguous()
                    qk = qk.view(num_q_heads+num_kv_heads, 2, head_dim//2, -1).transpose(1,2).reshape(num_q_heads+num_kv_heads, head_dim, -1)
                    q = qk[:num_q_heads, ...]
                    k = qk[num_q_heads:, ...]
                    assert head_dim == 64, "assumes head_dim==64 for baked scale"
                    q *= 0.5; k *= 0.25
                    v = v.view(num_kv_heads, head_dim, -1)
                    qkv.copy_(torch.cat((q, k, v), dim=0).reshape(*qkv.shape))
                write_linear_weight(dst, qkv_w, qkv_b)
                sinks, sinks_key = get_tensor_first(src, [
                    f"block.{n}.attn.sinks",
                    f"model.layers.{n}.self_attn.sinks",
                ])
                if n == 0: print(f"[ok] attn sinks: {sinks_key}")
                write_attn_sink(dst, sinks)
                attn_out_w,_ = get_tensor_first(src, [
                    f"block.{n}.attn.out.weight",
                    f"model.layers.{n}.self_attn.o_proj.weight",
                ])
                try:
                    attn_out_b,_ = get_tensor_first(src, [
                        f"block.{n}.attn.out.bias",
                        f"model.layers.{n}.self_attn.o_proj.bias",
                    ])
                except KeyError:
                    # Bias-free finetunes: write zeros of matching width.
                    attn_out_b = torch.zeros(attn_out_w.shape[0], dtype=attn_out_w.dtype)
                write_linear_weight(dst, attn_out_w, attn_out_b)
                mlp_norm,_ = get_tensor_first(src, [
                    f"block.{n}.mlp.norm.scale",
                    f"model.layers.{n}.post_attention_layernorm.weight",
                ])
                if n == 0: print(f"[ok] mlp norm")
                write_rmsnorm_gain(dst, mlp_norm)
                router_w,_ = get_tensor_first(src, [
                    f"block.{n}.mlp.gate.weight", # router logits
                    f"model.layers.{n}.mlp.router.weight",
                ])
                try:
                    router_b,_ = get_tensor_first(src, [
                        f"block.{n}.mlp.gate.bias",
                        f"model.layers.{n}.mlp.router.bias",
                    ])
                except KeyError:
                    router_b = torch.zeros(router_w.shape[0], dtype=router_w.dtype)
                write_linear_weight(dst, router_w, router_b)
            final_norm,_ = get_tensor_first(src, ["norm.scale", "model.norm.weight"])
            print(f"[ok] final norm")
            write_rmsnorm_gain(dst, final_norm)
            unemb, unemb_key = get_tensor_first(src, ["unembedding.weight", "lm_head.weight", "model.lm_head.weight"])
            print(f"[ok] unembedding: {unemb_key} shape={tuple(unemb.shape)}")
            unemb = unemb[:min(unemb.shape[0], num_included_tokens), :]
            write_linear_weight(dst, unemb)
            # ---- MoE (per-expert grouped); support native + Jinx naming ----
            for n in tqdm(range(num_blocks), desc="experts"):
                try:
                    mlp1_blocks,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_weight.blocks"])
                    mlp1_scales,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_weight.scales"])
                    mlp1_bias, _ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_bias"])
                    mlp2_blocks,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_weight.blocks"])
                    mlp2_scales,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_weight.scales"])
                    mlp2_bias, _ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_bias"])
                except KeyError:
                    # Jinx naming: fused gate+up becomes "mlp1"; down is "mlp2"
                    gate_up_blk,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_blocks"])
                    gate_up_scl,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_scales"])
                    gate_up_bia,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_bias"])
                    down_blk,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_blocks"])
                    down_scl,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_scales"])
                    down_bia,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_bias"])
                    mlp1_blocks = gate_up_blk
                    mlp1_scales = gate_up_scl
                    mlp1_bias = gate_up_bia
                    mlp2_blocks = down_blk
                    mlp2_scales = down_scl
                    mlp2_bias = down_bia
                assert mlp1_blocks.shape[0] == mlp2_blocks.shape[0] == num_experts, \
                    f"experts dim mismatch at block {n}: {mlp1_blocks.shape} vs {mlp2_blocks.shape}"
                write_padding(dst)
                # Interleave per-expert: mlp1 blocks/scales/bias then mlp2.
                for e in range(num_experts):
                    write_bytes_from_tensor(dst, mlp1_blocks[e, ...], 16)
                    write_ue8_with_bias(dst, mlp1_scales[e, ...], 16)
                    write_bytes_from_tensor(dst, mlp1_bias[e, ...], 16)
                    write_bytes_from_tensor(dst, mlp2_blocks[e, ...], 16)
                    write_ue8_with_bias(dst, mlp2_scales[e, ...], 16)
                    write_bytes_from_tensor(dst, mlp2_bias[e, ...], 16)
    print(f"[done] Wrote {opt.dst}")
if __name__ == "__main__":
    main(sys.argv[1:])
"""
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py \
-s /Volumes/long990max/gpustack_data/huihui-ai/Huihui-gpt-oss-20b-mxfp4-abliterated \
-d /Volumes/long990max/project/openharmony-mlx/model.bin
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py \
-s /Volumes/long990max/gpustack_data/huizimao/gpt-oss-20b-uncensored-mxfp4 \
-d /Volumes/long990max/project/openharmony-mlx/model.bin
想要“拒绝更少”就选 huizimao/gpt-oss-20b-uncensored-mxfp4。
"""

55
tests/bin_header_dump.py Normal file
View File

@@ -0,0 +1,55 @@
# tests/bin_header_dump.py  (original header comment named the wrong file)
import struct, uuid, sys, tiktoken
from openai_harmony import load_harmony_encoding, HarmonyEncodingName

# Model-header field layout written by the converter: 6*u32, f32, 4*u32, 6*f32.
FMT_MODEL = "<IIIIII f IIII f f f f f f"
# Special-token name -> id map for the GPT-OSS view of o200k.
SPECIAL = {
    "<|reversed199998|>": 199998,
    "": 199999,
    "<|untrusted|>": 200000,
    "<|endofuntrusted|>": 200001,
    "<|return|>": 200002,
    "<|constrain|>": 200003,
    "<|reversed200004|>": 200004,
    "<|channel|>": 200005,
    "<|start|>": 200006,
    "<|end|>": 200007,
    "<|message|>": 200008,
    "<|reversed200008|>": 200008,
    "<|reversed200009|>": 200009,
    "<|reversed200010|>": 200010,
    "<|reversed200011|>": 200011,
    "<|call|>": 200012,
    "<|refusal|>": 200013,
}
def table_start_offset(f):
    # Skip the fixed-size headers and return the tokenizer header fields
    # plus the byte offset where the special-token UUID table begins.
    f.seek(0)
    f.read(16) # magic
    f.read(16) # model uuid
    f.read(struct.calcsize(FMT_MODEL))
    f.read(16) # apple uuid
    tok_uuid = uuid.UUID(bytes=f.read(16))
    ns, nt, rs, ts = struct.unpack("<IIII", f.read(16))
    table_off = f.tell()
    return tok_uuid, ns, nt, rs, ts, table_off
def show(path, token):
    # Print the UUID stored for *token* inside the .bin at *path*.
    tok_id = SPECIAL[token]
    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    o200k = tiktoken.get_encoding("o200k_base")
    # Count text tokens with the same harmony criterion used when writing
    # the bin, so the special-token slot arithmetic matches the converter.
    num_text = sum(1 for t in range(o200k.n_vocab) if not enc.is_special_token(t))
    slot = tok_id - num_text
    with open(path, "rb") as f:
        tok_uuid, ns, nt, rs, ts, table_off = table_start_offset(f)
        f.seek(table_off + 16*slot)
        u = uuid.UUID(bytes=f.read(16))
        print(f"{path}\n tokenizer_uuid: {tok_uuid}\n header_nt={nt}, header_ns={ns}\n"
              f" token={token} (id={tok_id}) -> slot={slot}, uuid={u}\n")
if __name__ == "__main__":
    show(sys.argv[1], sys.argv[2])

52
tests/peek_scales.py Normal file
View File

@@ -0,0 +1,52 @@
# tests/peek_scales.py -- inspect MXFP4 scale maxima (single or sharded safetensors)
import json, os, sys
from safetensors import safe_open
def iter_keys(srcdir):
    """Yield (handles, keys) for the checkpoint under *srcdir*.

    The yield happens while the safetensors files are still open, so the
    caller MUST keep this generator object alive for as long as it uses
    the handles (see peek below).
    """
    single = os.path.join(srcdir, "model.safetensors")
    index = os.path.join(srcdir, "model.safetensors.index.json")
    if os.path.exists(single):
        with safe_open(single, framework="pt", device="cpu") as f:
            yield f, list(f.keys())
    else:
        # FIX: close index.json deterministically (was json.load(open(...))).
        with open(index) as index_file:
            wm = json.load(index_file)["weight_map"]
        opened = {}
        from contextlib import ExitStack
        with ExitStack() as stack:
            for rel in set(wm.values()):
                opened[rel] = stack.enter_context(safe_open(os.path.join(srcdir, rel), framework="pt", device="cpu"))
            yield opened, list(wm.keys())
def peek(srcdir):
    """Print per-layer MXFP4 scale maxima and warn when +14 could overflow."""
    # BUG FIX: the original did ``opened, keys = next(iter_keys(srcdir))``,
    # dropping the only reference to the generator. CPython finalizes the
    # abandoned generator immediately, which unwinds its ``with`` blocks and
    # closes every safetensors handle before any tensor is read. Holding the
    # generator in a local keeps the files open for the whole scan.
    reader = iter_keys(srcdir)
    try:
        opened, keys = next(reader)
        def get(k):
            if isinstance(opened, dict):
                # Sharded mode: naively scan each shard for the key; fine
                # for a diagnostic script.
                for f in opened.values():
                    if k in f.keys():
                        return f.get_tensor(k)
                raise KeyError(k)
            else:
                return opened.get_tensor(k)
        # (dropped the original's unused ``import torch``)
        mx = []
        for n in range(0, 1000):  # probe generously many layer indices by name
            for which in ("mlp1_weight.scales", "mlp2_weight.scales"):
                k = f"block.{n}.mlp.{which}"
                try:
                    t = get(k)
                    mx.append(t.max().item())
                    print(k, "max=", float(mx[-1]))
                except Exception:
                    pass  # missing layer/key: best-effort probing
        if mx:
            m = max(mx)
            print("\nGLOBAL MAX SCALE:", m, " (m + 14 =", m+14, ")")
            if m >= 241:
                print("⚠️ 警告m+14 可能溢出 uint8请用 int16+clamp 写回。")
    finally:
        reader.close()
if __name__ == "__main__":
    peek(sys.argv[1])

48
tests/peek_scales_v2.py Normal file
View File

@@ -0,0 +1,48 @@
# tests/peek_scales_v2.py
import os, json, sys
from safetensors import safe_open
def open_any(srcdir):
    """Return (handles, get, keys) for a single or sharded checkpoint.

    Handles are deliberately kept open (and never closed) for the lifetime
    of this short diagnostic script.
    """
    single = os.path.join(srcdir, "model.safetensors")
    index = os.path.join(srcdir, "model.safetensors.index.json")
    if os.path.exists(single):
        f = safe_open(single, framework="pt", device="cpu")
        return [f], lambda k: f.get_tensor(k), list(f.keys())
    # FIX: close index.json deterministically (was json.load(open(index))).
    with open(index) as index_file:
        wm = json.load(index_file)["weight_map"]
    files = sorted(set(wm.values()))
    opened = [safe_open(os.path.join(srcdir, fp), framework="pt", device="cpu") for fp in files]
    keys = list(wm.keys())
    def get(k):
        for f in opened:
            if k in f.keys(): return f.get_tensor(k)
        raise KeyError(k)
    return opened, get, keys
def main(srcdir):
    """Scan native and Jinx scale tensors, print maxima, warn on overflow."""
    opened, get, keys = open_any(srcdir)
    patterns = [
        "block.{n}.mlp.mlp1_weight.scales",  # openharmony-mlx native naming
        "block.{n}.mlp.mlp2_weight.scales",
        "model.layers.{n}.mlp.experts.gate_up_proj_scales",  # Jinx naming
        "model.layers.{n}.mlp.experts.down_proj_scales",
    ]
    # (dropped the original's unused ``import torch``)
    mx = []
    for n in range(0, 64):  # more than enough layer indices
        for pat in patterns:
            k = pat.format(n=n)
            try:
                t = get(k)
            except Exception:
                continue
            v = float(t.max().item())
            mx.append(v)
            print(k, "max=", v)
    if mx:
        m = max(mx)
        print("\nGLOBAL MAX SCALE:", m, " (m + 14 =", m+14, ")")
        if m + 14 >= 256:
            print("⚠️ 会发生 uint8 溢出,必须用 int16+clamp 的写法。")
if __name__ == "__main__":
    main(sys.argv[1])

16
tests/smoke_metal.py Normal file
View File

@@ -0,0 +1,16 @@
# smoke_metal.py -- drive the metal backend directly for 8 new tokens
from gpt_oss.responses_api.inference.metal import setup_model
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
infer_next_token = setup_model("/Volumes/long990max/project/openharmony-mlx/model.bin")  # change to your model path
ids = enc.encode("你好,给我一句话的回答:(英文)")[:128]  # prompt token ids
new = []
for _ in range(8):
    # NOTE(review): new_request=False even on the very first call -- confirm
    # the metal backend does not require new_request=True to reset its cache.
    tid = infer_next_token(ids + new, temperature=0.7, new_request=False)
    new.append(tid)
print("new token ids:", new)
print("decoded:", enc.decode(ids + new))

62
tests/token_uuid_slot.py Normal file
View File

@@ -0,0 +1,62 @@
# token_uuid_slot.py
import struct, uuid, sys, pathlib, tiktoken
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
# BUG FIX: the original format string ended in only FIVE floats, but the
# converter's write_model_header emits SIX f32 fields after the four u32s
# (rope_theta, interpolation_scale, yarn_offset, yarn_scale, yarn_multiplier,
# rmsnorm_epsilon). calcsize was therefore 64 instead of 68 and every offset
# after the model header -- tokenizer UUID, counts, UUID table -- was read
# 4 bytes early. Now matches tests/bin_header_dump.py.
FMT_MODEL = "<IIIIII f IIII f f f f f f"
SPECIAL = {
    "<|reversed199998|>": 199998,
    "": 199999,
    "<|untrusted|>": 200000,
    "<|endofuntrusted|>": 200001,
    "<|return|>": 200002,
    "<|constrain|>": 200003,
    "<|reversed200004|>": 200004,
    "<|channel|>": 200005,
    "<|start|>": 200006,
    "<|end|>": 200007,
    "<|message|>": 200008,
    "<|reversed200008|>": 200008,
    "<|reversed200009|>": 200009,
    "<|reversed200010|>": 200010,
    "<|reversed200011|>": 200011,
    "<|call|>": 200012,
    "<|refusal|>": 200013,
}
def header_and_table_off(f):
    """Skip the fixed headers; return the tokenizer header fields and the
    byte offset where the special-token UUID table starts."""
    f.read(16) # magic
    f.read(16) # model uuid
    f.read(struct.calcsize(FMT_MODEL))
    f.read(16) # apple uuid
    tok_uuid = uuid.UUID(bytes=f.read(16))
    ns, nt, rs, ts = struct.unpack("<IIII", f.read(16))
    table_off = f.tell() # start of the UUID table
    return tok_uuid, ns, nt, rs, ts, table_off
def show(path, token):
    """Print the UUID stored for *token* in the .bin at *path*."""
    tok_id = SPECIAL[token]
    enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    o200k = tiktoken.get_encoding("o200k_base")
    # Same text/special split as used when the weights were converted.
    num_text = sum(1 for t in range(o200k.n_vocab) if not enc.is_special_token(t))
    slot = tok_id - num_text
    with open(path, "rb") as f:
        tok_uuid, ns, nt, rs, ts, table_off = header_and_table_off(f)
        f.seek(table_off + 16*slot)
        u = uuid.UUID(bytes=f.read(16))
    print(f"{path}\n tokenizer_uuid: {tok_uuid}\n num_text={nt}, num_special={ns}")
    print(f" token={token} (id={tok_id}) -> slot={slot}, uuid={u}\n")
if __name__ == "__main__":
    # usage: python token_uuid_slot.py <bin> "<|channel|>"
    show(sys.argv[1], sys.argv[2])
# python tests/token_uuid_slot.py /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin "<|channel|>"
# python tests/token_uuid_slot.py /Volumes/long990max/project/openharmony-mlx/model.bin "<|channel|>"
# python tests/token_uuid_slot.py /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin "<|message|>"
# python tests/token_uuid_slot.py /Volumes/long990max/project/openharmony-mlx/model.bin "<|message|>"
# python tests/token_uuid_slot.py /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin "<|return|>"
# python tests/token_uuid_slot.py /Volumes/long990max/project/openharmony-mlx/model.bin "<|return|>"

View File

@@ -113,9 +113,57 @@ cd ~/.cache/openai_harmony/
wget https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
export OPENAI_HARMONY_CACHE_DIR=~/.cache/openai_harmony/
chmod 755 ~/.cache/openai_harmony/
python -m gpt_oss.responses_api.serve --inference-backend metal --checkpoint /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin --host 0.0.0.0 --port 8080
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py -s /Volumes/long990max/gpustack_data/huihui-ai/Huihui-gpt-oss-20b-mxfp4-abliterated -d /Volumes/long990max/project/openharmony-mlx/model.bin
micromamba activate gptoss && python -m gpt_oss.responses_api.serve --inference-backend metal --host 0.0.0.0 --port 8080 --checkpoint /Volumes/long990max/gpustack_data/openai/gpt-oss-20b/metal/model.bin
# 启动拒绝少的权重
micromamba activate gptoss && python -m gpt_oss.responses_api.serve \
--inference-backend metal \
--checkpoint /Volumes/long990max/project/openharmony-mlx/pth/gpt-oss-20b-uncensored-mxfp4/metal/model.bin \
--host 0.0.0.0 --port 8080
```
## gpt-oss-120b 模型选择
huizimao/gpt-oss-120b-uncensored-bf16(LoRA,BF16)
在 Amazon FalseReject 测试集(300条)上的 误拒率≈6%(原版≈70%)。适合你追求最低误拒、且硬件吃得下 BF16 的场景。
huizimao/gpt-oss-120b-uncensored-mxfp4(LoRA + PTQ,MXFP4)
同一评测设置下误拒率≈24%;相比 BF16 版本误拒稍高,但体积/部署友好,便于与你现在的 Metal/MXFP4 流水线对接。
## 以后要转其它 finetune 的 safetensors同 20B提前确认这几件事最小清单
config.json 至少含(或能推导):
num_hidden_layers, hidden_size, intermediate_size
num_attention_heads, num_key_value_heads, head_dim(若 head_dim != 64 就不能 bake Q/K 缩放)
sliding_window, rope_theta, initial_context_length
rope_scaling_factor 或 rope_scaling.factor;rope_ntk_alpha/beta(默认 1.0/32.0)
MoE:num_experts 或 num_local_experts;num_active_experts/experts_per_token(默认 4)
权重命名是否落在这两类之一(脚本已兼容):
原生block.N.attn.qkv.* 或 q_proj/k_proj/v_proj.*mlp.mlp{1,2}_weight.{blocks,scales} + mlp{1,2}_bias
Jinxmodel.layers.N.self_attn.{q,k,v}_proj_*mlp.experts.{gate_up,down}_proj_{blocks,scales,bias}
特殊 Token必须与 Harmony GPT-OSS 的映射一致(我脚本里写死了)。常见问题是有人把
"<|endofuntrusted|>" 拼成 end_untrusted —— 这会导致 UUID 表错位;我这份脚本固定了官方拼写。
MXFP4 scales随手跑一眼我给你的 peek_scales_v2.py 就行)。如果 max + 14 >= 256脚本的 clamp 就会生效,避免坏值写入。
快速自测:除了上面的 curl SSE建议每次都跑一下
tests/token_uuid_slot.py 对 <|channel|> / <|message|> / <|return|> / <|call|> 看 slot 与 UUID 是否匹配;
tests/smoke_metal.py 推几个 token确认不会崩溃或卡死。
以上都满足,基本可以保证在 Codex 的 responses 线上协议里稳定工作。
## cherrystudio 配置
添加提供商选择`OpenAI-Response`