Weight conversion for the uncensored model variant
This commit is contained in:
500
gpt_oss/metal/scripts/create-local-modelnew.py
Normal file
500
gpt_oss/metal/scripts/create-local-modelnew.py
Normal file
@@ -0,0 +1,500 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
GPT‑OSS → Metal bin converter (safe, sharded‑aware)
|
||||||
|
- Supports single or sharded safetensors (index.json)
|
||||||
|
- Compatible with OpenHarmony‑MLX metal backend
|
||||||
|
- Works with finetuned GPT‑OSS‑20B variants (Jinx naming fallback)
|
||||||
|
- Safer MXFP4 scales write (int16 + clamp + bias)
|
||||||
|
- Tokenizer header/UUID table aligned to Harmony GPT‑OSS encoding
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python create-local-modelnew.py \
|
||||||
|
-s /path/to/ckpt_dir \
|
||||||
|
-d /path/to/output/model.bin
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import math
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import struct
|
||||||
|
from uuid import UUID
|
||||||
|
import contextlib
|
||||||
|
from contextlib import ExitStack
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from safetensors import safe_open
|
||||||
|
import tiktoken
|
||||||
|
from tqdm import tqdm
|
||||||
|
from openai_harmony import load_harmony_encoding, HarmonyEncodingName
|
||||||
|
|
||||||
|
# ---------------- CLI ----------------
|
||||||
|
# ---------------- CLI ----------------
# Two required paths: the checkpoint directory to read and the .bin to write.
parser = argparse.ArgumentParser(
    prog="create-local-modelnew.py",
    description="Convert GPT‑OSS MXFP4 weights to Metal .bin (safe)",
)
parser.add_argument(
    "-s", "--src", metavar="DIR", type=str, required=True,
    help="Path to safetensors dir (single or sharded)",
)
parser.add_argument(
    "-d", "--dst", metavar="FILE", type=str, required=True,
    help="Path to output model.bin",
)
|
||||||
|
|
||||||
|
# --------------- Encoding (match GPT‑OSS) ---------------
|
||||||
|
# --------------- Encoding (match GPT-OSS) ---------------
o200k_base = tiktoken.get_encoding("o200k_base")
harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

# Compose the GPT-OSS view over o200k with the expected special IDs.
# Each rank 199998..200013 must map to exactly ONE token string: tiktoken
# builds its rank->bytes decoder from this dict, so a duplicated rank makes
# decode_single_token_bytes() ambiguous and corrupts the special-token UUID
# table written in main().
o200k_gptoss = tiktoken.Encoding(
    name="o200k_gptoss",
    pat_str=o200k_base._pat_str,
    mergeable_ranks=o200k_base._mergeable_ranks,
    special_tokens={
        "<|reversed199998|>": 199998,
        # BUG FIX: this entry was an empty string "" — an empty special token
        # is invalid; 199999 is the official o200k end-of-text rank.
        "<|endoftext|>": 199999,
        "<|untrusted|>": 200000,
        "<|endofuntrusted|>": 200001,  # official spelling
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reversed200004|>": 200004,
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        # BUG FIX: removed "<|reversed200008|>": 200008 — it shared rank
        # 200008 with "<|message|>", shadowed it in the decoder, and caused
        # the <|message|> UUID slot to be written as zeros.
        "<|reversed200009|>": 200009,
        "<|reversed200010|>": 200010,
        "<|reversed200011|>": 200011,
        "<|call|>": 200012,
        "<|refusal|>": 200013,
    },
)
|
||||||
|
|
||||||
|
# --------------- File / token constants ---------------
# 16-byte file magic: ASCII "GPT-OSS v1.0" followed by a zero u32.
# NOTE(review): 'c'*12 + 'I' uses native byte order/alignment; with 12 chars
# before the u32 there is no pad byte, but the u32 endianness follows the
# host — presumably little-endian Apple hardware. Confirm against the reader.
FILE_MAGIC = struct.pack('ccccccccccccI', b'G', b'P', b'T', b'-', b'O', b'S', b'S', b' ', b'v', b'1', b'.', b'0', 0)
|
||||||
|
|
||||||
|
# 16-byte UUID assigned to each special token in the output tokenizer
# section; the metal runtime identifies special tokens by UUID, not by
# their string spelling.
SPECIAL_TOKEN_UUID = {
    '<|start|>': UUID('55a77c2f-8a01-4c54-8ac2-313bfc7e208d').bytes,
    '<|message|>': UUID('16e40431-f47f-4b22-b59b-8b278fc30a54').bytes,
    '<|end|>': UUID('fcac2f6d-4705-4f6b-b228-642accac7238').bytes,
    '<|return|>': UUID('f799ff69-1992-43c4-a3d8-d831f475dc75').bytes,
    '<|refusal|>': UUID('e15ba702-28c4-4292-ab8f-ffa434709128').bytes,
    '<|constrain|>': UUID('c0bb14c7-6022-49da-ad08-792d67e8b470').bytes,
    '<|channel|>': UUID('fd3dda11-c8ab-4033-876e-d93deb172c93').bytes,
    '<|call|>': UUID('1220f796-e388-4de5-b487-fe2eb5fe03c0').bytes,
    '<|untrusted|>': UUID('07d7da55-b346-4cff-8b37-7cefacf8a3e8').bytes,
    '<|endofuntrusted|>': UUID('f265bd9c-c717-469e-a447-920687d65d90').bytes,
    # NOTE: the end-of-text token deliberately has no UUID entry; its slot
    # in the table is written as 16 zero bytes.
}
|
||||||
|
# Special tokens that receive a real UUID in the output table; all other
# special ranks are emitted as 16 zero bytes (see the UUID loop in main()).
INCLUDE_SPECIAL_TOKENS = [
    "<|start|>", "<|message|>", "<|end|>", "<|return|>",
    "<|refusal|>", "<|constrain|>", "<|channel|>", "<|call|>",
    "<|untrusted|>", "<|endofuntrusted|>",
]
|
||||||
|
|
||||||
|
# Section identifiers expected by the metal runtime reader.
GPTOSS_MODEL_UUID = UUID('df52dc86-1789-4ed0-a295-66f10508145b').bytes
# BUG FIX: spell the layout UUID out directly instead of patching a truncated
# 31-hex-digit literal with str.replace('588', 'd588') — the old rewrite was
# fragile (any extra '588' occurrence would corrupt it) and obscured the value.
APPLE_GPU_LAYOUT_UUID = UUID('229177a8-5775-4268-bfd8-d588b351c56d').bytes
TIKTOKEN_TOKENIZER_UUID = UUID('7401aded-2a95-40cb-b782-9ccebaafe72b').bytes

UE8_OFFSET = 14  # bias added to MXFP4 block-scale bytes (UE8 encoding)
|
||||||
|
|
||||||
|
# --------------- IO helpers ---------------
|
||||||
|
|
||||||
|
def write_file_header(f):
    """Write the 16-byte file magic at the very start of the output."""
    f.write(FILE_MAGIC)
|
||||||
|
|
||||||
|
def write_tokenizer_header(f, num_special_tokens:int, num_text_tokens:int, regex_size:int, tokens_size:int):
    """Emit the tokenizer section header.

    Layout: 16-byte tokenizer UUID, then four little-endian u32 fields —
    special-token count, text-token count, regex byte size (incl. NUL), and
    total byte size of the text-token table.
    """
    f.write(TIKTOKEN_TOKENIZER_UUID)
    f.write(struct.pack('<4I', num_special_tokens, num_text_tokens, regex_size, tokens_size))
|
||||||
|
|
||||||
|
# model header layout: 6*U32, 1*F32, 4*U32, 6*F32 (total 17 fields)
|
||||||
|
# Keep exactly in sync with the metal runtime reader.
|
||||||
|
|
||||||
|
# model header layout: 6*U32, 1*F32, 4*U32, 6*F32 (total 17 fields)
# Keep exactly in sync with the metal runtime reader.

def write_model_header(f,
    context_length:int, num_blocks:int, num_experts:int, num_active_experts:int,
    embedding_dim:int, mlp_dim:int, swiglu_limit:float, head_dim:int, num_heads:int,
    num_kv_heads:int, attention_window:int, rope_theta:float, interpolation_scale:float,
    yarn_offset:float, yarn_scale:float, yarn_multiplier:float, rmsnorm_epsilon:float,
):
    """Write the model header: model UUID, 17 packed fields, layout UUID.

    The single '<6If4I6f' pack emits exactly the same bytes as seventeen
    individual little-endian writes would — '<' disables struct padding.
    """
    f.write(GPTOSS_MODEL_UUID)
    f.write(struct.pack(
        '<6If4I6f',
        context_length, num_blocks, num_experts, num_active_experts,
        embedding_dim, mlp_dim,
        swiglu_limit,
        head_dim, num_heads, num_kv_heads, attention_window,
        rope_theta, interpolation_scale, yarn_offset, yarn_scale,
        yarn_multiplier, rmsnorm_epsilon,
    ))
    f.write(APPLE_GPU_LAYOUT_UUID)
|
||||||
|
|
||||||
|
|
||||||
|
def write_padding(out_file, alignment_multiple=16384):
    """Zero-pad `out_file` so its offset becomes a multiple of `alignment_multiple`."""
    position = out_file.tell()
    remainder = position % alignment_multiple
    if remainder != 0:
        out_file.write(bytes(alignment_multiple - remainder))
|
||||||
|
|
||||||
|
|
||||||
|
def write_bytes_from_tensor(out_file, t: torch.Tensor, align=16):
    """Align the stream to `align` bytes, then dump the tensor's raw bytes."""
    write_padding(out_file, align)
    raw = t.view(torch.uint8).numpy().tobytes()
    out_file.write(raw)
|
||||||
|
|
||||||
|
|
||||||
|
def write_embedding_weight(out_file, weight: torch.Tensor):
    """Write the token-embedding matrix (fp8-e4m3 or bf16), 16-byte aligned."""
    allowed_dtypes = (torch.float8_e4m3fn, torch.bfloat16)
    assert weight.dtype in allowed_dtypes
    write_bytes_from_tensor(out_file, weight, 16)
|
||||||
|
|
||||||
|
|
||||||
|
def write_rmsnorm_gain(out_file, gain: torch.Tensor):
    """Write an RMSNorm gain vector; must be bf16, 16-byte aligned."""
    assert gain.dtype == torch.bfloat16
    write_bytes_from_tensor(out_file, gain, align=16)
|
||||||
|
|
||||||
|
|
||||||
|
def write_attn_sink(out_file, sink: torch.Tensor):
    """Write the per-head attention-sink values; must be bf16, 16-byte aligned."""
    assert sink.dtype == torch.bfloat16
    write_bytes_from_tensor(out_file, sink, align=16)
|
||||||
|
|
||||||
|
|
||||||
|
def write_linear_weight(out_file, *args: torch.Tensor):
    """Write one or more tensors back-to-back after a single 16-byte alignment.

    Only the first tensor is aligned; the rest follow immediately, which is
    the layout the runtime expects for weight+bias pairs.
    """
    write_padding(out_file, 16)
    for tensor in args:
        out_file.write(tensor.view(torch.uint8).numpy().tobytes())
|
||||||
|
|
||||||
|
|
||||||
|
def write_ue8_with_bias(out_file, scales: torch.Tensor, align=16, offset: int = UE8_OFFSET):
    """Write MXFP4 block scales as UE8 bytes with `offset` added.

    The addition happens in int16 and is clamped to [0, 255] before the
    narrowing cast, so the bias can never wrap around uint8.
    """
    write_padding(out_file, align)
    biased = scales.to(torch.int16).add_(int(offset)).clamp_(0, 255)
    out_file.write(biased.to(torch.uint8).numpy().tobytes())
|
||||||
|
|
||||||
|
# ---- open single or sharded safetensors (index.json) ----
|
||||||
|
# ---- open single or sharded safetensors (index.json) ----
@contextlib.contextmanager
def open_st_reader(srcdir: str):
    """Yield a reader exposing .get_tensor(key) and .keys() over `srcdir`.

    Supports either a single "model.safetensors" file or a sharded
    checkpoint described by "model.safetensors.index.json". Shard files are
    opened lazily, at most once each, and every handle is closed on exit.

    Raises FileNotFoundError when neither layout is present.
    """
    single = os.path.join(srcdir, "model.safetensors")
    index = os.path.join(srcdir, "model.safetensors.index.json")

    if os.path.exists(single):
        with safe_open(single, framework="pt", device="cpu") as f:
            class R:
                def get_tensor(self, k): return f.get_tensor(k)
                def keys(self): return list(f.keys())
            yield R()
        return

    if os.path.exists(index):
        # BUG FIX: read the index through a context manager instead of a
        # bare open() so the file handle is not leaked.
        with open(index, "r") as fh:
            idx = json.load(fh)
        wm = idx["weight_map"]
        with ExitStack() as stack:
            cache = {}
            def get_handle(relpath):
                # Open each shard once; the ExitStack keeps it alive and
                # closes it when the context manager exits.
                if relpath not in cache:
                    cache[relpath] = stack.enter_context(
                        safe_open(os.path.join(srcdir, relpath), framework="pt", device="cpu")
                    )
                return cache[relpath]
            class R:
                def get_tensor(self, k):
                    return get_handle(wm[k]).get_tensor(k)
                def keys(self):
                    return list(wm.keys())
            yield R()
        return

    raise FileNotFoundError("Neither model.safetensors nor model.safetensors.index.json found")
|
||||||
|
|
||||||
|
# --------------- main ---------------
|
||||||
|
|
||||||
|
_PICK_MISSING = object()  # sentinel: lets callers pass default=None explicitly

def pick(config, *keys, default=_PICK_MISSING):
    """Return the first value found in `config` for any of the given keys.

    Each key may be a dotted path (e.g. "rope_scaling.factor"); traversal
    follows nested dicts. Falls back to `default` when no key resolves.

    BUG FIX: the old `if default is not None` check made default=None
    indistinguishable from "no default"; a sentinel now separates the two.

    Raises KeyError when every key is missing and no default was supplied.
    """
    for key in keys:
        node = config
        found = True
        for part in key.split('.'):
            if isinstance(node, dict) and part in node:
                node = node[part]
            else:
                found = False
                break
        if found:
            return node
    if default is not _PICK_MISSING:
        return default
    raise KeyError(f"Missing any of keys: {keys}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_first(src, cands):
    """Fetch the first tensor among the candidate key names in `cands`.

    Returns (tensor, matched_key). Raises KeyError — chained from the last
    lookup failure and listing a sample of available keys — when no
    candidate resolves.
    """
    failure = None
    for name in cands:
        try:
            tensor = src.get_tensor(name)
        except KeyError as exc:
            failure = exc
            continue
        return tensor, name
    sample = list(src.keys())[:20]
    raise KeyError(f"none of {cands}; sample keys: {sample}") from failure
|
||||||
|
|
||||||
|
|
||||||
|
def get_attn_qkv(src, n, num_q_heads, num_kv_heads, head_dim):
    """Load the attention QKV projection for block `n`.

    Prefers the fused "block.N.attn.qkv.{weight,bias}" layout; otherwise
    concatenates split q/k/v projections (native or HF-style names) along
    the output dimension and synthesizes a zero bias when none is stored.
    Returns (weight, bias, tag) with tag "qkv" (fused) or "qkv_split".
    """
    # Fused layout first.
    try:
        fused_w = src.get_tensor(f"block.{n}.attn.qkv.weight")
        fused_b = src.get_tensor(f"block.{n}.attn.qkv.bias")
    except KeyError:
        pass
    else:
        return fused_w, fused_b, "qkv"

    # Split projections: stack q, k, v outputs on top of each other.
    split_weights = [
        get_tensor_first(src, [f"block.{n}.attn.{p}_proj.weight",
                               f"model.layers.{n}.self_attn.{p}_proj.weight"])[0]
        for p in ("q", "k", "v")
    ]
    w = torch.cat(split_weights, dim=0).contiguous()
    try:
        split_biases = [
            get_tensor_first(src, [f"block.{n}.attn.{p}_proj.bias",
                                   f"model.layers.{n}.self_attn.{p}_proj.bias"])[0]
            for p in ("q", "k", "v")
        ]
        b = torch.cat(split_biases, dim=0).contiguous()
    except KeyError:
        # No stored bias: fall back to zeros matching the fused width/dtype.
        b = torch.zeros(w.shape[0], dtype=w.dtype)
    return w, b, "qkv_split"
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv):
    """Convert a GPT-OSS safetensors checkpoint into a single Metal .bin.

    Reads hyperparameters from config.json (with fallbacks for common
    finetune naming), then streams everything into `--dst` in the exact
    order the metal runtime reader expects: file magic, model header,
    tokenizer header, special-token UUID table, split regex, text tokens,
    embedding, per-block attention/router weights, final norm, unembedding,
    and finally the per-expert MoE tensors.
    """
    opt = parser.parse_args(argv)

    # NOTE(review): bare open() leaks the handle until GC — acceptable in a
    # one-shot script, but a `with` block would be tidier.
    config = json.load(open(os.path.join(opt.src, "config.json"), "r"))

    # ---- hyperparams (with fallbacks for popular finetune configs) ----
    num_blocks = int(pick(config, "num_hidden_layers"))
    num_experts = int(pick(config, "num_experts", "num_local_experts"))
    num_active_experts = int(pick(config, "num_active_experts", "experts_per_token", "num_experts_per_tok", default=4))
    num_q_heads = int(pick(config, "num_attention_heads"))
    num_kv_heads = int(pick(config, "num_key_value_heads"))
    head_dim = int(pick(config, "head_dim"))
    embedding_dim = int(pick(config, "hidden_size"))
    mlp_dim = int(pick(config, "intermediate_size"))
    swiglu_limit = float(config.get("swiglu_limit", 7.0))
    rope_theta = float(pick(config, "rope_theta"))
    attention_window = int(pick(config, "sliding_window"))
    initial_context_length = int(pick(config, "initial_context_length"))
    # rope_scaling may be a nested dict (HF style) or flat top-level keys.
    rope_scaling = config.get("rope_scaling")
    if isinstance(rope_scaling, dict):
        rope_scaling_factor = float(rope_scaling.get("factor", 1.0))
        rope_ntk_alpha = float(rope_scaling.get("ntk_alpha", 1.0))
        rope_ntk_beta = float(rope_scaling.get("ntk_beta", 32.0))
    else:
        rope_scaling_factor = float(pick(config, "rope_scaling_factor", default=1.0))
        rope_ntk_alpha = float(pick(config, "rope_ntk_alpha", default=1.0))
        rope_ntk_beta = float(pick(config, "rope_ntk_beta", default=32.0))

    # ---- tokenizer sizes ----
    # Pre-compute the byte size of the text-token table so the tokenizer
    # header (written first) can carry the correct totals.
    tokens_size = 0
    num_text_tokens = 0
    for t in range(o200k_gptoss.n_vocab):
        if not harmony_encoding.is_special_token(t):
            tb = o200k_gptoss.decode_single_token_bytes(t)
            assert len(tb) > 0
            tokens_size += len(tb) + 2  # uint16 length + data
            num_text_tokens += 1
    num_included_tokens = o200k_gptoss.n_vocab
    print(f"[tokenizer] vocab={num_included_tokens} text={num_text_tokens} special={num_included_tokens - num_text_tokens}")

    with open(opt.dst, "wb") as dst:
        with open_st_reader(opt.src) as src:
            write_file_header(dst)

            # YARN params derived from NTK bounds.
            # NOTE(review): these look like the YaRN "NTK-by-parts" interval
            # bounds — confirm against the runtime's RoPE implementation.
            yarn_low = (head_dim/2) * math.log(initial_context_length / (rope_ntk_beta * 2 * math.pi)) / math.log(rope_theta)
            yarn_high = (head_dim/2) * math.log(initial_context_length / (rope_ntk_alpha * 2 * math.pi)) / math.log(rope_theta)

            write_model_header(
                dst,
                context_length=int(initial_context_length * rope_scaling_factor),
                num_blocks=num_blocks,
                num_experts=num_experts,
                num_active_experts=num_active_experts,
                embedding_dim=embedding_dim,
                mlp_dim=mlp_dim,
                swiglu_limit=swiglu_limit,
                head_dim=head_dim,
                num_heads=num_q_heads,
                num_kv_heads=num_kv_heads,
                attention_window=attention_window,
                rope_theta=rope_theta,
                interpolation_scale=1.0/rope_scaling_factor,
                yarn_offset=-yarn_low/(yarn_high-yarn_low),
                yarn_scale=1.0/(yarn_high-yarn_low),
                yarn_multiplier=0.1*math.log(rope_scaling_factor)+1.0,
                rmsnorm_epsilon=1e-5,
            )

            write_tokenizer_header(
                dst,
                num_special_tokens=num_included_tokens - num_text_tokens,
                num_text_tokens=num_text_tokens,
                regex_size=len(o200k_gptoss._pat_str.encode("ascii")) + 1,  # +1 for the NUL terminator
                tokens_size=tokens_size,
            )

            # UUID table for special tokens [num_text .. vocab): recognized
            # tokens get their 16-byte UUID, everything else 16 zero bytes.
            for token_idx in range(num_text_tokens, num_included_tokens):
                token = o200k_gptoss.decode_single_token_bytes(token_idx).decode('ascii', errors='ignore')
                if token in INCLUDE_SPECIAL_TOKENS:
                    dst.write(SPECIAL_TOKEN_UUID[token])
                else:
                    dst.write(bytes(16))

            # regex + NUL
            dst.write(o200k_gptoss._pat_str.encode("ascii"))
            dst.write(struct.pack('B', 0))

            # text tokens: uint16 length prefix followed by raw bytes
            written = 0
            for t in range(num_text_tokens):
                tb = o200k_gptoss.decode_single_token_bytes(t)
                dst.write(struct.pack('<H', len(tb)))
                dst.write(tb)
                written += len(tb) + 2
            # Sanity check against the size already committed to the header.
            assert written == tokens_size
            write_padding(dst)

            # ---- embedding ----
            emb, emb_key = get_tensor_first(src, [
                "embedding.weight", "model.embed_tokens.weight", "embed_tokens.weight", "tok_embeddings.weight", "model.wte.weight"
            ])
            print(f"[ok] embedding: {emb_key} shape={tuple(emb.shape)}")
            # Drop rows beyond the included vocab (finetunes may pad the table).
            emb = emb[:min(emb.shape[0], num_included_tokens), :]
            write_embedding_weight(dst, emb)

            # ---- blocks (attention + router weights, in reader order) ----
            for n in tqdm(range(num_blocks), desc="blocks"):
                attn_norm, key = get_tensor_first(src, [
                    f"block.{n}.attn.norm.scale",
                    f"model.layers.{n}.input_layernorm.weight",
                ])
                if n == 0: print(f"[ok] attn norm: {key}")
                write_rmsnorm_gain(dst, attn_norm)

                qkv_w, qkv_b, src_kind = get_attn_qkv(src, n, num_q_heads, num_kv_heads, head_dim)
                if n == 0: print(f"[ok] attn qkv source: {src_kind}")

                # Re-layout Q/K in place for the metal kernel and bake the
                # attention scale into the weights.
                # NOTE(review): the (2, head_dim//2) view + transpose appears
                # to de-interleave rotary halves — confirm against the
                # runtime kernel before changing anything here.
                for qkv in (qkv_w, qkv_b):
                    qk = qkv[:head_dim*(num_q_heads+num_kv_heads), ...].contiguous()
                    v = qkv[head_dim*(num_q_heads+num_kv_heads):, ...].contiguous()
                    qk = qk.view(num_q_heads+num_kv_heads, 2, head_dim//2, -1).transpose(1,2).reshape(num_q_heads+num_kv_heads, head_dim, -1)
                    q = qk[:num_q_heads, ...]
                    k = qk[num_q_heads:, ...]
                    # The 0.5 / 0.25 factors are only valid for head_dim 64.
                    assert head_dim == 64, "assumes head_dim==64 for baked scale"
                    q *= 0.5; k *= 0.25
                    v = v.view(num_kv_heads, head_dim, -1)
                    qkv.copy_(torch.cat((q, k, v), dim=0).reshape(*qkv.shape))

                write_linear_weight(dst, qkv_w, qkv_b)

                sinks, sinks_key = get_tensor_first(src, [
                    f"block.{n}.attn.sinks",
                    f"model.layers.{n}.self_attn.sinks",
                ])
                if n == 0: print(f"[ok] attn sinks: {sinks_key}")
                write_attn_sink(dst, sinks)

                attn_out_w,_ = get_tensor_first(src, [
                    f"block.{n}.attn.out.weight",
                    f"model.layers.{n}.self_attn.o_proj.weight",
                ])
                try:
                    attn_out_b,_ = get_tensor_first(src, [
                        f"block.{n}.attn.out.bias",
                        f"model.layers.{n}.self_attn.o_proj.bias",
                    ])
                except KeyError:
                    # Checkpoints without a stored output bias get zeros.
                    attn_out_b = torch.zeros(attn_out_w.shape[0], dtype=attn_out_w.dtype)
                write_linear_weight(dst, attn_out_w, attn_out_b)

                mlp_norm,_ = get_tensor_first(src, [
                    f"block.{n}.mlp.norm.scale",
                    f"model.layers.{n}.post_attention_layernorm.weight",
                ])
                if n == 0: print(f"[ok] mlp norm")
                write_rmsnorm_gain(dst, mlp_norm)

                router_w,_ = get_tensor_first(src, [
                    f"block.{n}.mlp.gate.weight",  # router logits
                    f"model.layers.{n}.mlp.router.weight",
                ])
                try:
                    router_b,_ = get_tensor_first(src, [
                        f"block.{n}.mlp.gate.bias",
                        f"model.layers.{n}.mlp.router.bias",
                    ])
                except KeyError:
                    router_b = torch.zeros(router_w.shape[0], dtype=router_w.dtype)
                write_linear_weight(dst, router_w, router_b)

            final_norm,_ = get_tensor_first(src, ["norm.scale", "model.norm.weight"])
            print(f"[ok] final norm")
            write_rmsnorm_gain(dst, final_norm)

            unemb, unemb_key = get_tensor_first(src, ["unembedding.weight", "lm_head.weight", "model.lm_head.weight"])
            print(f"[ok] unembedding: {unemb_key} shape={tuple(unemb.shape)}")
            unemb = unemb[:min(unemb.shape[0], num_included_tokens), :]
            write_linear_weight(dst, unemb)

            # ---- MoE (per-expert grouped); support native + Jinx naming ----
            for n in tqdm(range(num_blocks), desc="experts"):
                try:
                    mlp1_blocks,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_weight.blocks"])
                    mlp1_scales,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_weight.scales"])
                    mlp1_bias, _ = get_tensor_first(src, [f"block.{n}.mlp.mlp1_bias"])
                    mlp2_blocks,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_weight.blocks"])
                    mlp2_scales,_ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_weight.scales"])
                    mlp2_bias, _ = get_tensor_first(src, [f"block.{n}.mlp.mlp2_bias"])
                except KeyError:
                    # Jinx naming: fused gate+up becomes "mlp1"; down is "mlp2"
                    gate_up_blk,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_blocks"])
                    gate_up_scl,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_scales"])
                    gate_up_bia,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.gate_up_proj_bias"])

                    down_blk,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_blocks"])
                    down_scl,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_scales"])
                    down_bia,_ = get_tensor_first(src, [f"model.layers.{n}.mlp.experts.down_proj_bias"])

                    mlp1_blocks = gate_up_blk
                    mlp1_scales = gate_up_scl
                    mlp1_bias = gate_up_bia

                    mlp2_blocks = down_blk
                    mlp2_scales = down_scl
                    mlp2_bias = down_bia

                # Leading dim of the MXFP4 tensors is the expert index.
                assert mlp1_blocks.shape[0] == mlp2_blocks.shape[0] == num_experts, \
                    f"experts dim mismatch at block {n}: {mlp1_blocks.shape} vs {mlp2_blocks.shape}"

                write_padding(dst)
                # Per expert: blocks, scales (UE8-biased), bias — mlp1 then mlp2.
                for e in range(num_experts):
                    write_bytes_from_tensor(dst, mlp1_blocks[e, ...], 16)
                    write_ue8_with_bias(dst, mlp1_scales[e, ...], 16)
                    write_bytes_from_tensor(dst, mlp1_bias[e, ...], 16)
                    write_bytes_from_tensor(dst, mlp2_blocks[e, ...], 16)
                    write_ue8_with_bias(dst, mlp2_scales[e, ...], 16)
                    write_bytes_from_tensor(dst, mlp2_bias[e, ...], 16)

    print(f"[done] Wrote {opt.dst}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: forward argv (minus the program name) to main().
    main(sys.argv[1:])
|
||||||
|
|
||||||
|
"""
|
||||||
|
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py \
|
||||||
|
-s /Volumes/long990max/gpustack_data/huihui-ai/Huihui-gpt-oss-20b-mxfp4-abliterated \
|
||||||
|
-d /Volumes/long990max/project/openharmony-mlx/model.bin
|
||||||
|
|
||||||
|
python /Volumes/long990max/project/openharmony-mlx/gpt_oss/metal/scripts/create-local-modelnew.py \
|
||||||
|
-s /Volumes/long990max/gpustack_data/huizimao/gpt-oss-20b-uncensored-mxfp4 \
|
||||||
|
-d /Volumes/long990max/project/openharmony-mlx/model.bin
|
||||||
|
|
||||||
|
For fewer refusals, choose huizimao/gpt-oss-20b-uncensored-mxfp4.
|
||||||
|
"""
|
||||||
Reference in New Issue
Block a user