chore: bootstrap reusable quantization template workspace
This commit is contained in:
187
scripts/prepare_calib_data.py
Normal file
187
scripts/prepare_calib_data.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
"""
Prepare calibration_data_v5_rc_code.txt with exact composition:
- base: 1152 blocks from calibration_data_v5_rc.txt
- code: 2000 blocks from QuixiAI/Code-74k-ShareGPT-Vicuna
- pref: 1000 blocks from alvarobartt/openhermes-preferences-coding
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
# Upstream llama.cpp calibration data used as the "base" portion of the mix.
BASE_URL = (
    "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/"
    "examples/calibration/calibration_data.txt"
)
# Calibration "blocks" are separated by one or more blank (or whitespace-only) lines.
BLOCK_SPLIT_RE = re.compile(r"\n\s*\n")
|
||||
|
||||
|
||||
def split_blocks(text: str) -> list[str]:
    """Split *text* on blank-line boundaries, returning stripped, non-empty chunks."""
    return [chunk.strip() for chunk in BLOCK_SPLIT_RE.split(text) if chunk.strip()]
|
||||
|
||||
|
||||
def read_blocks(path: Path) -> list[str]:
    """Load *path* as UTF-8 (ignoring decode errors) and return its blocks."""
    text = path.read_text(encoding="utf-8", errors="ignore")
    return split_blocks(text)
|
||||
|
||||
|
||||
def write_blocks(path: Path, blocks: list[str]) -> None:
    """Write *blocks* to *path*, blank-line separated, with one trailing newline."""
    body = "\n\n".join(blocks).strip()
    path.write_text(body + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def ensure_base_file(path: Path) -> None:
    """Fetch the base calibration file via wget if *path* is not already present.

    Raises subprocess.CalledProcessError when the download fails (check=True);
    the __main__ guard reports that case.
    """
    if path.exists():
        return
    download_cmd = ["wget", BASE_URL, "-O", str(path)]
    print("Downloading base calibration file:")
    print(" ", " ".join(download_cmd))
    subprocess.run(download_cmd, check=True)
|
||||
|
||||
|
||||
def pick_blocks(blocks: list[str], target: int, seed: int) -> list[str]:
    """Deterministically sample *target* blocks via a seeded shuffle.

    Raises ValueError when fewer than *target* blocks are available.
    """
    if len(blocks) < target:
        raise ValueError(f"Need {target} blocks but only got {len(blocks)}.")
    # Shuffling a copy of the list performs the same seed-driven swap sequence
    # as shuffling an index list, so the selection is reproducible.
    pool = list(blocks)
    random.Random(seed).shuffle(pool)
    return pool[:target]
|
||||
|
||||
|
||||
def build_code74k_blocks(target: int, seed: int) -> list[str]:
    """Build *target* blocks from QuixiAI/Code-74k-ShareGPT-Vicuna.

    Rows are visited in a seed-determined random order; each row's non-empty
    conversation values are joined into one newline-separated block.
    Raises RuntimeError when the dataset cannot supply *target* blocks.
    """
    dataset = load_dataset("QuixiAI/Code-74k-ShareGPT-Vicuna", split="train")
    order = list(range(len(dataset)))
    random.Random(seed).shuffle(order)

    blocks: list[str] = []
    for row_idx in order:
        messages = dataset[row_idx].get("conversations") or []
        texts = []
        for message in messages:
            text = (message.get("value") or "").strip()
            if text:
                texts.append(text)
        if texts:
            blocks.append("\n".join(texts))
        if len(blocks) >= target:
            break

    if len(blocks) < target:
        raise RuntimeError(
            f"Code-74k yielded only {len(blocks)} valid blocks, target is {target}."
        )
    return blocks
|
||||
|
||||
|
||||
def build_openhermes_blocks(target: int, seed: int) -> list[str]:
    """Build *target* blocks from alvarobartt/openhermes-preferences-coding.

    Uses the "chosen" preference messages; rows are visited in a random order
    seeded with ``seed + 1`` so this stream differs from the Code-74k one.
    Raises RuntimeError when the dataset cannot supply *target* blocks.
    """
    dataset = load_dataset("alvarobartt/openhermes-preferences-coding", split="train")
    order = list(range(len(dataset)))
    random.Random(seed + 1).shuffle(order)

    blocks: list[str] = []
    for row_idx in order:
        messages = dataset[row_idx].get("chosen") or []
        texts = []
        for message in messages:
            text = (message.get("content") or "").strip()
            if text:
                texts.append(text)
        if texts:
            blocks.append("\n".join(texts))
        if len(blocks) >= target:
            break

    if len(blocks) < target:
        raise RuntimeError(
            f"OpenHermes yielded only {len(blocks)} valid blocks, target is {target}."
        )
    return blocks
|
||||
|
||||
|
||||
def ensure_cached_blocks(
    cache_path: Path,
    target: int,
    build_fn,
    seed: int,
) -> list[str]:
    """Return *target* blocks, serving from *cache_path* when possible.

    A cache holding at least *target* blocks is truncated and returned as-is;
    otherwise the blocks are rebuilt via *build_fn* and written back to cache.
    """
    if cache_path.exists():
        cached = read_blocks(cache_path)
        if len(cached) >= target:
            return cached[:target]
        # Cache is stale/short — fall through and rebuild from the source dataset.
        print(
            f"{cache_path} has {len(cached)} blocks (< {target}), rebuilding from source."
        )

    fresh = build_fn(target, seed)
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    write_blocks(cache_path, fresh)
    return fresh
|
||||
|
||||
|
||||
def main() -> int:
    """Assemble the merged calibration file and print a composition report.

    Returns 0 on success; process exit status is handled by the caller.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--base-file", default="calibration_data_v5_rc.txt")
    parser.add_argument("--output", default="calibration_data_v5_rc_code.txt")
    parser.add_argument("--data-dir", default="data")
    parser.add_argument("--force-refresh", action="store_true")
    args = parser.parse_args()

    base_file = Path(args.base_file)
    output_file = Path(args.output)
    cache_dir = Path(args.data_dir)
    code_cache = cache_dir / "code74k_2000.txt"
    openhermes_cache = cache_dir / "openhermes_coding_chosen_1000.txt"

    # --force-refresh drops the dataset caches so they are rebuilt from source.
    if args.force_refresh:
        for cache_file in (code_cache, openhermes_cache):
            if cache_file.exists():
                cache_file.unlink()

    ensure_base_file(base_file)
    base_blocks = pick_blocks(read_blocks(base_file), target=1152, seed=args.seed)

    code_blocks = ensure_cached_blocks(
        cache_path=code_cache,
        target=2000,
        build_fn=build_code74k_blocks,
        seed=args.seed,
    )
    openhermes_blocks = ensure_cached_blocks(
        cache_path=openhermes_cache,
        target=1000,
        build_fn=build_openhermes_blocks,
        seed=args.seed,
    )

    merged = base_blocks + code_blocks + openhermes_blocks
    write_blocks(output_file, merged)

    print("Done.")
    print(f"base blocks: {len(base_blocks)} ({base_file})")
    print(f"code blocks: {len(code_blocks)} (QuixiAI/Code-74k-ShareGPT-Vicuna)")
    print(
        f"openhermes blocks: {len(openhermes_blocks)} (alvarobartt/openhermes-preferences-coding)"
    )
    print(f"total blocks: {len(merged)}")
    print(f"output: {output_file}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    try:
        exit_code = main()
    except subprocess.CalledProcessError as exc:
        # Surface download-command failures concisely, then re-raise so the
        # process still exits with a traceback/non-zero status.
        print(f"Command failed with exit code {exc.returncode}", file=sys.stderr)
        raise
    raise SystemExit(exit_code)
|
||||
25
scripts/upload_to_modelscope.sh
Executable file
25
scripts/upload_to_modelscope.sh
Executable file
@@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Usage:
#   ./upload_to_modelscope.sh <repo_id> <token>
# Example:
#   ./upload_to_modelscope.sh your_username/your_repo_name ms-xxxxxxxx

REPO_ID="${1:-}"
TOKEN="${2:-}"

# Both arguments are required and must be non-empty.
if [[ -z "${REPO_ID}" ]] || [[ -z "${TOKEN}" ]]; then
  echo "Usage: $0 <repo_id> <token>"
  exit 1
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
MODELSCOPE_BIN="${ROOT_DIR}/.venv/bin/modelscope"

"${MODELSCOPE_BIN}" login --token "${TOKEN}"
# NOTE(review): this uploads the directory containing this script; confirm the
# quantized weights really live alongside it rather than in the repo root.
"${MODELSCOPE_BIN}" upload "${REPO_ID}" "${SCRIPT_DIR}" . \
  --repo-type model \
  --commit-message "Upload Qwen3.5-27B quantized GGUF weights"

echo "Upload finished: ${REPO_ID}"
|
||||
Reference in New Issue
Block a user