first commit

This commit is contained in:
2026-03-02 23:22:33 +08:00
parent 1c5822d16b
commit c5ae56c463
22 changed files with 606 additions and 462 deletions

View File

@@ -22,6 +22,8 @@ BASE_URL = (
"examples/calibration/calibration_data.txt"
)
# Matches a block separator: a newline, any run of whitespace (which may
# itself contain further newlines), then another newline -- i.e. one or more
# blank or whitespace-only lines. Presumably consumed by split_blocks().
BLOCK_SPLIT_RE = re.compile(r"\n\s*\n")
# Directory that contains this script, as a resolved absolute path.
SCRIPT_DIR = Path(__file__).resolve().parent
# Parent of the script directory; relative command-line paths are anchored
# here rather than at the current working directory.
ROOT_DIR = SCRIPT_DIR.parent
def split_blocks(text: str) -> list[str]:
@@ -130,15 +132,21 @@ def ensure_cached_blocks(
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--base-file", default="calibration_data_v5_rc.txt")
parser.add_argument("--output", default="calibration_data_v5_rc_code.txt")
parser.add_argument("--data-dir", default="data")
parser.add_argument("--base-file", default="calibration/calibration_data_v5_rc.txt")
parser.add_argument("--output", default="calibration/calibration_data_v5_rc_code.txt")
parser.add_argument("--data-dir", default="calibration/sources")
parser.add_argument("--force-refresh", action="store_true")
args = parser.parse_args()
base_file = Path(args.base_file)
output_file = Path(args.output)
data_dir = Path(args.data_dir)
def resolve_path(path_text: str) -> Path:
    """Turn a command-line path string into a concrete ``Path``.

    Absolute paths are returned unchanged; anything else is interpreted
    relative to ``ROOT_DIR`` so the script behaves the same regardless of
    the directory it is launched from.
    """
    candidate = Path(path_text)
    return candidate if candidate.is_absolute() else ROOT_DIR / candidate
base_file = resolve_path(args.base_file)
output_file = resolve_path(args.output)
data_dir = resolve_path(args.data_dir)
code_cache = data_dir / "code74k_2000.txt"
openhermes_cache = data_dir / "openhermes_coding_chosen_1000.txt"