import argparse import json from datetime import datetime from . import report from .gpqa_eval import GPQAEval from .aime_eval import AIME25Eval from .healthbench_eval import HealthBenchEval from .chat_completion_sampler import ( OPENAI_SYSTEM_MESSAGE_API, ChatCompletionSampler, ) from .responses_sampler import ResponsesSampler def main(): parser = argparse.ArgumentParser( description="Evaluate the models.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--list-models", action="store_true", help="List available models" ) parser.add_argument( "--model", type=str, help="Select a model by name. Also accepts a comma-separated list of models.", ) parser.add_argument( "--base-url", type=str, default="http://localhost:8000/v1", help="Base URL for the API.", ) parser.add_argument( "--eval", type=str, default="gpqa,healthbench,healthbench_hard,healthbench_consensus,aime25", help="Select an eval by name. Also accepts a comma-separated list of evals.", ) parser.add_argument( "--temperature", type=float, default=1.0, help="Sampling temperature", ) parser.add_argument( "--n-threads", type=int, default=1584, help="Number of threads to run.", ) parser.add_argument( "--debug", action="store_true", help="Run in debug mode" ) parser.add_argument( "--examples", type=int, help="Number of examples to use (overrides default)" ) args = parser.parse_args() models = { "120b-low": ResponsesSampler( model="gpt-oss-120b", reasoning_model=True, reasoning_effort="low", temperature=args.temperature, base_url=args.base_url, ), "120b": ResponsesSampler( model="gpt-oss-120b", reasoning_model=True, reasoning_effort="medium", temperature=args.temperature, base_url=args.base_url, ), "120b-high": ResponsesSampler( model="gpt-oss-120b", reasoning_model=True, reasoning_effort="high", temperature=args.temperature, base_url=args.base_url, ), "20b-low": ResponsesSampler( model="gpt-oss-20b", reasoning_model=True, reasoning_effort="low", temperature=args.temperature, base_url=args.base_url, ), "20b": ResponsesSampler( model="gpt-oss-20b", reasoning_model=True, reasoning_effort="medium", temperature=args.temperature, base_url=args.base_url, ), "20b-high": ResponsesSampler( model="gpt-oss-20b", reasoning_model=True, reasoning_effort="high", temperature=args.temperature, base_url=args.base_url, ), } if args.list_models: print("Available models:") for model_name in models.keys(): print(f" - {model_name}") return if args.model: models_chosen = args.model.split(",") for model_name in models_chosen: if model_name not in models: print(f"Error: Model '{model_name}' not found.") return models = {model_name: models[model_name] for model_name in models_chosen} print(f"Running with args {args}") grading_sampler = ChatCompletionSampler( model="gpt-4.1-2025-04-14", system_message=OPENAI_SYSTEM_MESSAGE_API, max_tokens=2048, ) def get_evals(eval_name, debug_mode): num_examples = ( args.examples if args.examples is not None else (5 if debug_mode else None) ) # Set num_examples = None to reproduce full evals match eval_name: case "gpqa": return GPQAEval( n_repeats=8, num_examples=num_examples, debug=debug_mode, n_threads=args.n_threads or 1, ) case "healthbench": return HealthBenchEval( grader_model=grading_sampler, num_examples=10 if debug_mode else num_examples, n_repeats=1, n_threads=args.n_threads or 1, subset_name=None, ) case "healthbench_hard": return HealthBenchEval( grader_model=grading_sampler, num_examples=10 if debug_mode else num_examples, n_repeats=1, n_threads=args.n_threads or 1, subset_name="hard", ) case "healthbench_consensus": return HealthBenchEval( grader_model=grading_sampler, num_examples=10 if debug_mode else num_examples, n_repeats=1, n_threads=args.n_threads or 1, subset_name="consensus", ) case "aime25": return AIME25Eval( n_repeats=8, num_examples=num_examples, n_threads=args.n_threads or 1, ) case _: raise Exception(f"Unrecognized eval type: {eval_name}") evals_list = args.eval.split(",") evals = {} for eval_name in evals_list: evals[eval_name] = get_evals(eval_name, args.debug) print(evals) debug_suffix = "_DEBUG" if args.debug else "" print(debug_suffix) mergekey2resultpath = {} print(f"Running the following evals: {list(evals.keys())}") print(f"Running evals for the following models: {list(models.keys())}") now = datetime.now() date_str = now.strftime("%Y%m%d_%H%M%S") for model_name, sampler in models.items(): for eval_name, eval_obj in evals.items(): result = eval_obj(sampler) # ^^^ how to use a sampler file_stem = f"{eval_name}_{model_name}_temp{args.temperature}" # file stem should also include the year, month, day, and time in hours and minutes file_stem += f"_{date_str}" report_filename = f"/tmp/{file_stem}{debug_suffix}.html" print(f"Writing report to {report_filename}") with open(report_filename, "w") as fh: fh.write(report.make_report(result)) assert result.metrics is not None metrics = result.metrics | {"score": result.score} # Sort metrics by key metrics = dict(sorted(metrics.items())) print(metrics) result_filename = f"/tmp/{file_stem}{debug_suffix}.json" with open(result_filename, "w") as f: f.write(json.dumps(metrics, indent=2)) print(f"Writing results to {result_filename}") full_result_filename = f"/tmp/{file_stem}{debug_suffix}_allresults.json" with open(full_result_filename, "w") as f: result_dict = { "score": result.score, "metrics": result.metrics, "htmls": result.htmls, "convos": result.convos, "metadata": result.metadata, } f.write(json.dumps(result_dict, indent=2)) print(f"Writing all results to {full_result_filename}") mergekey2resultpath[f"{file_stem}"] = result_filename merge_metrics = [] for eval_model_name, result_filename in mergekey2resultpath.items(): try: result = json.load(open(result_filename, "r+")) except Exception as e: print(e, result_filename) continue result = result.get("f1_score", result.get("score", None)) eval_name = eval_model_name[: eval_model_name.find("_")] model_name = eval_model_name[eval_model_name.find("_") + 1 :] merge_metrics.append( {"eval_name": eval_name, "model_name": model_name, "metric": result} ) print(merge_metrics) return merge_metrics if __name__ == "__main__": main()