openharmony-mlx/gpt_oss/evals/__main__.py

import argparse
import json
from datetime import datetime

from . import report
from .gpqa_eval import GPQAEval
from .aime_eval import AIME25Eval
from .healthbench_eval import HealthBenchEval
from .chat_completion_sampler import (
    OPENAI_SYSTEM_MESSAGE_API,
    ChatCompletionSampler,
)
from .responses_sampler import ResponsesSampler
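

# Example invocations (a sketch; assumes the package root is on PYTHONPATH and
# an OpenAI-compatible server is already running at --base-url):
#   python -m gpt_oss.evals --list-models
#   python -m gpt_oss.evals --model 20b --eval aime25 --debug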
def main():
    parser = argparse.ArgumentParser(
        description="Evaluate the models.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--list-models", action="store_true", help="List available models"
    )
    parser.add_argument(
        "--model",
        type=str,
        help="Select a model by name. Also accepts a comma-separated list of models.",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8000/v1",
        help="Base URL for the API.",
    )
    parser.add_argument(
        "--eval",
        type=str,
        default="gpqa,healthbench,healthbench_hard,healthbench_consensus,aime25",
        help="Select an eval by name. Also accepts a comma-separated list of evals.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Sampling temperature",
    )
    parser.add_argument(
        "--n-threads",
        type=int,
        default=1584,
        help="Number of threads to run.",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Run in debug mode"
    )
    parser.add_argument(
        "--examples", type=int, help="Number of examples to use (overrides default)"
    )
    args = parser.parse_args()
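
    # Model registry: keys are "<size>" or "<size>-<effort>"; a bare size
    # (e.g. "20b") means medium reasoning effort. Every entry points at the
    # same --base-url server.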
    models = {
        "120b-low": ResponsesSampler(
            model="gpt-oss-120b",
            reasoning_model=True,
            reasoning_effort="low",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "120b": ResponsesSampler(
            model="gpt-oss-120b",
            reasoning_model=True,
            reasoning_effort="medium",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "120b-high": ResponsesSampler(
            model="gpt-oss-120b",
            reasoning_model=True,
            reasoning_effort="high",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "20b-low": ResponsesSampler(
            model="gpt-oss-20b",
            reasoning_model=True,
            reasoning_effort="low",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "20b": ResponsesSampler(
            model="gpt-oss-20b",
            reasoning_model=True,
            reasoning_effort="medium",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "20b-high": ResponsesSampler(
            model="gpt-oss-20b",
            reasoning_model=True,
            reasoning_effort="high",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
    }
    if args.list_models:
        print("Available models:")
        for model_name in models.keys():
            print(f" - {model_name}")
        return

    if args.model:
        models_chosen = args.model.split(",")
        for model_name in models_chosen:
            if model_name not in models:
                print(f"Error: Model '{model_name}' not found.")
                return
        models = {model_name: models[model_name] for model_name in models_chosen}

    print(f"Running with args {args}")
    grading_sampler = ChatCompletionSampler(
        model="gpt-4.1-2025-04-14",
        system_message=OPENAI_SYSTEM_MESSAGE_API,
        max_tokens=2048,
    )

    def get_evals(eval_name, debug_mode):
        num_examples = (
            args.examples if args.examples is not None else (5 if debug_mode else None)
        )
        # Set num_examples = None to reproduce full evals
        match eval_name:
            case "gpqa":
                return GPQAEval(
                    n_repeats=8,
                    num_examples=num_examples,
                    debug=debug_mode,
                    n_threads=args.n_threads or 1,
                )
            case "healthbench":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name=None,
                )
            case "healthbench_hard":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name="hard",
                )
            case "healthbench_consensus":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name="consensus",
                )
            case "aime25":
                return AIME25Eval(
                    n_repeats=8,
                    num_examples=num_examples,
                    n_threads=args.n_threads or 1,
                )
            case _:
                raise ValueError(f"Unrecognized eval type: {eval_name}")
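
    # Each requested eval name becomes one eval instance, e.g.
    # --eval gpqa,aime25 -> {"gpqa": GPQAEval(...), "aime25": AIME25Eval(...)}.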
    evals_list = args.eval.split(",")
    evals = {}
    for eval_name in evals_list:
        evals[eval_name] = get_evals(eval_name, args.debug)
    print(evals)

    debug_suffix = "_DEBUG" if args.debug else ""
    print(debug_suffix)
    mergekey2resultpath = {}
    print(f"Running the following evals: {list(evals.keys())}")
    print(f"Running evals for the following models: {list(models.keys())}")

    now = datetime.now()
    date_str = now.strftime("%Y%m%d_%H%M%S")
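
    # For every (model, eval) pair, three artifacts land under /tmp: an HTML
    # report, a sorted metrics JSON, and a full-results JSON dump.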
    for model_name, sampler in models.items():
        for eval_name, eval_obj in evals.items():
            # Calling an eval object with a sampler runs the full eval and
            # returns a result with metrics, HTML reports, and conversations.
            result = eval_obj(sampler)
            # Include the run timestamp in the file stem so repeated runs do
            # not overwrite each other.
            file_stem = f"{eval_name}_{model_name}_temp{args.temperature}_{date_str}"
            report_filename = f"/tmp/{file_stem}{debug_suffix}.html"
            print(f"Writing report to {report_filename}")
            with open(report_filename, "w") as fh:
                fh.write(report.make_report(result))
            assert result.metrics is not None
            metrics = result.metrics | {"score": result.score}
            # Sort metrics by key
            metrics = dict(sorted(metrics.items()))
            print(metrics)
            result_filename = f"/tmp/{file_stem}{debug_suffix}.json"
            with open(result_filename, "w") as f:
                f.write(json.dumps(metrics, indent=2))
            print(f"Writing results to {result_filename}")
            full_result_filename = f"/tmp/{file_stem}{debug_suffix}_allresults.json"
            with open(full_result_filename, "w") as f:
                result_dict = {
                    "score": result.score,
                    "metrics": result.metrics,
                    "htmls": result.htmls,
                    "convos": result.convos,
                    "metadata": result.metadata,
                }
                f.write(json.dumps(result_dict, indent=2))
            print(f"Writing all results to {full_result_filename}")
            mergekey2resultpath[file_stem] = result_filename
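
    # Summarize: re-read each per-run metrics file and keep a single headline
    # number per (eval, model) pair ("f1_score" when present, else "score").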
    merge_metrics = []
    for eval_model_name, result_filename in mergekey2resultpath.items():
        try:
            with open(result_filename) as f:
                result = json.load(f)
        except Exception as e:
            print(e, result_filename)
            continue
        result = result.get("f1_score", result.get("score", None))
        eval_name = eval_model_name[: eval_model_name.find("_")]
        model_name = eval_model_name[eval_model_name.find("_") + 1 :]
        merge_metrics.append(
            {"eval_name": eval_name, "model_name": model_name, "metric": result}
        )
    print(merge_metrics)
    return merge_metrics

if __name__ == "__main__":
    main()