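"""Command-line runner for the gpt-oss evals (GPQA, AIME 2025, HealthBench).

Samples from gpt-oss models through an OpenAI-compatible endpoint
(default: http://localhost:8000/v1) and writes per-eval HTML reports and
JSON metrics to /tmp.
"""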
import argparse
import json
from datetime import datetime

from . import report
from .gpqa_eval import GPQAEval
from .aime_eval import AIME25Eval
from .healthbench_eval import HealthBenchEval
from .chat_completion_sampler import (
    OPENAI_SYSTEM_MESSAGE_API,
    ChatCompletionSampler,
)
from .responses_sampler import ResponsesSampler


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate the models.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--list-models", action="store_true", help="List available models"
    )
    parser.add_argument(
        "--model",
        type=str,
        help="Select a model by name. Also accepts a comma-separated list of models.",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8000/v1",
        help="Base URL for the API.",
    )
    parser.add_argument(
        "--eval",
        type=str,
        default="gpqa,healthbench,healthbench_hard,healthbench_consensus,aime25",
        help="Select an eval by name. Also accepts a comma-separated list of evals.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Sampling temperature",
    )
    parser.add_argument(
        "--n-threads",
        type=int,
        default=1584,
        help="Number of threads to run.",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Run in debug mode"
    )
    parser.add_argument(
        "--examples", type=int, help="Number of examples to use (overrides default)"
    )

    args = parser.parse_args()

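    # Pre-configured samplers: gpt-oss-120b and gpt-oss-20b at low, medium, and high reasoning effort.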
    models = {
        "120b-low": ResponsesSampler(
            model="gpt-oss-120b",
            reasoning_model=True,
            reasoning_effort="low",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "120b": ResponsesSampler(
            model="gpt-oss-120b",
            reasoning_model=True,
            reasoning_effort="medium",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "120b-high": ResponsesSampler(
            model="gpt-oss-120b",
            reasoning_model=True,
            reasoning_effort="high",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "20b-low": ResponsesSampler(
            model="gpt-oss-20b",
            reasoning_model=True,
            reasoning_effort="low",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "20b": ResponsesSampler(
            model="gpt-oss-20b",
            reasoning_model=True,
            reasoning_effort="medium",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
        "20b-high": ResponsesSampler(
            model="gpt-oss-20b",
            reasoning_model=True,
            reasoning_effort="high",
            temperature=args.temperature,
            base_url=args.base_url,
        ),
    }

    if args.list_models:
        print("Available models:")
        for model_name in models.keys():
            print(f" - {model_name}")
        return

    if args.model:
        models_chosen = args.model.split(",")
        for model_name in models_chosen:
            if model_name not in models:
                print(f"Error: Model '{model_name}' not found.")
                return
        models = {model_name: models[model_name] for model_name in models_chosen}

    print(f"Running with args {args}")

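    # gpt-4.1 serves as the grader model for the HealthBench rubric scoring.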
    grading_sampler = ChatCompletionSampler(
        model="gpt-4.1-2025-04-14",
        system_message=OPENAI_SYSTEM_MESSAGE_API,
        max_tokens=2048,
    )

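    # Map an eval name to a configured eval instance; debug mode runs only a few examples.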
    def get_evals(eval_name, debug_mode):
        num_examples = (
            args.examples if args.examples is not None else (5 if debug_mode else None)
        )
        # Set num_examples = None to reproduce full evals
        match eval_name:
            case "gpqa":
                return GPQAEval(
                    n_repeats=8,
                    num_examples=num_examples,
                    debug=debug_mode,
                    n_threads=args.n_threads or 1,
                )
            case "healthbench":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name=None,
                )
            case "healthbench_hard":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name="hard",
                )
            case "healthbench_consensus":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name="consensus",
                )
            case "aime25":
                return AIME25Eval(
                    n_repeats=8,
                    num_examples=num_examples,
                    n_threads=args.n_threads or 1,
                )
            case _:
                raise Exception(f"Unrecognized eval type: {eval_name}")

    evals_list = args.eval.split(",")
    evals = {}
    for eval_name in evals_list:
        evals[eval_name] = get_evals(eval_name, args.debug)

    print(evals)
    debug_suffix = "_DEBUG" if args.debug else ""
    print(debug_suffix)
    mergekey2resultpath = {}
    print(f"Running the following evals: {list(evals.keys())}")
    print(f"Running evals for the following models: {list(models.keys())}")

    now = datetime.now()
    date_str = now.strftime("%Y%m%d_%H%M%S")
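    # Run every selected eval against every selected model, writing an HTML report and JSON metrics to /tmp.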
    for model_name, sampler in models.items():
        for eval_name, eval_obj in evals.items():
            result = eval_obj(sampler)
            # ^^^ how to use a sampler
            file_stem = f"{eval_name}_{model_name}_temp{args.temperature}"
            # file stem should also include the year, month, day, and time in hours and minutes
            file_stem += f"_{date_str}"
            report_filename = f"/tmp/{file_stem}{debug_suffix}.html"
            print(f"Writing report to {report_filename}")
            with open(report_filename, "w") as fh:
                fh.write(report.make_report(result))
            assert result.metrics is not None
            metrics = result.metrics | {"score": result.score}
            # Sort metrics by key
            metrics = dict(sorted(metrics.items()))
            print(metrics)
            result_filename = f"/tmp/{file_stem}{debug_suffix}.json"
            with open(result_filename, "w") as f:
                f.write(json.dumps(metrics, indent=2))
            print(f"Writing results to {result_filename}")

            full_result_filename = f"/tmp/{file_stem}{debug_suffix}_allresults.json"
            with open(full_result_filename, "w") as f:
                result_dict = {
                    "score": result.score,
                    "metrics": result.metrics,
                    "htmls": result.htmls,
                    "convos": result.convos,
                    "metadata": result.metadata,
                }
                f.write(json.dumps(result_dict, indent=2))
            print(f"Writing all results to {full_result_filename}")

            mergekey2resultpath[f"{file_stem}"] = result_filename
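    # Collect the headline score from each per-run result file into a single summary list.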
    merge_metrics = []
    for eval_model_name, result_filename in mergekey2resultpath.items():
        try:
            with open(result_filename, "r") as f:
                result = json.load(f)
        except Exception as e:
            print(e, result_filename)
            continue
        result = result.get("f1_score", result.get("score", None))
        eval_name = eval_model_name[: eval_model_name.find("_")]
        model_name = eval_model_name[eval_model_name.find("_") + 1 :]
        merge_metrics.append(
            {"eval_name": eval_name, "model_name": model_name, "metric": result}
        )
    print(merge_metrics)
    return merge_metrics


if __name__ == "__main__":
    main()