diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 9c18766..6f6379a 100644 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -211,7 +211,7 @@ def make_match_groups_pairwise( logger.info("Load reference answers") judge_model = args.judge_model - answers = load_model_answers(REFERENCE_DIR / judge_model) + answers = load_model_answers(REFERENCE_DIR / "gpt-4") for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions): assert question["question_id"] in answers ref_answers = {judge_model: answers}