diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 9c18766..6f6379a 100644
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -211,7 +211,7 @@ def make_match_groups_pairwise(
 
     logger.info("Load reference answers")
     judge_model = args.judge_model
-    answers = load_model_answers(REFERENCE_DIR / judge_model)
+    answers = load_model_answers(REFERENCE_DIR / "gpt-4")
     for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions):
         assert question["question_id"] in answers
     ref_answers = {judge_model: answers}