From 1f4bd142cb9691e5a8d600770ca253dc7c829552 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 7 Jan 2025 08:33:41 -0500 Subject: [PATCH] give mt_bench best_score_file a default name and use it for logging metrics Signed-off-by: Michael Clifford --- eval/final/components.py | 10 +++----- eval/mt_bench/components.py | 8 +++--- pipeline.yaml | 49 ++++++++++++++++++------------------- 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/eval/final/components.py b/eval/final/components.py index 8635b7b..391c7bc 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -489,15 +489,13 @@ def find_node_dataset_directories(base_dir: str): def generate_metrics_report_op( metrics: Output[Metrics], ): - import ast import json - with open("/output/mt_bench_data.jsonl", "r") as f: - mt_bench_data = json.loads(f.readline()) + with open("/output/mt_bench_best_data.json", "r") as f: + mt_bench_data = json.loads(f.read()) - metrics.log_metric("mt_bench_best_model", mt_bench_data["model"]) - metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"]) - metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"]) + metrics.log_metric("mt_bench_best_model", mt_bench_data["best_model"]) + metrics.log_metric("mt_bench_best_score", mt_bench_data["best_score"]) with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f: mt_bench_branch_data = json.loads(f.read()) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 72cbd0a..41646df 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -16,7 +16,7 @@ def run_mt_bench_op( max_workers: str, models_folder: str, output_path: str = "/output/mt_bench_data.jsonl", - best_score_file: Optional[str] = None, + best_score_file: str = "/output/mt_bench_best_data.json", ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os @@ -195,9 +195,9 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): outputs = NamedTuple("outputs", best_model=str, best_score=float) best_model = max(scores, key=scores.get) best_score = scores[best_model] - if best_score_file: - with open(best_score_file, "w", encoding="utf-8") as f: - json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) + + with open(best_score_file, "w", encoding="utf-8") as f: + json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) # Rename the best model directory to "candidate_model" for the next step # So we know which model to use for the final evaluation diff --git a/pipeline.yaml b/pipeline.yaml index cfaec7e..dfae9fe 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -495,6 +495,7 @@ components: inputDefinitions: parameters: best_score_file: + defaultValue: /output/mt_bench_best_data.json isOptional: true parameterType: STRING max_workers: @@ -686,20 +687,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\ - ):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.jsonl\"\ - , \"r\") as f:\n mt_bench_data = json.loads(f.readline())\n\n \ - \ metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"])\n\ - \ metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\ - ])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\ - error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\ - , \"r\") as f:\n 
mt_bench_branch_data = json.loads(f.read())\n\n\ - \ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\ - overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\ - , mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\ - /output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\ - \ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\ - , mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\ - , mmlu_branch_data[\"base_model_score\"])\n\n" + ):\n import json\n\n with open(\"/output/mt_bench_best_data.json\"\ + , \"r\") as f:\n mt_bench_data = json.loads(f.read())\n\n metrics.log_metric(\"\ + mt_bench_best_model\", mt_bench_data[\"best_model\"])\n metrics.log_metric(\"\ + mt_bench_best_score\", mt_bench_data[\"best_score\"])\n\n with open(\"\ + /output/mt_bench_branch/mt_bench_branch_data.json\", \"r\") as f:\n \ + \ mt_bench_branch_data = json.loads(f.read())\n\n metrics.log_metric(\"\ + mt_bench_branch_score\", mt_bench_branch_data[\"overall_score\"])\n metrics.log_metric(\n\ + \ \"mt_bench_branch_base_score\", mt_bench_branch_data[\"base_overall_score\"\ + ]\n )\n\n with open(\"/output/mmlu_branch/mmlu_branch_data.json\"\ + , \"r\") as f:\n mmlu_branch_data = json.loads(f.read())\n\n metrics.log_metric(\"\ + mmlu_branch_score\", mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"\ + mmlu_branch_base_score\", mmlu_branch_data[\"base_model_score\"])\n\n" image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 exec-git-clone-op: container: @@ -1449,9 +1448,9 @@ deploymentSpec: \ - 'auto'\n # with 'auto', number of gpus allocated for serving is\ \ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ \ max_workers: str,\n models_folder: str,\n output_path: str =\ - \ \"/output/mt_bench_data.jsonl\",\n best_score_file: Optional[str] =\ - \ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ - \ import json\n import os\n import subprocess\n\n import httpx\n\ + \ \"/output/mt_bench_data.jsonl\",\n best_score_file: str = \"/output/mt_bench_best_data.json\"\ + ,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n \ + \ import json\n import os\n import subprocess\n\n import httpx\n\ \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ @@ -1533,14 +1532,14 @@ deploymentSpec: \ \"w\", encoding=\"utf-8\") as f:\n for data in all_mt_bench_data:\n\ \ json.dump(data, f)\n f.write(\"\\n\")\n\n outputs\ \ = NamedTuple(\"outputs\", best_model=str, best_score=float)\n best_model\ - \ = max(scores, key=scores.get)\n best_score = scores[best_model]\n \ - \ if best_score_file:\n with open(best_score_file, \"w\", encoding=\"\ - utf-8\") as f:\n json.dump({\"best_model\": best_model, \"best_score\"\ - : best_score}, f, indent=4)\n\n # Rename the best model directory to\ - \ \"candidate_model\" for the next step\n # So we know which model to\ - \ use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\ - \ \"candidate_model\")):\n print(\"candidate_model already exists.\ - \ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\ + \ = max(scores, key=scores.get)\n best_score = scores[best_model]\n\n\ + \ with open(best_score_file, \"w\", encoding=\"utf-8\") as 
f:\n \ + \ json.dump({\"best_model\": best_model, \"best_score\": best_score},\ + \ f, indent=4)\n\n # Rename the best model directory to \"candidate_model\"\ + \ for the next step\n # So we know which model to use for the final evaluation\n\ + \ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\ + \ print(\"candidate_model already exists. Skipping renaming\")\n\ + \ else:\n os.rename(\n os.path.join(models_folder,\ \ best_model),\n os.path.join(models_folder, \"candidate_model\"\ ),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\ \n"
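
Note for reviewers: this change establishes a small file contract between run_mt_bench_op and generate_metrics_report_op. The evaluation step now always writes {"best_model": ..., "best_score": ...} to best_score_file, which defaults to /output/mt_bench_best_data.json, and the report step reads those same two keys back and logs them as metrics. The sketch below illustrates that contract in isolation; the StubMetrics class, the sample scores, and the temporary path are assumptions made for the example, not code from this patch.

    import json
    import os
    import tempfile

    # Stand-in for kfp's Output[Metrics] artifact (illustrative assumption only).
    class StubMetrics:
        def log_metric(self, name: str, value):
            print(f"metric logged: {name} = {value}")

    # Illustrative location; the patch's default is /output/mt_bench_best_data.json.
    best_score_file = os.path.join(tempfile.mkdtemp(), "mt_bench_best_data.json")

    # Producer side: run_mt_bench_op now always writes the best score file,
    # since best_score_file has a default value instead of being optional.
    scores = {"model_a": 6.9, "model_b": 7.4}
    best_model = max(scores, key=scores.get)
    best_score = scores[best_model]
    with open(best_score_file, "w", encoding="utf-8") as f:
        json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)

    # Consumer side: generate_metrics_report_op reads the same keys and logs them.
    metrics = StubMetrics()
    with open(best_score_file, "r") as f:
        mt_bench_data = json.loads(f.read())
    metrics.log_metric("mt_bench_best_model", mt_bench_data["best_model"])
    metrics.log_metric("mt_bench_best_score", mt_bench_data["best_score"])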