From 1f4bd142cb9691e5a8d600770ca253dc7c829552 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 7 Jan 2025 08:33:41 -0500 Subject: [PATCH] give mt_bench best_score_file a default name and use it for logging metrics Signed-off-by: Michael Clifford --- eval/final/components.py | 10 +++----- eval/mt_bench/components.py | 8 +++--- pipeline.yaml | 49 ++++++++++++++++++------------------- 3 files changed, 32 insertions(+), 35 deletions(-) diff --git a/eval/final/components.py b/eval/final/components.py index 8635b7b..391c7bc 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -489,15 +489,13 @@ def find_node_dataset_directories(base_dir: str): def generate_metrics_report_op( metrics: Output[Metrics], ): - import ast import json - with open("/output/mt_bench_data.jsonl", "r") as f: - mt_bench_data = json.loads(f.readline()) + with open("/output/mt_bench_best_data.json", "r") as f: + mt_bench_data = json.loads(f.read()) - metrics.log_metric("mt_bench_best_model", mt_bench_data["model"]) - metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"]) - metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"]) + metrics.log_metric("mt_bench_best_model", mt_bench_data["best_model"]) + metrics.log_metric("mt_bench_best_score", mt_bench_data["best_score"]) with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f: mt_bench_branch_data = json.loads(f.read()) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 72cbd0a..41646df 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -16,7 +16,7 @@ def run_mt_bench_op( max_workers: str, models_folder: str, output_path: str = "/output/mt_bench_data.jsonl", - best_score_file: Optional[str] = None, + best_score_file: str = "/output/mt_bench_best_data.json", ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os @@ -195,9 +195,9 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): outputs = NamedTuple("outputs", best_model=str, best_score=float) best_model = max(scores, key=scores.get) best_score = scores[best_model] - if best_score_file: - with open(best_score_file, "w", encoding="utf-8") as f: - json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) + + with open(best_score_file, "w", encoding="utf-8") as f: + json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) # Rename the best model directory to "candidate_model" for the next step # So we know which model to use for the final evaluation diff --git a/pipeline.yaml b/pipeline.yaml index cfaec7e..dfae9fe 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -495,6 +495,7 @@ components: inputDefinitions: parameters: best_score_file: + defaultValue: /output/mt_bench_best_data.json isOptional: true parameterType: STRING max_workers: @@ -686,20 +687,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\ - ):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.jsonl\"\ - , \"r\") as f:\n mt_bench_data = json.loads(f.readline())\n\n \ - \ metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"])\n\ - \ metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\ - ])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\ - error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\ - , \"r\") as f:\n 
mt_bench_branch_data = json.loads(f.read())\n\n\ - \ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\ - overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\ - , mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\ - /output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\ - \ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\ - , mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\ - , mmlu_branch_data[\"base_model_score\"])\n\n" + ):\n import json\n\n with open(\"/output/mt_bench_best_data.json\"\ + , \"r\") as f:\n mt_bench_data = json.loads(f.read())\n\n metrics.log_metric(\"\ + mt_bench_best_model\", mt_bench_data[\"best_model\"])\n metrics.log_metric(\"\ + mt_bench_best_score\", mt_bench_data[\"best_score\"])\n\n with open(\"\ + /output/mt_bench_branch/mt_bench_branch_data.json\", \"r\") as f:\n \ + \ mt_bench_branch_data = json.loads(f.read())\n\n metrics.log_metric(\"\ + mt_bench_branch_score\", mt_bench_branch_data[\"overall_score\"])\n metrics.log_metric(\n\ + \ \"mt_bench_branch_base_score\", mt_bench_branch_data[\"base_overall_score\"\ + ]\n )\n\n with open(\"/output/mmlu_branch/mmlu_branch_data.json\"\ + , \"r\") as f:\n mmlu_branch_data = json.loads(f.read())\n\n metrics.log_metric(\"\ + mmlu_branch_score\", mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"\ + mmlu_branch_base_score\", mmlu_branch_data[\"base_model_score\"])\n\n" image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 exec-git-clone-op: container: @@ -1449,9 +1448,9 @@ deploymentSpec: \ - 'auto'\n # with 'auto', number of gpus allocated for serving is\ \ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ \ max_workers: str,\n models_folder: str,\n output_path: str =\ - \ \"/output/mt_bench_data.jsonl\",\n best_score_file: Optional[str] =\ - \ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ - \ import json\n import os\n import subprocess\n\n import httpx\n\ + \ \"/output/mt_bench_data.jsonl\",\n best_score_file: str = \"/output/mt_bench_best_data.json\"\ + ,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n \ + \ import json\n import os\n import subprocess\n\n import httpx\n\ \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ @@ -1533,14 +1532,14 @@ deploymentSpec: \ \"w\", encoding=\"utf-8\") as f:\n for data in all_mt_bench_data:\n\ \ json.dump(data, f)\n f.write(\"\\n\")\n\n outputs\ \ = NamedTuple(\"outputs\", best_model=str, best_score=float)\n best_model\ - \ = max(scores, key=scores.get)\n best_score = scores[best_model]\n \ - \ if best_score_file:\n with open(best_score_file, \"w\", encoding=\"\ - utf-8\") as f:\n json.dump({\"best_model\": best_model, \"best_score\"\ - : best_score}, f, indent=4)\n\n # Rename the best model directory to\ - \ \"candidate_model\" for the next step\n # So we know which model to\ - \ use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\ - \ \"candidate_model\")):\n print(\"candidate_model already exists.\ - \ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\ + \ = max(scores, key=scores.get)\n best_score = scores[best_model]\n\n\ + \ with open(best_score_file, \"w\", encoding=\"utf-8\") as 
f:\n \ + \ json.dump({\"best_model\": best_model, \"best_score\": best_score},\ + \ f, indent=4)\n\n # Rename the best model directory to \"candidate_model\"\ + \ for the next step\n # So we know which model to use for the final evaluation\n\ + \ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\ + \ print(\"candidate_model already exists. Skipping renaming\")\n\ + \ else:\n os.rename(\n os.path.join(models_folder,\ \ best_model),\n os.path.join(models_folder, \"candidate_model\"\ ),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\ \n"
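
Note for reviewers: this change establishes a small file contract between run_mt_bench_op and generate_metrics_report_op. The evaluation step now always writes {"best_model": ..., "best_score": ...} to best_score_file, which defaults to /output/mt_bench_best_data.json, and the report step reads those same two keys back and logs them as metrics. The sketch below illustrates that contract in isolation; the StubMetrics class, the sample scores, and the temporary path are assumptions made for the example, not code from this patch.

    import json
    import os
    import tempfile

    # Stand-in for kfp's Output[Metrics] artifact (illustrative assumption only).
    class StubMetrics:
        def log_metric(self, name: str, value):
            print(f"metric logged: {name} = {value}")

    # Illustrative location; the patch's default is /output/mt_bench_best_data.json.
    best_score_file = os.path.join(tempfile.mkdtemp(), "mt_bench_best_data.json")

    # Producer side: run_mt_bench_op now always writes the best score file,
    # since best_score_file has a default value instead of being optional.
    scores = {"model_a": 6.9, "model_b": 7.4}
    best_model = max(scores, key=scores.get)
    best_score = scores[best_model]
    with open(best_score_file, "w", encoding="utf-8") as f:
        json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)

    # Consumer side: generate_metrics_report_op reads the same keys and logs them.
    metrics = StubMetrics()
    with open(best_score_file, "r") as f:
        mt_bench_data = json.loads(f.read())
    metrics.log_metric("mt_bench_best_model", mt_bench_data["best_model"])
    metrics.log_metric("mt_bench_best_score", mt_bench_data["best_score"])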