
Commit f829641

update generate_metrics_report_op and consolidate mt_bench reports
Signed-off-by: Michael Clifford <[email protected]>
1 parent 70fe60c commit f829641

3 files changed: +98 -97 lines changed

eval/final/components.py (+30 -28)
@@ -24,6 +24,7 @@ def run_final_eval_op(
     import json
     import os
     import subprocess
+    from pathlib import Path

     import httpx
     import torch
@@ -320,19 +321,19 @@ def find_node_dataset_directories(base_dir: str):
             "report_title": "KNOWLEDGE EVALUATION REPORT",
             "max_score": "1.0",
             "model": candidate_model,
-            "model_score": round(overall_score, 2),
+            "trained_model_score": round(overall_score, 2),
             "base_model": base_model_dir,
             "base_model_score": round(base_overall_score, 2),
             "summary": summary,
         }

         if not os.path.exists(mmlu_branch_output_path):
             os.makedirs(mmlu_branch_output_path)
-        with open(
-            f"{mmlu_branch_output_path}/mmlu_branch_data.json", "w", encoding="utf-8"
-        ) as f:
+        mmlu_branch_output_file = (
+            Path(mmlu_branch_output_path) / "mmlu_branch_data.json"
+        )
+        with open(mmlu_branch_output_file, "w", encoding="utf-8") as f:
             json.dump(mmlu_branch_data, f, indent=4)
-
     else:
         print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")

@@ -469,16 +470,19 @@ def find_node_dataset_directories(base_dir: str):
         "model": candidate_model,
         "judge_model": judge_model_name,
         "max_score": "10.0",
-        "overall_score": overall_score,
-        "base_overall_score": base_overall_score,
+        "trained_model_score": overall_score,
+        "base_model_score": base_overall_score,
         "error_rate": error_rate,
         "summary": summary,
     }

     if not os.path.exists(mt_bench_branch_output_path):
         os.makedirs(mt_bench_branch_output_path)
+    mt_bench_branch_data_file = (
+        Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json"
+    )
     with open(
-        f"{mt_bench_branch_output_path}/mt_bench_branch_data.json",
+        mt_bench_branch_data_file,
         "w",
         encoding="utf-8",
     ) as f:
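
Both branch reports now assemble their output file paths with pathlib.Path instead of an f-string. A minimal sketch of the same pattern outside the component, where the /tmp directory and the empty JSON payload are illustrative stand-ins:

from pathlib import Path
import json

mt_bench_branch_output_path = "/tmp/mt_bench_branch"  # illustrative directory
Path(mt_bench_branch_output_path).mkdir(parents=True, exist_ok=True)

# Same construction as the diff: join the directory and the file name with "/".
mt_bench_branch_data_file = (
    Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json"
)
with open(mt_bench_branch_data_file, "w", encoding="utf-8") as f:
    json.dump({}, f, indent=4)  # placeholder payload, not the real report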
@@ -489,27 +493,25 @@ def find_node_dataset_directories(base_dir: str):
 def generate_metrics_report_op(
     metrics: Output[Metrics],
 ):
-    import ast
     import json

-    with open("/output/mt_bench_data.json", "r") as f:
-        mt_bench_data = f.read()
-    mt_bench_data = ast.literal_eval(mt_bench_data)[0]
-
-    metrics.log_metric("mt_bench_best_model", mt_bench_data["model"])
-    metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"])
-    metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"])
-
-    with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f:
-        mt_bench_branch_data = json.loads(f.read())
-
-    metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"])
-    metrics.log_metric(
-        "mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"]
-    )
+    reports = {
+        "mt_bench": "/output/mt_bench_data.json",
+        "mt_bench_branch": "/output/mt_bench_branch/mt_bench_branch_data.json",
+        "mmlu_branch": "/output/mmlu_branch/mmlu_branch_data.json",
+    }

-    with open("/output/mmlu_branch/mmlu_branch_data.json", "r") as f:
-        mmlu_branch_data = json.loads(f.read())
+    for report, file_name in reports.items():
+        with open(file_name, "r", encoding="utf-8") as f:
+            report_data = json.load(f)

-    metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"])
-    metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"])
+        if report == "mt_bench":
+            metrics.log_metric(f"{report}_best_model", report_data["best_model"])
+            metrics.log_metric(f"{report}_best_score", report_data["best_score"])
+        else:
+            metrics.log_metric(
+                f"{report}_trained_model_score", report_data["trained_model_score"]
+            )
+            metrics.log_metric(
+                f"{report}_base_model_score", report_data["base_model_score"]
+            )
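
The rewritten generate_metrics_report_op walks one dictionary of report paths instead of three hand-written open/log blocks. A minimal local sketch of that loop, assuming the three JSON reports already exist under /output with the keys written by the updated components; SimpleMetrics is a hypothetical stand-in for the KFP Output[Metrics] artifact:

import json


class SimpleMetrics:
    """Collects metric name/value pairs the way Output[Metrics] would."""

    def __init__(self):
        self.values = {}

    def log_metric(self, name, value):
        self.values[name] = value


def generate_metrics_report(metrics, output_dir="/output"):
    # Same mapping of report name -> file as the updated component.
    reports = {
        "mt_bench": f"{output_dir}/mt_bench_data.json",
        "mt_bench_branch": f"{output_dir}/mt_bench_branch/mt_bench_branch_data.json",
        "mmlu_branch": f"{output_dir}/mmlu_branch/mmlu_branch_data.json",
    }

    for report, file_name in reports.items():
        with open(file_name, "r", encoding="utf-8") as f:
            report_data = json.load(f)

        if report == "mt_bench":
            # The consolidated mt_bench report carries best_model/best_score at the top level.
            metrics.log_metric(f"{report}_best_model", report_data["best_model"])
            metrics.log_metric(f"{report}_best_score", report_data["best_score"])
        else:
            # Both branch reports share the renamed trained/base score keys.
            metrics.log_metric(
                f"{report}_trained_model_score", report_data["trained_model_score"]
            )
            metrics.log_metric(
                f"{report}_base_model_score", report_data["base_model_score"]
            )

# Usage, once a pipeline run has produced the three reports:
#     m = SimpleMetrics()
#     generate_metrics_report(m)
#     print(m.values)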

eval/mt_bench/components.py (+8 -7)
@@ -16,7 +16,6 @@ def run_mt_bench_op(
     max_workers: str,
     models_folder: str,
     output_path: str = "/output/mt_bench_data.json",
-    best_score_file: Optional[str] = None,
 ) -> NamedTuple("outputs", best_model=str, best_score=float):
     import json
     import os
@@ -187,15 +186,17 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
            all_mt_bench_data.append(mt_bench_data)
            scores[model_path] = overall_score

-    with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(all_mt_bench_data, f, indent=4)
-
     outputs = NamedTuple("outputs", best_model=str, best_score=float)
     best_model = max(scores, key=scores.get)
     best_score = scores[best_model]
-    if best_score_file:
-        with open(best_score_file, "w", encoding="utf-8") as f:
-            json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)
+    mt_bench_report = {
+        "best_model": best_model,
+        "best_score": best_score,
+        "reports": all_mt_bench_data,
+    }
+
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(mt_bench_report, f, indent=4)

     # Rename the best model directory to "candidate_model" for the next step
     # So we know which model to use for the final evaluation
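
With this change run_mt_bench_op writes a single consolidated report to output_path instead of a bare list plus an optional best_score_file. A hedged illustration of the new file shape and how a downstream step can read it; the sample model path and scores are invented for the example, only the key names come from the diff:

import json

# Invented per-model entry; the field names match what run_mt_bench_op already collects.
all_mt_bench_data = [
    {
        "model": "/output/model/hf_format/sample_model",  # illustrative path
        "overall_score": 7.2,
        "turn_scores": [7.4, 7.0],
        "qa_scores": [],
        "error_rate": 0.0,
    }
]
scores = {entry["model"]: entry["overall_score"] for entry in all_mt_bench_data}

best_model = max(scores, key=scores.get)
best_score = scores[best_model]
mt_bench_report = {
    "best_model": best_model,
    "best_score": best_score,
    "reports": all_mt_bench_data,
}

with open("mt_bench_data.json", "w", encoding="utf-8") as f:
    json.dump(mt_bench_report, f, indent=4)

# A downstream consumer (e.g. generate_metrics_report_op) can now read the best
# model directly instead of parsing a bare list with ast.literal_eval:
with open("mt_bench_data.json", "r", encoding="utf-8") as f:
    report = json.load(f)
print(report["best_model"], report["best_score"])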

pipeline.yaml (+60 -62)
@@ -494,9 +494,6 @@ components:
     executorLabel: exec-run-mt-bench-op
     inputDefinitions:
       parameters:
-        best_score_file:
-          isOptional: true
-          parameterType: STRING
         max_workers:
           parameterType: STRING
         merge_system_user_message:
@@ -686,20 +683,18 @@ deploymentSpec:
 '
 - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
 \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\
-):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.json\"\
-, \"r\") as f:\n mt_bench_data = f.read()\n mt_bench_data = ast.literal_eval(mt_bench_data)[0]\n\
-\n metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"\
-])\n metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\
-])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\
-error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\
-, \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n\
-\ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\
-overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\
-, mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\
-/output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\
-\ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\
-, mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\
-, mmlu_branch_data[\"base_model_score\"])\n\n"
+):\n import json\n\n reports = {\n \"mt_bench\": \"/output/mt_bench_data.json\"\
+,\n \"mt_bench_branch\": \"/output/mt_bench_branch/mt_bench_branch_data.json\"\
+,\n \"mmlu_branch\": \"/output/mmlu_branch/mmlu_branch_data.json\"\
+,\n }\n\n for report, file_name in reports.items():\n with\
+\ open(file_name, \"r\", encoding=\"utf-8\") as f:\n report_data\
+\ = json.load(f)\n\n if report == \"mt_bench\":\n metrics.log_metric(f\"\
+{report}_best_model\", report_data[\"best_model\"])\n metrics.log_metric(f\"\
+{report}_best_score\", report_data[\"best_score\"])\n else:\n \
+\ metrics.log_metric(\n f\"{report}_trained_model_score\"\
+, report_data[\"trained_model_score\"]\n )\n metrics.log_metric(\n\
+\ f\"{report}_base_model_score\", report_data[\"base_model_score\"\
+]\n )\n\n"
 image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
 exec-git-clone-op:
 container:
@@ -1188,13 +1183,14 @@ deploymentSpec:
 ,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\
 \ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\
 /output/mt_bench_branch\",\n):\n import json\n import os\n import\
-\ subprocess\n\n import httpx\n import torch\n from instructlab.eval.mmlu\
-\ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\
-\ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\
-\ sort_score\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n\
-\ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint =\
-\ os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\"\
-)\n use_tls = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
+\ subprocess\n from pathlib import Path\n\n import httpx\n import\
+\ torch\n from instructlab.eval.mmlu import MMLUBranchEvaluator\n \
+\ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
+\ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\
+\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
+\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
+)\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\
+\ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
 \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\
 \ if use_tls else None\n\n print(\"Starting Final Eval...\")\n\n def\
 \ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\
@@ -1335,14 +1331,15 @@ deploymentSpec:
 \ regressions,\n no_changes,\n )\n\n \
 \ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\
 \ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\
-: candidate_model,\n \"model_score\": round(overall_score, 2),\n\
-\ \"base_model\": base_model_dir,\n \"base_model_score\"\
+: candidate_model,\n \"trained_model_score\": round(overall_score,\
+\ 2),\n \"base_model\": base_model_dir,\n \"base_model_score\"\
 : round(base_overall_score, 2),\n \"summary\": summary,\n \
 \ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \
-\ os.makedirs(mmlu_branch_output_path)\n with open(\n \
-\ f\"{mmlu_branch_output_path}/mmlu_branch_data.json\", \"w\", encoding=\"\
-utf-8\"\n ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n\
-\n else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
+\ os.makedirs(mmlu_branch_output_path)\n mmlu_branch_output_file\
+\ = (\n Path(mmlu_branch_output_path) / \"mmlu_branch_data.json\"\
+\n )\n with open(mmlu_branch_output_file, \"w\", encoding=\"\
+utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
+\ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
 \ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\
 \ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \
 \ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\
@@ -1406,12 +1403,13 @@ deploymentSpec:
 \ new_qnas,\n )\n\n mt_bench_branch_data = {\n \"report_title\"\
 : \"SKILLS EVALUATION REPORT\",\n \"model\": candidate_model,\n \
 \ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\
-,\n \"overall_score\": overall_score,\n \"base_overall_score\"\
+,\n \"trained_model_score\": overall_score,\n \"base_model_score\"\
 : base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\
 : summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n\
-\ os.makedirs(mt_bench_branch_output_path)\n with open(\n \
-\ f\"{mt_bench_branch_output_path}/mt_bench_branch_data.json\",\n \
-\ \"w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
+\ os.makedirs(mt_bench_branch_output_path)\n mt_bench_branch_data_file\
+\ = (\n Path(mt_bench_branch_output_path) / \"mt_bench_branch_data.json\"\
+\n )\n with open(\n mt_bench_branch_data_file,\n \"\
+w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
 \ f, indent=4)\n\n"
 env:
 - name: HOME
@@ -1449,23 +1447,23 @@ deploymentSpec:
 \ - 'auto'\n # with 'auto', number of gpus allocated for serving is\
 \ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
 \ max_workers: str,\n models_folder: str,\n output_path: str =\
-\ \"/output/mt_bench_data.json\",\n best_score_file: Optional[str] =\
-\ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\
-\ import json\n import os\n import subprocess\n\n import httpx\n\
-\ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\
-\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
-\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
-)\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\
-\ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
-\ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\
-\ if use_tls else None\n\n def launch_vllm(\n model_path: str,\
-\ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\
-\ import subprocess\n import sys\n import time\n\n\
-\ import requests\n from instructlab.model.backends.common\
-\ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\
-127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\
-\n\n command = [\n sys.executable,\n \"-m\"\
-,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\
+\ \"/output/mt_bench_data.json\",\n) -> NamedTuple(\"outputs\", best_model=str,\
+\ best_score=float):\n import json\n import os\n import subprocess\n\
+\n import httpx\n import torch\n from instructlab.eval.mt_bench\
+\ import MTBenchEvaluator\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\
+, \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\
+\ = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"\
+JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\
+\ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\
+\ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \
+\ def launch_vllm(\n model_path: str, gpu_count: int, retries: int\
+\ = 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \
+\ import sys\n import time\n\n import requests\n \
+\ from instructlab.model.backends.common import free_tcp_ipv4_port\n\
+\n free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port =\
+\ str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\
+\n command = [\n sys.executable,\n \"-m\",\n\
+\ \"vllm.entrypoints.openai.api_server\",\n \"--port\"\
 ,\n port,\n \"--model\",\n model_path,\n\
 \ ]\n if gpu_count > 0:\n command += [\n \
 \ \"--tensor-parallel-size\",\n str(gpu_count),\n\
@@ -1529,17 +1527,17 @@ deploymentSpec:
 \ \"overall_score\": overall_score,\n \"turn_scores\"\
 : turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\
 : error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\
-\ scores[model_path] = overall_score\n\n with open(output_path,\
-\ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\
-\ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\
-\ best_score=float)\n best_model = max(scores, key=scores.get)\n best_score\
-\ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\
-\ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\
-: best_model, \"best_score\": best_score}, f, indent=4)\n\n # Rename\
-\ the best model directory to \"candidate_model\" for the next step\n \
-\ # So we know which model to use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\
-\ \"candidate_model\")):\n print(\"candidate_model already exists.\
-\ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\
+\ scores[model_path] = overall_score\n\n outputs = NamedTuple(\"\
+outputs\", best_model=str, best_score=float)\n best_model = max(scores,\
+\ key=scores.get)\n best_score = scores[best_model]\n mt_bench_report\
+\ = {\n \"best_model\": best_model,\n \"best_score\": best_score,\n\
+\ \"reports\": all_mt_bench_data,\n }\n\n with open(output_path,\
+\ \"w\", encoding=\"utf-8\") as f:\n json.dump(mt_bench_report, f,\
+\ indent=4)\n\n # Rename the best model directory to \"candidate_model\"\
+\ for the next step\n # So we know which model to use for the final evaluation\n\
+\ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\
+\ print(\"candidate_model already exists. Skipping renaming\")\n\
+\ else:\n os.rename(\n os.path.join(models_folder,\
 \ best_model),\n os.path.join(models_folder, \"candidate_model\"\
 ),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\
 \n"
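
The pipeline.yaml hunks above are not hand-edits: the deploymentSpec strings embed the component source, so the YAML is regenerated after the Python components change. A minimal sketch of that regeneration with the KFP v2 compiler, assuming a @dsl.pipeline-decorated function; the module and function names here are illustrative, not taken from this commit:

# Regenerate pipeline.yaml after editing the component sources.
# "standalone_pipeline" and the "pipeline" module are illustrative names,
# not taken from this repository.
from kfp import compiler

from pipeline import standalone_pipeline  # hypothetical import

compiler.Compiler().compile(
    pipeline_func=standalone_pipeline,
    package_path="pipeline.yaml",
)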
