@@ -494,9 +494,6 @@ components:
    executorLabel: exec-run-mt-bench-op
    inputDefinitions:
      parameters:
-       best_score_file:
-         isOptional: true
-         parameterType: STRING
        max_workers:
          parameterType: STRING
        merge_system_user_message:
@@ -686,20 +683,18 @@ deploymentSpec:
'
- "\n import kfp\n from kfp import dsl\n from kfp.dsl import *\n from typing import\
\ *\n\n def generate_metrics_report_op(\n metrics: Output[Metrics],\n \
- ):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.json\"\
- , \"r\") as f:\n mt_bench_data = f.read()\n mt_bench_data = ast.literal_eval(mt_bench_data)[0]\n \
- \n metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"\
- ])\n metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\
- ])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\
- error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\
- , \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n \
- \ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\
- overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\
- , mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\
- /output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\
- \ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\
- , mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\
- , mmlu_branch_data[\"base_model_score\"])\n\n "
+ ):\n import json\n\n reports = {\n \"mt_bench\": \"/output/mt_bench_data.json\"\
+ ,\n \"mt_bench_branch\": \"/output/mt_bench_branch/mt_bench_branch_data.json\"\
+ ,\n \"mmlu_branch\": \"/output/mmlu_branch/mmlu_branch_data.json\"\
+ ,\n }\n\n for report, file_name in reports.items():\n with\
+ \ open(file_name, \"r\", encoding=\"utf-8\") as f:\n report_data\
+ \ = json.load(f)\n\n if report == \"mt_bench\":\n metrics.log_metric(f\"\
+ {report}_best_model\", report_data[\"best_model\"])\n metrics.log_metric(f\"\
+ {report}_best_score\", report_data[\"best_score\"])\n else:\n \
+ \ metrics.log_metric(\n f\"{report}_trained_model_score\"\
+ , report_data[\"trained_model_score\"]\n )\n metrics.log_metric(\n \
+ \ f\"{report}_base_model_score\", report_data[\"base_model_score\"\
+ ]\n )\n\n "
        image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
    exec-git-clone-op:
      container:
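For readability, here is the refactored generate_metrics_report_op from the hunk above rendered as plain Python rather than an escaped YAML string. The @dsl.component decorator and base_image argument are assumptions about how the source component might be declared (the compiled spec shows only the executor and its image); the function body, report paths, and metric names are taken from the added lines.

from kfp import dsl
from kfp.dsl import Metrics, Output


@dsl.component(  # assumed declaration; only the image appears in the compiled spec
    base_image="quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111"
)
def generate_metrics_report_op(metrics: Output[Metrics]):
    # Imports live inside the body so they are captured when the component is serialized.
    import json

    # One JSON report per evaluation step, written by the eval components.
    reports = {
        "mt_bench": "/output/mt_bench_data.json",
        "mt_bench_branch": "/output/mt_bench_branch/mt_bench_branch_data.json",
        "mmlu_branch": "/output/mmlu_branch/mmlu_branch_data.json",
    }

    for report, file_name in reports.items():
        with open(file_name, "r", encoding="utf-8") as f:
            report_data = json.load(f)

        if report == "mt_bench":
            metrics.log_metric(f"{report}_best_model", report_data["best_model"])
            metrics.log_metric(f"{report}_best_score", report_data["best_score"])
        else:
            metrics.log_metric(
                f"{report}_trained_model_score", report_data["trained_model_score"]
            )
            metrics.log_metric(
                f"{report}_base_model_score", report_data["base_model_score"]
            )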
@@ -1188,13 +1183,14 @@ deploymentSpec:
,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\
\ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\
/output/mt_bench_branch\",\n ):\n import json\n import os\n import\
- \ subprocess\n\n import httpx\n import torch\n from instructlab.eval.mmlu\
- \ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\
- \ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\
- \ sort_score\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \
- \ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint =\
- \ os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\"\
- )\n use_tls = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
+ \ subprocess\n from pathlib import Path\n\n import httpx\n import\
+ \ torch\n from instructlab.eval.mmlu import MMLUBranchEvaluator\n \
+ \ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
+ \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n \
+ \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
+ \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
+ )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\
+ \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
\ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\
\ if use_tls else None\n\n print(\"Starting Final Eval...\")\n\n def\
\ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\
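Apart from the reordered imports, the judge-client setup in this hunk is unchanged; as a standalone sketch (the environment variable names come from the diff, while the empty-string default for JUDGE_CA_CERT_PATH is an assumption added here so os.path.exists never receives None):

import os

import httpx

# Env var names as used in the component above; the "" default is an assumption.
judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH", "")

# Only verify against the mounted CA bundle when the file exists and is non-empty.
use_tls = os.path.exists(judge_ca_cert_path) and os.path.getsize(judge_ca_cert_path) > 0
judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None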
@@ -1335,14 +1331,15 @@ deploymentSpec:
\ regressions,\n no_changes,\n )\n\n \
\ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\
\ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\
- : candidate_model,\n \"model_score\": round(overall_score, 2), \n \
- \ \"base_model\": base_model_dir,\n \"base_model_score\"
+ : candidate_model,\n \"trained_model_score\": round(overall_score,\
+ \ 2), \n \"base_model\": base_model_dir,\n \"base_model_score\"
: round(base_overall_score, 2),\n \"summary\": summary,\n \
\ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \
- \ os.makedirs(mmlu_branch_output_path)\n with open(\n \
- \ f\"{mmlu_branch_output_path}/mmlu_branch_data.json\", \"w\", encoding=\"\
- utf-8\"\n ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
- \n else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
+ \ os.makedirs(mmlu_branch_output_path)\n mmlu_branch_output_file\
+ \ = (\n Path(mmlu_branch_output_path) / \"mmlu_branch_data.json\"
+ \n )\n with open(mmlu_branch_output_file, \"w\", encoding=\"\
+ utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
+ \ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\
\ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \
\ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\
@@ -1406,12 +1403,13 @@ deploymentSpec:
\ new_qnas,\n )\n\n mt_bench_branch_data = {\n \"report_title\"
: \"SKILLS EVALUATION REPORT\",\n \"model\": candidate_model,\n \
\ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\
- ,\n \"overall_score\": overall_score,\n \"base_overall_score\"
+ ,\n \"trained_model_score\": overall_score,\n \"base_model_score\"
: base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"
: summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n \
- \ os.makedirs(mt_bench_branch_output_path)\n with open(\n \
- \ f\"{mt_bench_branch_output_path}/mt_bench_branch_data.json\",\n \
- \ \"w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
+ \ os.makedirs(mt_bench_branch_output_path)\n mt_bench_branch_data_file\
+ \ = (\n Path(mt_bench_branch_output_path) / \"mt_bench_branch_data.json\"
+ \n )\n with open(\n mt_bench_branch_data_file,\n \"
+ w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
\ f, indent=4)\n\n "
        env:
        - name: HOME
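Both branch reports are now written the same way: build a dict with the renamed trained_model_score / base_model_score keys, then join the output path with pathlib instead of an f-string. A minimal sketch for the MT-Bench branch case, with placeholder values standing in for the scores computed during evaluation:

import json
import os
from pathlib import Path

mt_bench_branch_output_path = "/output/mt_bench_branch"
mt_bench_branch_data = {
    "report_title": "SKILLS EVALUATION REPORT",
    "model": "candidate_model",     # placeholder
    "judge_model": "judge-model",   # placeholder
    "max_score": "10.0",
    "trained_model_score": 0.0,     # placeholder
    "base_model_score": 0.0,        # placeholder
    "error_rate": 0.0,              # placeholder
    "summary": "",                  # placeholder
}

if not os.path.exists(mt_bench_branch_output_path):
    os.makedirs(mt_bench_branch_output_path)

# Pathlib join replaces the earlier f-string path construction.
mt_bench_branch_data_file = Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json"
with open(mt_bench_branch_data_file, "w", encoding="utf-8") as f:
    json.dump(mt_bench_branch_data, f, indent=4)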
@@ -1449,23 +1447,23 @@ deploymentSpec:
\ - 'auto'\n # with 'auto', number of gpus allocated for serving is\
\ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n \
\ max_workers: str,\n models_folder: str,\n output_path: str =\
- \ \"/output/mt_bench_data.json\",\n best_score_file: Optional[str] = \
- \ None, \n ) -> NamedTuple(\"outputs\", best_model=str, best_score=float): \n \
- \ import json \n import os \n import subprocess \n\n import httpx \n \
- \ import torch \n from instructlab.eval.mt_bench import MTBenchEvaluator \n \
- \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name \
- \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
- )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\") \n use_tls \
- \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
- \ > 0 \n ) \n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\
- \ if use_tls else None \n\n def launch_vllm(\n model_path: str,\
- \ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n \
- \ import subprocess \n import sys \n import time \n\n \
- \ import requests \n from instructlab.model.backends.common\
- \ import free_tcp_ipv4_port \n\n free_port = free_tcp_ipv4_port(\"\
- 127.0.0.1\") \n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\
- \n\n command = [\n sys.executable,\n \"-m\"\
- , \n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\
+ \ \"/output/mt_bench_data.json\",\n ) -> NamedTuple(\"outputs\", best_model=str, \
+ \ best_score=float): \n import json \n import os \n import subprocess \n \
+ \n import httpx \n import torch \n from instructlab.eval.mt_bench \
+ \ import MTBenchEvaluator \n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\
+ , \"\") \n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint \
+ \ = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"\
+ JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path) \
+ \ and (\n os.path.getsize(judge_ca_cert_path) > 0 \n ) \n judge_http_client \
+ \ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None \n\n \
+ \ def launch_vllm(\n model_path: str, gpu_count: int, retries: int \
+ \ = 120, delay: int = 10\n ) -> tuple:\n import subprocess \n \
+ \ import sys \n import time \n\n import requests \n \
+ \ from instructlab.model.backends.common import free_tcp_ipv4_port \n \
+ \n free_port = free_tcp_ipv4_port(\"127.0.0.1\") \n port = \
+ \ str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\n \
+ \n command = [\n sys.executable,\n \"-m\", \n \
+ \ \"vllm.entrypoints.openai.api_server\",\n \"--port\"\
,\n port,\n \"--model\",\n model_path,\n \
\ ]\n if gpu_count > 0:\n command += [\n \
\ \"--tensor-parallel-size\",\n str(gpu_count),\n \
@@ -1529,17 +1527,17 @@ deploymentSpec:
\ \"overall_score\": overall_score,\n \"turn_scores\"\
: turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\
: error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n \
- \ scores[model_path] = overall_score\n\n with open(output_path, \
- \ \"w\", encoding=\"utf-8\") as f: \n json.dump(all_mt_bench_data,\
- \ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str, \
- \ best_score=float) \n best_model = max(scores, key=scores.get) \n best_score\
- \ = scores[best_model] \n if best_score_file: \n with open(best_score_file,\
- \ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\
- : best_model, \"best_score\": best_score}, f, indent=4)\n\n # Rename\
- \ the best model directory to \"candidate_model\" for the next step \n \
- \ # So we know which model to use for the final evaluation \n if os.path.exists(os.path.join(models_folder,\
- \ \"candidate_model\")): \n print(\"candidate_model already exists.\
- \ Skipping renaming\") \n else:\n os.rename(\n os.path.join(models_folder,\
+ \ scores[model_path] = overall_score\n\n outputs = NamedTuple(\"\
+ outputs\", best_model=str, best_score=float) \n best_model = max(scores,\
+ \ key=scores.get)\n best_score = scores[best_model] \n mt_bench_report \
+ \ = { \n \"best_model\": best_model, \n \"best_score\": best_score, \n \
+ \ \"reports\": all_mt_bench_data, \n } \n\n with open(output_path,\
+ \ \"w\", encoding=\"utf-8\") as f:\n json.dump(mt_bench_report, f, \
+ \ indent=4)\n\n # Rename the best model directory to \"candidate_model\"\
+ \ for the next step \n # So we know which model to use for the final evaluation \n \
+ \ if os.path.exists(os.path.join(models_folder, \"candidate_model\")): \n \
+ \ print(\"candidate_model already exists. Skipping renaming\") \n \
+ \ else:\n os.rename(\n os.path.join(models_folder,\
\ best_model),\n os.path.join(models_folder, \"candidate_model\"\
),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n \
\n "
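With best_score_file gone, run_mt_bench_op now folds the best model, its score, and the per-model results into the single JSON written to output_path; this is the file the generate_metrics_report_op sketch earlier reads its mt_bench_best_model / mt_bench_best_score metrics from. A minimal sketch with placeholder scores standing in for the values gathered during evaluation:

import json
from typing import NamedTuple

# Placeholders for the per-model MT-Bench results collected in the loop above.
scores = {"/output/model/checkpoint-a": 0.0}
all_mt_bench_data = [{"model": "/output/model/checkpoint-a", "overall_score": 0.0}]

outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]

# Consolidated report: best model, best score, and the full per-model reports.
mt_bench_report = {
    "best_model": best_model,
    "best_score": best_score,
    "reports": all_mt_bench_data,
}

with open("/output/mt_bench_data.json", "w", encoding="utf-8") as f:
    json.dump(mt_bench_report, f, indent=4)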