Commit f5eeed4

Add final eval mmlu_branch
1 parent 2f7d51e commit f5eeed4

8 files changed, +239 -23 lines changed

eval/final/__init__.py
+2 -2

@@ -1,5 +1,5 @@
-from .components import run_mt_bench_branch_op
+from .components import run_final_eval_op
 
 # from . import faked
 
-__all__ = ["run_mt_bench_branch_op"]
+__all__ = ["run_final_eval_op"]

eval/final/components.py
+98 -1

@@ -14,15 +14,20 @@
         "vllm",
     ],
 )
-def run_mt_bench_branch_op(
+def run_final_eval_op(
+    mmlu_branch_output: Output[Artifact],
     mt_bench_branch_output: Output[Artifact],
     candidate_model: str,
     base_model_dir: str,
+    tasks: Input[Dataset],
     taxonomy: Input[Dataset],
     base_branch: str,
     candidate_branch: str,
     max_workers: str,
     device: str,
+    model_dtype: str,
+    few_shots: int,
+    batch_size: int,
     merge_system_user_message: bool,
 ):
     import json
@@ -34,6 +39,7 @@ def run_mt_bench_branch_op(
         launch_vllm,
         stop_vllm,
     )
+    from instructlab.eval.mmlu import MMLU_TASKS, MMLUBranchEvaluator
     from instructlab.eval.mt_bench import MTBenchBranchEvaluator
     from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score
 
@@ -241,3 +247,94 @@ def branch_eval_summary_to_json(
 
     with open(mt_bench_branch_output.path, "w") as f:
         json.dump(mt_bench_branch_data, f, indent=4)
+
+    # MMLU_BRANCH
+
+    # find_node_dataset_directories to find sdg output node_datasets_*
+    def find_node_dataset_directories(base_directory: str):
+        import os
+        import re
+
+        # This is specific to ilab/eval output
+        pattern = r"node_datasets_"
+        matching_dirs = []
+        regex = re.compile(pattern)
+
+        for root, dirs, files in os.walk(base_directory):
+            for directory in dirs:
+                if regex.search(directory):
+                    matching_dirs.append(os.path.join(root, directory))
+
+        return matching_dirs
+
+    mmlu_tasks = ["mmlu_pr"]
+
+    node_dataset_dirs = find_node_dataset_directories(tasks.path)
+    if node_dataset_dirs:
+        tasks_dir = node_dataset_dirs[0]
+
+        mmlu_branch_evaluators = [
+            MMLUBranchEvaluator(
+                model_path=candidate_model,
+                tasks_dir=tasks_dir,
+                tasks=mmlu_tasks,
+                few_shots=few_shots,
+                batch_size=batch_size,
+            ),
+            MMLUBranchEvaluator(
+                model_path=base_model,
+                tasks_dir=tasks_dir,
+                tasks=mmlu_tasks,
+                few_shots=few_shots,
+                batch_size=batch_size,
+            ),
+        ]
+        m_paths = [candidate_model, base_model]
+        overall_scores = []
+        individual_scores_list = []
+        for i, evaluator in enumerate(mmlu_branch_evaluators):
+            m_path = m_paths[i]
+            launch_local_vllm(m_path, gpu_count)
+            overall_score, individual_scores = evaluator.run(VLLM_SERVER)
+            overall_scores.append(overall_score)
+            individual_scores_list.append(individual_scores)
+            stop_local_vllm()
+
+        overall_score = overall_scores[0]
+        base_overall_score = overall_scores[1]
+        individual_scores = individual_scores_list[0]
+        base_individual_scores = individual_scores_list[1]
+
+        improvements, regressions, no_changes = [], [], []
+        for task, score in individual_scores.items():
+            base_score = base_individual_scores[task]
+            s = score["score"]
+            b_s = base_score["score"]
+            d = round(s - b_s, 2)
+            if s > b_s:
+                improvements.append((task, d, b_s, s))
+            elif b_s > s:
+                regressions.append((task, d, b_s, s))
+            else:
+                no_changes.append((task, s))
+
+        summary = branch_eval_summary_to_json(
+            improvements,
+            regressions,
+            no_changes,
+        )
+
+        mmlu_branch_data = {
+            "report_title": "KNOWLEDGE EVALUATION REPORT",
+            "max_score": "1.0",
+            "model": candidate_model,
+            "model_score": round(overall_score, 2),
+            "base_model": base_model,
+            "base_model_score": round(base_overall_score, 2),
+            "summary": summary,
+        }
+
+        with open(mmlu_branch_output.path, "w") as f:
+            json.dump(mmlu_branch_data, f, indent=4)
+    else:
+        print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")

pipeline.py
+6 -2

@@ -65,7 +65,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
     )
 
     # Imports for evaluation
-    from eval.final import run_mt_bench_branch_op
+    from eval.final import run_final_eval_op
     from eval.mmlu import load_mmlu_results_op, run_mmlu_op
 
     ## from eval.mmlu import run_mmlu_op, load_mmlu_results_op
@@ -313,16 +313,20 @@ def pipeline(
 
     use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"})
 
-    final_eval_task = run_mt_bench_branch_op(
+    final_eval_task = run_final_eval_op(
         candidate_model=run_mt_bench_task.outputs["best_model"],
         taxonomy=git_clone_task.outputs["taxonomy"],
+        tasks=sdg_task.outputs["sdg"],
         # TODO: DO we need both candidate_branch and base_branch
         base_branch=repo_branch,
         candidate_branch=repo_branch,
         device=device,
         base_model_dir=BASE_MODEL_DIR,
         max_workers=max_workers,
         merge_system_user_message=merge_system_user_message,
+        model_dtype=model_dtype,
+        few_shots=few_shots,
+        batch_size=batch_size,
     )
 
     mount_pvc(
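
The new tasks argument follows the same KFP v2 artifact-passing pattern as the existing taxonomy input: one task's output artifact becomes another component's Input[Dataset]. A minimal self-contained sketch of that pattern (component and pipeline names here are illustrative, not from this repo):

# Sketch: passing a Dataset artifact between KFP v2 components,
# the same mechanism behind tasks=sdg_task.outputs["sdg"] above.
from kfp import dsl
from kfp.dsl import Dataset, Input, Output

@dsl.component
def produce_data(data: Output[Dataset]):
    with open(data.path, "w") as f:
        f.write("example\n")

@dsl.component
def consume_data(data: Input[Dataset]):
    with open(data.path) as f:
        print(f.read())

@dsl.pipeline(name="artifact-passing-demo")
def demo_pipeline():
    producer = produce_data()
    # Same pattern as tasks=sdg_task.outputs["sdg"] in pipeline.py.
    consume_data(data=producer.outputs["data"])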

pipeline.yaml
+86 -17

@@ -423,10 +423,14 @@ components:
           parameterType: STRING
         name:
           parameterType: STRING
-  comp-run-mt-bench-branch-op:
-    executorLabel: exec-run-mt-bench-branch-op
+  comp-run-final-eval-op:
+    executorLabel: exec-run-final-eval-op
     inputDefinitions:
       artifacts:
+        tasks:
+          artifactType:
+            schemaTitle: system.Dataset
+            schemaVersion: 0.0.1
         taxonomy:
           artifactType:
             schemaTitle: system.Dataset
@@ -436,18 +440,28 @@ components:
           parameterType: STRING
         base_model_dir:
           parameterType: STRING
+        batch_size:
+          parameterType: NUMBER_INTEGER
         candidate_branch:
           parameterType: STRING
         candidate_model:
           parameterType: STRING
         device:
           parameterType: STRING
+        few_shots:
+          parameterType: NUMBER_INTEGER
         max_workers:
           parameterType: STRING
         merge_system_user_message:
           parameterType: BOOLEAN
+        model_dtype:
+          parameterType: STRING
     outputDefinitions:
       artifacts:
+        mmlu_branch_output:
+          artifactType:
+            schemaTitle: system.Artifact
+            schemaVersion: 0.0.1
         mt_bench_branch_output:
           artifactType:
             schemaTitle: system.Artifact
@@ -934,13 +948,13 @@ deploymentSpec:
           \ claimName: {output_pvc_name}\n \"\"\"\n\
           \ )\n\n return Outputs(manifest, name)\n\n"
         image: registry.access.redhat.com/ubi9/python-311:latest
-    exec-run-mt-bench-branch-op:
+    exec-run-final-eval-op:
       container:
         args:
         - --executor_input
         - '{{$}}'
         - --function_to_execute
-        - run_mt_bench_branch_op
+        - run_final_eval_op
        command:
        - sh
        - -c
@@ -961,15 +975,17 @@ deploymentSpec:
 
           '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
-          \ *\n\ndef run_mt_bench_branch_op(\n mt_bench_branch_output: Output[Artifact],\n\
-          \ candidate_model: str,\n base_model_dir: str,\n taxonomy: Input[Dataset],\n\
+          \ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\
+          \ mt_bench_branch_output: Output[Artifact],\n candidate_model: str,\n\
+          \ base_model_dir: str,\n tasks: Input[Dataset],\n taxonomy: Input[Dataset],\n\
          \ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\
-          \ device: str,\n merge_system_user_message: bool,\n):\n import\
-          \ json\n import os\n\n import torch\n from helpers import (\n \
-          \ VLLM_SERVER,\n launch_vllm,\n stop_vllm,\n )\n\
-          \ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
-          \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\
-          \n ######################################################################\n\
+          \ device: str,\n model_dtype: str,\n few_shots: int,\n batch_size:\
+          \ int,\n merge_system_user_message: bool,\n):\n import json\n import\
+          \ os\n\n import torch\n from helpers import (\n VLLM_SERVER,\n\
+          \ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mmlu\
+          \ import MMLU_TASKS, MMLUBranchEvaluator\n from instructlab.eval.mt_bench\
+          \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\
+          \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n ######################################################################\n\
          \ # branch_eval_summary_to_json creates a json object from output of\
          \ instructlab/eval\n # TODO: Add this to the instructlab/eval or instructlab/instructlab\
          \ repository\n def branch_eval_summary_to_json(\n improvements:\
@@ -1068,7 +1084,49 @@ deploymentSpec:
          ,\n \"overall_score\": overall_score,\n \"base_overall_score\"\
          : base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\
          : summary,\n }\n\n with open(mt_bench_branch_output.path, \"w\") as\
-          \ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n"
+          \ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n # MMLU_BRANCH\n\
+          \n # find_node_dataset_directories to find sdg output node_datasets_*\n\
+          \ def find_node_dataset_directories(base_directory: str):\n import\
+          \ os\n import re\n\n # This is specific to ilab/eval output\n\
+          \ pattern = r\"node_datasets_\"\n matching_dirs = []\n \
+          \ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\
+          \ for directory in dirs:\n if regex.search(directory):\n\
+          \ matching_dirs.append(os.path.join(root, directory))\n\
+          \n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \
+          \ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n if\
+          \ node_dataset_dirs:\n tasks_dir = node_dataset_dirs[0]\n\n \
+          \ mmlu_branch_evaluators = [\n MMLUBranchEvaluator(\n \
+          \ model_path=candidate_model,\n tasks_dir=tasks_dir,\n\
+          \ tasks=mmlu_tasks,\n few_shots=few_shots,\n\
+          \ batch_size=batch_size,\n ),\n MMLUBranchEvaluator(\n\
+          \ model_path=base_model,\n tasks_dir=tasks_dir,\n\
+          \ tasks=mmlu_tasks,\n few_shots=few_shots,\n\
+          \ batch_size=batch_size,\n ),\n ]\n \
+          \ m_paths = [candidate_model, base_model]\n overall_scores =\
+          \ []\n individual_scores_list = []\n for i, evaluator in enumerate(mmlu_branch_evaluators):\n\
+          \ m_path = m_paths[i]\n launch_local_vllm(m_path,\
+          \ gpu_count)\n overall_score, individual_scores = evaluator.run(VLLM_SERVER)\n\
+          \ overall_scores.append(overall_score)\n individual_scores_list.append(individual_scores)\n\
+          \ stop_local_vllm()\n\n overall_score = overall_scores[0]\n\
+          \ base_overall_score = overall_scores[1]\n individual_scores\
+          \ = individual_scores_list[0]\n base_individual_scores = individual_scores_list[1]\n\
+          \n improvements, regressions, no_changes = [], [], []\n for\
+          \ task, score in individual_scores.items():\n base_score = base_individual_scores[task]\n\
+          \ s = score[\"score\"]\n b_s = base_score[\"score\"\
+          ]\n d = round(s - b_s, 2)\n if s > b_s:\n \
+          \ improvements.append((task, d, b_s, s))\n elif b_s >\
+          \ s:\n regressions.append((task, d, b_s, s))\n \
+          \ else:\n no_changes.append((task, s))\n\n summary\
+          \ = branch_eval_summary_to_json(\n improvements,\n \
+          \ regressions,\n no_changes,\n )\n\n mmlu_branch_data\
+          \ = {\n \"report_title\": \"KNOWLEDGE EVALUATION REPORT\",\n\
+          \ \"max_score\": \"1.0\",\n \"model\": candidate_model,\n\
+          \ \"model_score\": round(overall_score, 2),\n \"base_model\"\
+          : base_model,\n \"base_model_score\": round(base_overall_score,\
+          \ 2),\n \"summary\": summary,\n }\n\n with open(mmlu_branch_output.path,\
+          \ \"w\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
+          \ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
+          \ evaluation.\")\n\n"
         image: quay.io/sallyom/instructlab-ocp:eval-10-8
         resources:
           accelerator:
@@ -1593,18 +1651,23 @@ root:
                 constant: second
         taskInfo:
           name: pytorchjob-manifest-op-2
-      run-mt-bench-branch-op:
+      run-final-eval-op:
         cachingOptions:
           enableCache: true
         componentRef:
-          name: comp-run-mt-bench-branch-op
+          name: comp-run-final-eval-op
         dependentTasks:
         - createpvc
        - createpvc-3
        - git-clone-op
        - run-mt-bench-op
+        - sdg-op
        inputs:
          artifacts:
+            tasks:
+              taskOutputArtifact:
+                outputArtifactKey: sdg
+                producerTask: sdg-op
            taxonomy:
              taskOutputArtifact:
                outputArtifactKey: taxonomy
@@ -1615,6 +1678,8 @@ root:
            base_model_dir:
              runtimeValue:
                constant: /model/model
+            batch_size:
+              componentInputParameter: batch_size
            candidate_branch:
              componentInputParameter: repo_branch
            candidate_model:
@@ -1623,12 +1688,16 @@ root:
              producerTask: run-mt-bench-op
            device:
              componentInputParameter: device
+            few_shots:
+              componentInputParameter: few_shots
            max_workers:
              componentInputParameter: max_workers
            merge_system_user_message:
              componentInputParameter: merge_system_user_message
+            model_dtype:
+              componentInputParameter: model_dtype
        taskInfo:
-          name: run-mt-bench-branch-op
+          name: run-final-eval-op
      run-mt-bench-op:
        cachingOptions: {}
        componentRef:
@@ -1773,7 +1842,7 @@ platforms:
           taskOutputParameter:
             outputParameterKey: name
             producerTask: createpvc-3
-      exec-run-mt-bench-branch-op:
+      exec-run-final-eval-op:
        configMapAsEnv:
        - configMapName: kfp-model-server
          keyToEnv:
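
pipeline.yaml is the compiled form of pipeline.py, so every rename and new parameter above simply mirrors the Python diff. A sketch of regenerating it with the KFP v2 compiler (the import path for the pipeline function is assumed, not taken from this repo):

# Recompile the pipeline definition into pipeline.yaml (KFP v2).
from kfp import compiler
from pipeline import pipeline  # assumed entry point; adjust to this repo's layout

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="pipeline.yaml",
)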

sdg/faked/components.py
+2 -1

@@ -27,10 +27,11 @@ def git_clone_op(
 )
 
 
+# TODO: Update once merged into main
 @dsl.component(
     base_image=PYTHON_IMAGE,
     packages_to_install=[
-        "git+https://github.com/redhat-et/ilab-on-ocp.git#subdirectory=sdg/faked/fixtures"
+        "git+https://github.com/sallyom/ilab-on-ocp.git@final-mmlu-branch#subdirectory=sdg/faked/fixtures"
     ],
 )
 def sdg_op(
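
The @final-mmlu-branch suffix temporarily pins the faked-SDG fixtures to the contributor's fork until the change merges (hence the TODO). As a general pattern, packages_to_install accepts any pip requirement string, including git URLs pinned to a ref; a sketch with placeholder names:

# Sketch: pinning a KFP component dependency to a git branch.
# The repo URL, branch, and subdirectory below are placeholders.
from kfp import dsl

@dsl.component(
    base_image="registry.access.redhat.com/ubi9/python-311:latest",
    packages_to_install=[
        # pip supports git+https URLs with an @<ref> pin and an
        # optional #subdirectory= fragment for packages in subfolders.
        "git+https://github.com/example-org/example-repo.git@my-branch#subdirectory=pkg"
    ],
)
def example_op() -> str:
    return "ok"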
New file (+12 -0)

@@ -0,0 +1,12 @@
+task: mmlu_pr
+dataset_path: json
+dataset_name: null
+test_split: test
+doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+output_type: multiple_choice
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
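
This is an lm-evaluation-harness task config: dataset_path: json means the task loads JSON/JSONL files from the tasks directory, doc_to_text renders each row's question and choices as a four-option prompt, and doc_to_target picks the gold answer index. A hypothetical row matching those field names, written as test-split JSONL:

# Sketch of the row shape the mmlu_pr task above expects.
# Field names come from doc_to_text/doc_to_target; the content
# and output file name are made up for illustration.
import json

row = {
    "question": "Which server hosts the models during final evaluation?",
    "choices": ["vLLM", "cron", "rsync", "ftp"],
    "answer": 0,  # index into choices, rendered as A-D
}

with open("test.jsonl", "w") as f:
    f.write(json.dumps(row) + "\n")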
