@@ -423,10 +423,14 @@ components:
423
423
parameterType : STRING
424
424
name :
425
425
parameterType : STRING
426
- comp-run-mt-bench-branch -op :
427
- executorLabel : exec-run-mt-bench-branch -op
426
+ comp-run-final-eval -op :
427
+ executorLabel : exec-run-final-eval -op
428
428
inputDefinitions :
429
429
artifacts :
430
+ tasks :
431
+ artifactType :
432
+ schemaTitle : system.Dataset
433
+ schemaVersion : 0.0.1
430
434
taxonomy :
431
435
artifactType :
432
436
schemaTitle : system.Dataset
@@ -436,18 +440,28 @@ components:
436
440
parameterType : STRING
437
441
base_model_dir :
438
442
parameterType : STRING
443
+ batch_size :
444
+ parameterType : NUMBER_INTEGER
439
445
candidate_branch :
440
446
parameterType : STRING
441
447
candidate_model :
442
448
parameterType : STRING
443
449
device :
444
450
parameterType : STRING
451
+ few_shots :
452
+ parameterType : NUMBER_INTEGER
445
453
max_workers :
446
454
parameterType : STRING
447
455
merge_system_user_message :
448
456
parameterType : BOOLEAN
457
+ model_dtype :
458
+ parameterType : STRING
449
459
outputDefinitions :
450
460
artifacts :
461
+ mmlu_branch_output :
462
+ artifactType :
463
+ schemaTitle : system.Artifact
464
+ schemaVersion : 0.0.1
451
465
mt_bench_branch_output :
452
466
artifactType :
453
467
schemaTitle : system.Artifact
@@ -934,13 +948,13 @@ deploymentSpec:
934
948
\ claimName: {output_pvc_name}\n \"\"\"\n \
935
949
\ )\n\n return Outputs(manifest, name)\n\n "
936
950
image : registry.access.redhat.com/ubi9/python-311:latest
937
- exec-run-mt-bench-branch -op :
951
+ exec-run-final-eval -op :
938
952
container :
939
953
args :
940
954
- --executor_input
941
955
- ' {{$}}'
942
956
- --function_to_execute
943
- - run_mt_bench_branch_op
957
+ - run_final_eval_op
944
958
command :
945
959
- sh
946
960
- -c
@@ -961,15 +975,17 @@ deploymentSpec:
961
975
962
976
'
963
977
- " \n import kfp\n from kfp import dsl\n from kfp.dsl import *\n from typing import\
964
- \ *\n\n def run_mt_bench_branch_op(\n mt_bench_branch_output: Output[Artifact],\n \
965
- \ candidate_model: str,\n base_model_dir: str,\n taxonomy: Input[Dataset],\n \
978
+ \ *\n\n def run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n \
979
+ \ mt_bench_branch_output: Output[Artifact],\n candidate_model: str,\n \
980
+ \ base_model_dir: str,\n tasks: Input[Dataset],\n taxonomy: Input[Dataset],\n \
966
981
\ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n \
967
- \ device: str,\n merge_system_user_message: bool,\n ):\n import\
968
- \ json\n import os\n\n import torch\n from helpers import (\n \
969
- \ VLLM_SERVER,\n launch_vllm,\n stop_vllm,\n )\n \
970
- \ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
971
- \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n \
972
- \n ######################################################################\n \
982
+ \ device: str,\n model_dtype: str,\n few_shots: int,\n batch_size:\
983
+ \ int,\n merge_system_user_message: bool,\n ):\n import json\n import\
984
+ \ os\n\n import torch\n from helpers import (\n VLLM_SERVER,\n \
985
+ \ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mmlu\
986
+ \ import MMLU_TASKS, MMLUBranchEvaluator\n from instructlab.eval.mt_bench\
987
+ \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\
988
+ \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n ######################################################################\n \
973
989
\ # branch_eval_summary_to_json creates a json object from output of\
974
990
\ instructlab/eval\n # TODO: Add this to the instructlab/eval or instructlab/instructlab\
975
991
\ repository\n def branch_eval_summary_to_json(\n improvements:\
@@ -1068,7 +1084,49 @@ deploymentSpec:
1068
1084
,\n \" overall_score\" : overall_score,\n \" base_overall_score\" \
1069
1085
: base_overall_score,\n \" error_rate\" : error_rate,\n \" summary\" \
1070
1086
: summary,\n }\n\n with open(mt_bench_branch_output.path, \" w\" ) as\
1071
- \ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n "
1087
+ \ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n # MMLU_BRANCH\n \
1088
+ \n # find_node_dataset_directories to find sdg output node_datasets_*\n \
1089
+ \ def find_node_dataset_directories(base_directory: str):\n import\
1090
+ \ os\n import re\n\n # This is specific to ilab/eval output\n \
1091
+ \ pattern = r\" node_datasets_\"\n matching_dirs = []\n \
1092
+ \ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n \
1093
+ \ for directory in dirs:\n if regex.search(directory):\n \
1094
+ \ matching_dirs.append(os.path.join(root, directory))\n \
1095
+ \n return matching_dirs\n\n mmlu_tasks = [\" mmlu_pr\" ]\n\n \
1096
+ \ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n if\
1097
+ \ node_dataset_dirs:\n tasks_dir = node_dataset_dirs[0]\n\n \
1098
+ \ mmlu_branch_evaluators = [\n MMLUBranchEvaluator(\n \
1099
+ \ model_path=candidate_model,\n tasks_dir=tasks_dir,\n \
1100
+ \ tasks=mmlu_tasks,\n few_shots=few_shots,\n \
1101
+ \ batch_size=batch_size,\n ),\n MMLUBranchEvaluator(\n \
1102
+ \ model_path=base_model,\n tasks_dir=tasks_dir,\n \
1103
+ \ tasks=mmlu_tasks,\n few_shots=few_shots,\n \
1104
+ \ batch_size=batch_size,\n ),\n ]\n \
1105
+ \ m_paths = [candidate_model, base_model]\n overall_scores =\
1106
+ \ []\n individual_scores_list = []\n for i, evaluator in enumerate(mmlu_branch_evaluators):\n \
1107
+ \ m_path = m_paths[i]\n launch_local_vllm(m_path,\
1108
+ \ gpu_count)\n overall_score, individual_scores = evaluator.run(VLLM_SERVER)\n \
1109
+ \ overall_scores.append(overall_score)\n individual_scores_list.append(individual_scores)\n \
1110
+ \ stop_local_vllm()\n\n overall_score = overall_scores[0]\n \
1111
+ \ base_overall_score = overall_scores[1]\n individual_scores\
1112
+ \ = individual_scores_list[0]\n base_individual_scores = individual_scores_list[1]\n \
1113
+ \n improvements, regressions, no_changes = [], [], []\n for\
1114
+ \ task, score in individual_scores.items():\n base_score = base_individual_scores[task]\n \
1115
+ \ s = score[\" score\" ]\n b_s = base_score[\" score\" \
1116
+ ]\n d = round(s - b_s, 2)\n if s > b_s:\n \
1117
+ \ improvements.append((task, d, b_s, s))\n elif b_s >\
1118
+ \ s:\n regressions.append((task, d, b_s, s))\n \
1119
+ \ else:\n no_changes.append((task, s))\n\n summary\
1120
+ \ = branch_eval_summary_to_json(\n improvements,\n \
1121
+ \ regressions,\n no_changes,\n )\n\n mmlu_branch_data\
1122
+ \ = {\n \" report_title\" : \" KNOWLEDGE EVALUATION REPORT\" ,\n \
1123
+ \ \" max_score\" : \" 1.0\" ,\n \" model\" : candidate_model,\n \
1124
+ \ \" model_score\" : round(overall_score, 2),\n \" base_model\" \
1125
+ : base_model,\n \" base_model_score\" : round(base_overall_score,\
1126
+ \ 2),\n \" summary\" : summary,\n }\n\n with open(mmlu_branch_output.path,\
1127
+ \ \" w\" ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
1128
+ \ else:\n print(\" No MMLU tasks directories found, skipping MMLU_branch\
1129
+ \ evaluation.\" )\n\n "
1072
1130
image : quay.io/sallyom/instructlab-ocp:eval-10-8
1073
1131
resources :
1074
1132
accelerator :
@@ -1593,18 +1651,23 @@ root:
1593
1651
constant : second
1594
1652
taskInfo :
1595
1653
name : pytorchjob-manifest-op-2
1596
- run-mt-bench-branch -op :
1654
+ run-final-eval -op :
1597
1655
cachingOptions :
1598
1656
enableCache : true
1599
1657
componentRef :
1600
- name : comp-run-mt-bench-branch -op
1658
+ name : comp-run-final-eval -op
1601
1659
dependentTasks :
1602
1660
- createpvc
1603
1661
- createpvc-3
1604
1662
- git-clone-op
1605
1663
- run-mt-bench-op
1664
+ - sdg-op
1606
1665
inputs :
1607
1666
artifacts :
1667
+ tasks :
1668
+ taskOutputArtifact :
1669
+ outputArtifactKey : sdg
1670
+ producerTask : sdg-op
1608
1671
taxonomy :
1609
1672
taskOutputArtifact :
1610
1673
outputArtifactKey : taxonomy
@@ -1615,6 +1678,8 @@ root:
1615
1678
base_model_dir :
1616
1679
runtimeValue :
1617
1680
constant : /model/model
1681
+ batch_size :
1682
+ componentInputParameter : batch_size
1618
1683
candidate_branch :
1619
1684
componentInputParameter : repo_branch
1620
1685
candidate_model :
@@ -1623,12 +1688,16 @@ root:
1623
1688
producerTask : run-mt-bench-op
1624
1689
device :
1625
1690
componentInputParameter : device
1691
+ few_shots :
1692
+ componentInputParameter : few_shots
1626
1693
max_workers :
1627
1694
componentInputParameter : max_workers
1628
1695
merge_system_user_message :
1629
1696
componentInputParameter : merge_system_user_message
1697
+ model_dtype :
1698
+ componentInputParameter : model_dtype
1630
1699
taskInfo :
1631
- name : run-mt-bench-branch -op
1700
+ name : run-final-eval -op
1632
1701
run-mt-bench-op :
1633
1702
cachingOptions : {}
1634
1703
componentRef :
@@ -1773,7 +1842,7 @@ platforms:
1773
1842
taskOutputParameter :
1774
1843
outputParameterKey : name
1775
1844
producerTask : createpvc-3
1776
- exec-run-mt-bench-branch -op :
1845
+ exec-run-final-eval -op :
1777
1846
configMapAsEnv :
1778
1847
- configMapName : kfp-model-server
1779
1848
keyToEnv :
0 commit comments