Add final eval mmlu_branch
sallyom committed Oct 8, 2024
1 parent 2f7d51e commit f5eeed4
Showing 8 changed files with 239 additions and 23 deletions.
4 changes: 2 additions & 2 deletions eval/final/__init__.py
@@ -1,5 +1,5 @@
from .components import run_mt_bench_branch_op
from .components import run_final_eval_op

# from . import faked

__all__ = ["run_mt_bench_branch_op"]
__all__ = ["run_final_eval_op"]
99 changes: 98 additions & 1 deletion eval/final/components.py
@@ -14,15 +14,20 @@
"vllm",
],
)
def run_mt_bench_branch_op(
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
candidate_model: str,
base_model_dir: str,
tasks: Input[Dataset],
taxonomy: Input[Dataset],
base_branch: str,
candidate_branch: str,
max_workers: str,
device: str,
model_dtype: str,
few_shots: int,
batch_size: int,
merge_system_user_message: bool,
):
import json
@@ -34,6 +39,7 @@ def run_mt_bench_branch_op(
launch_vllm,
stop_vllm,
)
from instructlab.eval.mmlu import MMLU_TASKS, MMLUBranchEvaluator
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

@@ -241,3 +247,94 @@ def branch_eval_summary_to_json(

with open(mt_bench_branch_output.path, "w") as f:
json.dump(mt_bench_branch_data, f, indent=4)

# MMLU_BRANCH

# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_directory: str):
import os
import re

# This is specific to ilab/eval output
pattern = r"node_datasets_"
matching_dirs = []
regex = re.compile(pattern)

for root, dirs, files in os.walk(base_directory):
for directory in dirs:
if regex.search(directory):
matching_dirs.append(os.path.join(root, directory))

return matching_dirs

mmlu_tasks = ["mmlu_pr"]

node_dataset_dirs = find_node_dataset_directories(tasks.path)
if node_dataset_dirs:
tasks_dir = node_dataset_dirs[0]

mmlu_branch_evaluators = [
MMLUBranchEvaluator(
model_path=candidate_model,
tasks_dir=tasks_dir,
tasks=mmlu_tasks,
few_shots=few_shots,
batch_size=batch_size,
),
MMLUBranchEvaluator(
model_path=base_model,
tasks_dir=tasks_dir,
tasks=mmlu_tasks,
few_shots=few_shots,
batch_size=batch_size,
),
]
m_paths = [candidate_model, base_model]
overall_scores = []
individual_scores_list = []
for i, evaluator in enumerate(mmlu_branch_evaluators):
m_path = m_paths[i]
launch_local_vllm(m_path, gpu_count)
overall_score, individual_scores = evaluator.run(VLLM_SERVER)
overall_scores.append(overall_score)
individual_scores_list.append(individual_scores)
stop_local_vllm()

overall_score = overall_scores[0]
base_overall_score = overall_scores[1]
individual_scores = individual_scores_list[0]
base_individual_scores = individual_scores_list[1]

improvements, regressions, no_changes = [], [], []
for task, score in individual_scores.items():
base_score = base_individual_scores[task]
s = score["score"]
b_s = base_score["score"]
d = round(s - b_s, 2)
if s > b_s:
improvements.append((task, d, b_s, s))
elif b_s > s:
regressions.append((task, d, b_s, s))
else:
no_changes.append((task, s))

summary = branch_eval_summary_to_json(
improvements,
regressions,
no_changes,
)

mmlu_branch_data = {
"report_title": "KNOWLEDGE EVALUATION REPORT",
"max_score": "1.0",
"model": candidate_model,
"model_score": round(overall_score, 2),
"base_model": base_model,
"base_model_score": round(base_overall_score, 2),
"summary": summary,
}

with open(mmlu_branch_output.path, "w") as f:
json.dump(mmlu_branch_data, f, indent=4)
else:
print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")
8 changes: 6 additions & 2 deletions pipeline.py
@@ -65,7 +65,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
)

# Imports for evaluation
from eval.final import run_mt_bench_branch_op
from eval.final import run_final_eval_op
from eval.mmlu import load_mmlu_results_op, run_mmlu_op

## from eval.mmlu import run_mmlu_op, load_mmlu_results_op
@@ -313,16 +313,20 @@ def pipeline(

use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"})

final_eval_task = run_mt_bench_branch_op(
final_eval_task = run_final_eval_op(
candidate_model=run_mt_bench_task.outputs["best_model"],
taxonomy=git_clone_task.outputs["taxonomy"],
tasks=sdg_task.outputs["sdg"],
# TODO: Do we need both candidate_branch and base_branch?
base_branch=repo_branch,
candidate_branch=repo_branch,
device=device,
base_model_dir=BASE_MODEL_DIR,
max_workers=max_workers,
merge_system_user_message=merge_system_user_message,
model_dtype=model_dtype,
few_shots=few_shots,
batch_size=batch_size,
)

mount_pvc(
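
The new arguments passed to run_final_eval_op here (model_dtype, few_shots, batch_size) are expected to be parameters of the enclosing pipeline(...) definition; the compiled pipeline.yaml below binds them via componentInputParameter. A rough sketch of that portion of the signature, with illustrative defaults that are not taken from this commit:

    # Signature sketch only; in pipeline.py this function carries the @dsl.pipeline
    # decorator and builds all of the tasks shown in this diff.
    def pipeline(
        # ...existing parameters (repo_branch, device, max_workers, ...) elided...
        model_dtype: str = "bfloat16",  # illustrative default
        few_shots: int = 5,             # illustrative default
        batch_size: int = 8,            # illustrative default
    ):
        ...
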
103 changes: 86 additions & 17 deletions pipeline.yaml
@@ -423,10 +423,14 @@ components:
parameterType: STRING
name:
parameterType: STRING
comp-run-mt-bench-branch-op:
executorLabel: exec-run-mt-bench-branch-op
comp-run-final-eval-op:
executorLabel: exec-run-final-eval-op
inputDefinitions:
artifacts:
tasks:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
taxonomy:
artifactType:
schemaTitle: system.Dataset
@@ -436,18 +440,28 @@
parameterType: STRING
base_model_dir:
parameterType: STRING
batch_size:
parameterType: NUMBER_INTEGER
candidate_branch:
parameterType: STRING
candidate_model:
parameterType: STRING
device:
parameterType: STRING
few_shots:
parameterType: NUMBER_INTEGER
max_workers:
parameterType: STRING
merge_system_user_message:
parameterType: BOOLEAN
model_dtype:
parameterType: STRING
outputDefinitions:
artifacts:
mmlu_branch_output:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
mt_bench_branch_output:
artifactType:
schemaTitle: system.Artifact
@@ -934,13 +948,13 @@ deploymentSpec:
\ claimName: {output_pvc_name}\n \"\"\"\n\
\ )\n\n return Outputs(manifest, name)\n\n"
image: registry.access.redhat.com/ubi9/python-311:latest
exec-run-mt-bench-branch-op:
exec-run-final-eval-op:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- run_mt_bench_branch_op
- run_final_eval_op
command:
- sh
- -c
@@ -961,15 +975,17 @@
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_mt_bench_branch_op(\n mt_bench_branch_output: Output[Artifact],\n\
\ candidate_model: str,\n base_model_dir: str,\n taxonomy: Input[Dataset],\n\
\ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\
\ mt_bench_branch_output: Output[Artifact],\n candidate_model: str,\n\
\ base_model_dir: str,\n tasks: Input[Dataset],\n taxonomy: Input[Dataset],\n\
\ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\
\ device: str,\n merge_system_user_message: bool,\n):\n import\
\ json\n import os\n\n import torch\n from helpers import (\n \
\ VLLM_SERVER,\n launch_vllm,\n stop_vllm,\n )\n\
\ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
\ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\
\n ######################################################################\n\
\ device: str,\n model_dtype: str,\n few_shots: int,\n batch_size:\
\ int,\n merge_system_user_message: bool,\n):\n import json\n import\
\ os\n\n import torch\n from helpers import (\n VLLM_SERVER,\n\
\ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mmlu\
\ import MMLU_TASKS, MMLUBranchEvaluator\n from instructlab.eval.mt_bench\
\ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\
\ qa_pairs_to_qna_to_avg_scores, sort_score\n\n ######################################################################\n\
\ # branch_eval_summary_to_json creates a json object from output of\
\ instructlab/eval\n # TODO: Add this to the instructlab/eval or instructlab/instructlab\
\ repository\n def branch_eval_summary_to_json(\n improvements:\
@@ -1068,7 +1084,49 @@ deploymentSpec:
,\n \"overall_score\": overall_score,\n \"base_overall_score\"\
: base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\
: summary,\n }\n\n with open(mt_bench_branch_output.path, \"w\") as\
\ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n"
\ f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n # MMLU_BRANCH\n\
\n # find_node_dataset_directories to find sdg output node_datasets_*\n\
\ def find_node_dataset_directories(base_directory: str):\n import\
\ os\n import re\n\n # This is specific to ilab/eval output\n\
\ pattern = r\"node_datasets_\"\n matching_dirs = []\n \
\ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\
\ for directory in dirs:\n if regex.search(directory):\n\
\ matching_dirs.append(os.path.join(root, directory))\n\
\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \
\ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n if\
\ node_dataset_dirs:\n tasks_dir = node_dataset_dirs[0]\n\n \
\ mmlu_branch_evaluators = [\n MMLUBranchEvaluator(\n \
\ model_path=candidate_model,\n tasks_dir=tasks_dir,\n\
\ tasks=mmlu_tasks,\n few_shots=few_shots,\n\
\ batch_size=batch_size,\n ),\n MMLUBranchEvaluator(\n\
\ model_path=base_model,\n tasks_dir=tasks_dir,\n\
\ tasks=mmlu_tasks,\n few_shots=few_shots,\n\
\ batch_size=batch_size,\n ),\n ]\n \
\ m_paths = [candidate_model, base_model]\n overall_scores =\
\ []\n individual_scores_list = []\n for i, evaluator in enumerate(mmlu_branch_evaluators):\n\
\ m_path = m_paths[i]\n launch_local_vllm(m_path,\
\ gpu_count)\n overall_score, individual_scores = evaluator.run(VLLM_SERVER)\n\
\ overall_scores.append(overall_score)\n individual_scores_list.append(individual_scores)\n\
\ stop_local_vllm()\n\n overall_score = overall_scores[0]\n\
\ base_overall_score = overall_scores[1]\n individual_scores\
\ = individual_scores_list[0]\n base_individual_scores = individual_scores_list[1]\n\
\n improvements, regressions, no_changes = [], [], []\n for\
\ task, score in individual_scores.items():\n base_score = base_individual_scores[task]\n\
\ s = score[\"score\"]\n b_s = base_score[\"score\"\
]\n d = round(s - b_s, 2)\n if s > b_s:\n \
\ improvements.append((task, d, b_s, s))\n elif b_s >\
\ s:\n regressions.append((task, d, b_s, s))\n \
\ else:\n no_changes.append((task, s))\n\n summary\
\ = branch_eval_summary_to_json(\n improvements,\n \
\ regressions,\n no_changes,\n )\n\n mmlu_branch_data\
\ = {\n \"report_title\": \"KNOWLEDGE EVALUATION REPORT\",\n\
\ \"max_score\": \"1.0\",\n \"model\": candidate_model,\n\
\ \"model_score\": round(overall_score, 2),\n \"base_model\"\
: base_model,\n \"base_model_score\": round(base_overall_score,\
\ 2),\n \"summary\": summary,\n }\n\n with open(mmlu_branch_output.path,\
\ \"w\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
\ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ evaluation.\")\n\n"
image: quay.io/sallyom/instructlab-ocp:eval-10-8
resources:
accelerator:
@@ -1593,18 +1651,23 @@ root:
constant: second
taskInfo:
name: pytorchjob-manifest-op-2
run-mt-bench-branch-op:
run-final-eval-op:
cachingOptions:
enableCache: true
componentRef:
name: comp-run-mt-bench-branch-op
name: comp-run-final-eval-op
dependentTasks:
- createpvc
- createpvc-3
- git-clone-op
- run-mt-bench-op
- sdg-op
inputs:
artifacts:
tasks:
taskOutputArtifact:
outputArtifactKey: sdg
producerTask: sdg-op
taxonomy:
taskOutputArtifact:
outputArtifactKey: taxonomy
@@ -1615,6 +1678,8 @@ root:
base_model_dir:
runtimeValue:
constant: /model/model
batch_size:
componentInputParameter: batch_size
candidate_branch:
componentInputParameter: repo_branch
candidate_model:
@@ -1623,12 +1688,16 @@
producerTask: run-mt-bench-op
device:
componentInputParameter: device
few_shots:
componentInputParameter: few_shots
max_workers:
componentInputParameter: max_workers
merge_system_user_message:
componentInputParameter: merge_system_user_message
model_dtype:
componentInputParameter: model_dtype
taskInfo:
name: run-mt-bench-branch-op
name: run-final-eval-op
run-mt-bench-op:
cachingOptions: {}
componentRef:
@@ -1773,7 +1842,7 @@ platforms:
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc-3
exec-run-mt-bench-branch-op:
exec-run-final-eval-op:
configMapAsEnv:
- configMapName: kfp-model-server
keyToEnv:
3 changes: 2 additions & 1 deletion sdg/faked/components.py
@@ -27,10 +27,11 @@ def git_clone_op(
)


# TODO: Update once merged into main
@dsl.component(
base_image=PYTHON_IMAGE,
packages_to_install=[
"git+https://github.com/redhat-et/ilab-on-ocp.git#subdirectory=sdg/faked/fixtures"
"git+https://github.com/sallyom/ilab-on-ocp.git@final-mmlu-branch#subdirectory=sdg/faked/fixtures"
],
)
def sdg_op(
@@ -0,0 +1,12 @@
task: mmlu_pr
dataset_path: json
dataset_name: null
test_split: test
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
output_type: multiple_choice
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
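
The new task file is an lm-eval-harness-style config defining the custom mmlu_pr task that the MMLUBranchEvaluator above consumes. As a rough illustration of how doc_to_text turns one dataset record into a multiple-choice prompt (the record below is fabricated, and treating answer as an index into choices is an assumption about the SDG output format):

    # Fabricated example record; real records come from the SDG node_datasets_* output.
    doc = {
        "question": "Which op writes the MMLU branch report?",
        "choices": ["git-clone-op", "run_final_eval_op", "sdg-op", "run-mt-bench-op"],
        "answer": 1,  # assumed to index into choices, i.e. "B"
    }

    # Rendering equivalent to the doc_to_text template above.
    prompt = (
        f"{doc['question'].strip()}\n"
        f"A. {doc['choices'][0]}\n"
        f"B. {doc['choices'][1]}\n"
        f"C. {doc['choices'][2]}\n"
        f"D. {doc['choices'][3]}\n"
        "Answer:"
    )
    print(prompt)
    # The model is scored on choosing among "A"-"D" (doc_to_choice); accuracy is
    # aggregated with mean, and higher is better, per the metric_list above.
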