Commit
Merge pull request #241 from MichaelClifford/metrics
Add additional step to pipeline to generate a metrics report
HumairAK authored Feb 6, 2025
2 parents 684926f + c556967 commit 2189ef2
Showing 7 changed files with 372 additions and 109 deletions.
4 changes: 2 additions & 2 deletions eval/__init__.py
@@ -1,4 +1,4 @@
from .final import run_final_eval_op
from .final import generate_metrics_report_op, run_final_eval_op
from .mt_bench import run_mt_bench_op

__all__ = ["run_final_eval_op", "run_mt_bench_op"]
__all__ = ["run_final_eval_op", "run_mt_bench_op", "generate_metrics_report_op"]
61 changes: 52 additions & 9 deletions eval/final.py
@@ -1,15 +1,13 @@
# type: ignore
# pylint: disable=import-outside-toplevel,import-error

from kfp.dsl import Artifact, Output, component
from kfp.dsl import Metrics, Output, component

from utils.consts import RHELAI_IMAGE
from utils.consts import PYTHON_IMAGE, RHELAI_IMAGE


@component(base_image=RHELAI_IMAGE, install_kfp_package=False)
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
base_model_dir: str,
base_branch: str,
candidate_branch: str,
@@ -20,10 +18,13 @@ def run_final_eval_op(
candidate_model: str = None,
taxonomy_path: str = "/input/taxonomy",
sdg_path: str = "/input/sdg",
mmlu_branch_output_path: str = "/output/mmlu_branch",
mt_bench_branch_output_path: str = "/output/mt_bench_branch",
):
import json
import os
import subprocess
from pathlib import Path

import httpx
import torch
@@ -320,13 +321,18 @@ def find_node_dataset_directories(base_dir: str):
"report_title": "KNOWLEDGE EVALUATION REPORT",
"max_score": "1.0",
"model": candidate_model,
"model_score": round(overall_score, 2),
"trained_model_score": round(overall_score, 2),
"base_model": base_model_dir,
"base_model_score": round(base_overall_score, 2),
"summary": summary,
}

with open(mmlu_branch_output.path, "w", encoding="utf-8") as f:
if not os.path.exists(mmlu_branch_output_path):
os.makedirs(mmlu_branch_output_path)
mmlu_branch_output_file = (
Path(mmlu_branch_output_path) / "mmlu_branch_data.json"
)
with open(mmlu_branch_output_file, "w", encoding="utf-8") as f:
json.dump(mmlu_branch_data, f, indent=4)
else:
print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")
@@ -464,11 +470,48 @@ def find_node_dataset_directories(base_dir: str):
"model": candidate_model,
"judge_model": judge_model_name,
"max_score": "10.0",
"overall_score": overall_score,
"base_overall_score": base_overall_score,
"trained_model_score": overall_score,
"base_model_score": base_overall_score,
"error_rate": error_rate,
"summary": summary,
}

with open(mt_bench_branch_output.path, "w", encoding="utf-8") as f:
if not os.path.exists(mt_bench_branch_output_path):
os.makedirs(mt_bench_branch_output_path)
mt_bench_branch_data_file = (
Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json"
)
with open(
mt_bench_branch_data_file,
"w",
encoding="utf-8",
) as f:
json.dump(mt_bench_branch_data, f, indent=4)


@component(base_image=PYTHON_IMAGE, install_kfp_package=False)
def generate_metrics_report_op(
metrics: Output[Metrics],
):
import json

reports = {
"mt_bench": "/output/mt_bench_data.json",
"mt_bench_branch": "/output/mt_bench_branch/mt_bench_branch_data.json",
"mmlu_branch": "/output/mmlu_branch/mmlu_branch_data.json",
}

for report, file_name in reports.items():
with open(file_name, "r", encoding="utf-8") as f:
report_data = json.load(f)

if report == "mt_bench":
metrics.log_metric(f"{report}_best_model", report_data["best_model"])
metrics.log_metric(f"{report}_best_score", report_data["best_score"])
else:
metrics.log_metric(
f"{report}_trained_model_score", report_data["trained_model_score"]
)
metrics.log_metric(
f"{report}_base_model_score", report_data["base_model_score"]
)
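
For reference, the new generate_metrics_report_op only reads the three JSON reports from the shared /output PVC and logs selected fields as KFP metrics. Below is a minimal local sketch of that logic, using a hypothetical StubMetrics stand-in for kfp.dsl.Metrics and made-up report files under /tmp/output; none of these names or values come from this change.

import json
from pathlib import Path

class StubMetrics:
    """Illustrative stand-in for kfp.dsl.Metrics: just prints each metric."""

    def log_metric(self, name, value):
        print(f"{name} = {value}")

# Made-up report files matching the shapes written by run_mt_bench_op
# and run_final_eval_op (scores are illustrative only).
out = Path("/tmp/output")
(out / "mt_bench_branch").mkdir(parents=True, exist_ok=True)
(out / "mmlu_branch").mkdir(parents=True, exist_ok=True)
(out / "mt_bench_data.json").write_text(
    json.dumps({"best_model": "candidate_model", "best_score": 7.4, "reports": []})
)
(out / "mt_bench_branch" / "mt_bench_branch_data.json").write_text(
    json.dumps({"trained_model_score": 7.1, "base_model_score": 6.8})
)
(out / "mmlu_branch" / "mmlu_branch_data.json").write_text(
    json.dumps({"trained_model_score": 0.68, "base_model_score": 0.61})
)

metrics = StubMetrics()
reports = {
    "mt_bench": out / "mt_bench_data.json",
    "mt_bench_branch": out / "mt_bench_branch" / "mt_bench_branch_data.json",
    "mmlu_branch": out / "mmlu_branch" / "mmlu_branch_data.json",
}
for report, file_name in reports.items():
    data = json.loads(Path(file_name).read_text(encoding="utf-8"))
    if report == "mt_bench":
        metrics.log_metric(f"{report}_best_model", data["best_model"])
        metrics.log_metric(f"{report}_best_score", data["best_score"])
    else:
        metrics.log_metric(f"{report}_trained_model_score", data["trained_model_score"])
        metrics.log_metric(f"{report}_base_model_score", data["base_model_score"])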
15 changes: 8 additions & 7 deletions eval/mt_bench.py
@@ -17,7 +17,6 @@ def run_mt_bench_op(
max_workers: str,
models_folder: str,
output_path: str = "/output/mt_bench_data.json",
best_score_file: Optional[str] = None,
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
@@ -188,15 +187,17 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
all_mt_bench_data.append(mt_bench_data)
scores[model_path] = overall_score

with open(output_path, "w", encoding="utf-8") as f:
json.dump(all_mt_bench_data, f, indent=4)

outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
if best_score_file:
with open(best_score_file, "w", encoding="utf-8") as f:
json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)
mt_bench_report = {
"best_model": best_model,
"best_score": best_score,
"reports": all_mt_bench_data,
}

with open(output_path, "w", encoding="utf-8") as f:
json.dump(mt_bench_report, f, indent=4)

# Rename the best model directory to "candidate_model" for the next step
# So we know which model to use for the final evaluation
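With this change, run_mt_bench_op writes one consolidated report to output_path (the optional best_score_file parameter is gone), so downstream steps read best_model and best_score from the same file as the per-model data. A short sketch of reading it, assuming the default /output/mt_bench_data.json location:

import json

with open("/output/mt_bench_data.json", "r", encoding="utf-8") as f:
    report = json.load(f)

print(report["best_model"])    # path of the best-scoring checkpoint
print(report["best_score"])    # its overall MT-Bench score
print(len(report["reports"]))  # one entry per evaluated model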
47 changes: 43 additions & 4 deletions pipeline.py
@@ -17,7 +17,7 @@
use_secret_as_volume,
)

from eval import run_final_eval_op, run_mt_bench_op
from eval import generate_metrics_report_op, run_final_eval_op, run_mt_bench_op
from sdg import (
git_clone_op,
sdg_op,
@@ -33,7 +33,9 @@
from utils import (
ilab_importer_op,
model_to_pvc_op,
pvc_to_mmlu_branch_op,
pvc_to_model_op,
pvc_to_mt_bench_branch_op,
pvc_to_mt_bench_op,
)
from utils.consts import RHELAI_IMAGE
@@ -435,9 +437,28 @@ def ilab_pipeline(
mount_path="/output",
)

output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
output_pvc_delete_task.after(
output_model_task, output_mt_bench_task, final_eval_task
output_mt_bench_branch_task = pvc_to_mt_bench_branch_op(
pvc_path="/output/mt_bench_branch/mt_bench_branch_data.json",
)
output_mt_bench_branch_task.after(final_eval_task)
output_mt_bench_branch_task.set_caching_options(False)

mount_pvc(
task=output_mt_bench_branch_task,
pvc_name=output_pvc_task.output,
mount_path="/output",
)

output_mmlu_branch_task = pvc_to_mmlu_branch_op(
pvc_path="/output/mmlu_branch/mmlu_branch_data.json",
)
output_mmlu_branch_task.after(final_eval_task)
output_mmlu_branch_task.set_caching_options(False)

mount_pvc(
task=output_mmlu_branch_task,
pvc_name=output_pvc_task.output,
mount_path="/output",
)

sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output)
@@ -446,6 +467,24 @@
model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output)
model_pvc_delete_task.after(final_eval_task)

generate_metrics_report_task = generate_metrics_report_op()
generate_metrics_report_task.after(final_eval_task)
generate_metrics_report_task.set_caching_options(False)
mount_pvc(
task=generate_metrics_report_task,
pvc_name=output_pvc_task.output,
mount_path="/output",
)

output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output)
output_pvc_delete_task.after(
output_model_task,
output_mt_bench_task,
output_mmlu_branch_task,
output_mt_bench_branch_task,
generate_metrics_report_task,
)

return


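To sanity-check the new wiring (the two pvc_to_*_branch output tasks plus the metrics-report task now gating deletion of the output PVC), the pipeline can be compiled with the stock KFP compiler. This is a sketch only: the output file name is arbitrary, and it assumes pipeline.py is importable from the repository root.

from kfp import compiler

from pipeline import ilab_pipeline

# The compiled IR should now contain the pvc_to_mmlu_branch, pvc_to_mt_bench_branch,
# and generate_metrics_report tasks ahead of the output PVC deletion.
compiler.Compiler().compile(ilab_pipeline, package_path="ilab_pipeline.yaml")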