wip:final eval
Signed-off-by: Michael Clifford <[email protected]>
MichaelClifford committed Oct 11, 2024
1 parent 5b9f7f2 commit 891f726
Showing 5 changed files with 388 additions and 143 deletions.
17 changes: 14 additions & 3 deletions eval/final/components.py
@@ -32,14 +32,19 @@ def run_final_eval_op(
):
import json
import os

import torch
from instructlab.eval.mmlu import MMLU_TASKS, MMLUBranchEvaluator
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(model_path: str, gpu_count: int, retries: int = 60, delay: int = 5):
print("Starting Final Eval...")

def launch_vllm(
model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
import subprocess
import sys
import time
@@ -134,7 +139,7 @@ def branch_eval_summary_to_json(
no_changes: list[tuple[str, float]],
new=None,
) -> str:
"""Generates a JSON object from the _branch benchmark evaluations"""
# Generates a JSON object from the _branch benchmark evaluations

import json

@@ -183,7 +188,7 @@ def branch_eval_summary_to_json(
return json.dumps(summary, indent=4)

######################################################################

print("Checking GPUs...")
gpu_available = torch.cuda.is_available()
gpu_name = (
torch.cuda.get_device_name(torch.cuda.current_device())
@@ -213,6 +218,8 @@ def find_node_dataset_directories(base_directory: str):

return matching_dirs

print("Starting MMLU_Branch...")

mmlu_tasks = ["mmlu_pr"]

node_dataset_dirs = find_node_dataset_directories(tasks.path)
@@ -242,10 +249,12 @@ def find_node_dataset_directories(base_directory: str):
individual_scores_list = []
for i, evaluator in enumerate(mmlu_branch_evaluators):
m_path = m_paths[i]
print("Launching Vllm...")
launch_vllm(m_path, gpu_count)
overall_score, individual_scores = evaluator.run(VLLM_SERVER)
overall_scores.append(overall_score)
individual_scores_list.append(individual_scores)
print("Stopping Vllm")
stop_vllm()

# TODO: update instructlab/instructlab model/evaluate.py
@@ -291,6 +300,8 @@ def find_node_dataset_directories(base_directory: str):

# MT_BENCH_BRANCH

print("Strating MT_BENCH_BRANCH ...")

judge_api_key = os.getenv("JUDGE_API_KEY", "")
judge_model_name = os.getenv("JUDGE_NAME")
judge_endpoint = os.getenv("JUDGE_ENDPOINT")
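The launch_vllm change above widens the server readiness window from roughly 5 minutes (60 retries × 5 s) to roughly 20 minutes (120 retries × 10 s). A minimal sketch of the launch-and-poll pattern being tuned, assuming the /v1/models readiness probe visible in pipeline.yaml below (the helper name and messages here are illustrative, not the exact component code):

import subprocess
import sys
import time

import requests

VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm_sketch(model_path: str, gpu_count: int, retries: int = 120, delay: int = 10):
    # Start the OpenAI-compatible vLLM server; request tensor parallelism only when GPUs are present.
    command = [sys.executable, "-m", "vllm.entrypoints.openai.api_server", "--model", model_path]
    if gpu_count > 0:
        command += ["--tensor-parallel-size", str(gpu_count)]
    subprocess.Popen(args=command)

    # Poll the models endpoint until the server responds or the retry budget runs out.
    for attempt in range(retries):
        try:
            if requests.get(f"{VLLM_SERVER}/models").status_code == 200:
                print(f"vLLM responded after {attempt + 1} attempt(s)")
                return
        except requests.ConnectionError:
            pass
        time.sleep(delay)
    raise RuntimeError(f"vLLM did not start within {retries * delay} seconds")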
5 changes: 3 additions & 2 deletions pipeline.py
Expand Up @@ -24,7 +24,7 @@
DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git"
KFP_MODEL_SERVER_CM = "sdg/kfp-model-server.yaml"
BASE_MODE = "ibm-granite/granite-7b-base"
BASE_MODEL_DIR = "/model/model" # <- "model ID for vLLM chat/completions - corresponds to path within pvc"
BASE_MODEL_DIR = "/data/model/" # <- "model ID for vLLM chat/completions - corresponds to path within pvc"
MMLU_TASKS_LIST = "mmlu_anatomy,mmlu_astronomy"
MODEL_DTYPE = "bfloat16"
FEW_SHOTS = 5
@@ -447,7 +447,7 @@ def gen_standalone():
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/hf_format", models_path_prefix="/data/model/output/hf_format", max_workers="auto", merge_system_user_message=False)',
"exec-run-final-eval-op": "run_final_eval_op(candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/model/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
"exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generate', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
}

details = {}
@@ -620,6 +620,7 @@ def change_dsl_function_to_normal_function(rendered_code: list):
"dsl.Output[dsl.Dataset]": "str",
"dsl.Output[dsl.Model]": "str",
"Output[Artifact]": "str",
"Input[Dataset]": "str",
"import kfp": "",
"from kfp import dsl": "",
"from kfp.dsl import *": "",
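The new "Input[Dataset]": "str" entry extends the text-replacement table that change_dsl_function_to_normal_function uses when gen_standalone rewrites KFP components into plain functions. A rough sketch of how such a table is applied, using only the mappings visible above (the helper name is illustrative, not the exact pipeline.py code):

REPLACEMENTS = {
    "dsl.Output[dsl.Dataset]": "str",
    "dsl.Output[dsl.Model]": "str",
    "Output[Artifact]": "str",
    "Input[Dataset]": "str",
    "import kfp": "",
    "from kfp import dsl": "",
    "from kfp.dsl import *": "",
}

def strip_kfp_types(rendered_code: list) -> list:
    # Apply every textual replacement to every rendered line of the component source.
    stripped = []
    for line in rendered_code:
        for old, new in REPLACEMENTS.items():
            line = line.replace(old, new)
        stripped.append(line)
    return stripped

With the added entry, a rendered parameter annotated as Input[Dataset] (such as the tasks argument) would be rewritten to a plain str in the standalone script, consistent with the string path now passed in exec-run-final-eval-op.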
130 changes: 66 additions & 64 deletions pipeline.yaml
@@ -1017,21 +1017,22 @@ deploymentSpec:
\ str,\n candidate_branch: str,\n max_workers: str,\n device: str,\n\
\ model_dtype: str,\n few_shots: int,\n batch_size: int,\n merge_system_user_message:\
\ bool,\n candidate_model: str = None,\n):\n import json\n import\
\ os\n import torch\n from instructlab.eval.mmlu import MMLU_TASKS,\
\ os\n\n import torch\n from instructlab.eval.mmlu import MMLU_TASKS,\
\ MMLUBranchEvaluator\n from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n\
\ from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\
\ sort_score\n\n VLLM_SERVER = \"http://localhost:8000/v1\"\n\n def\
\ launch_vllm(model_path: str, gpu_count: int, retries: int = 60, delay:\
\ int = 5):\n import subprocess\n import sys\n import\
\ time\n\n import requests\n\n if gpu_count > 0:\n \
\ sort_score\n\n VLLM_SERVER = \"http://localhost:8000/v1\"\n\n print(\"\
Starting Final Eval...\")\n\n def launch_vllm(\n model_path: str,\
\ gpu_count: int, retries: int = 120, delay: int = 10\n ):\n import\
\ subprocess\n import sys\n import time\n\n import\
\ requests\n\n if gpu_count > 0:\n command = [\n \
\ sys.executable,\n \"-m\",\n \"\
vllm.entrypoints.openai.api_server\",\n \"--model\",\n \
\ model_path,\n \"--tensor-parallel-size\",\n\
\ str(gpu_count),\n ]\n else:\n \
\ command = [\n sys.executable,\n \"-m\"\
,\n \"vllm.entrypoints.openai.api_server\",\n \
\ \"--model\",\n model_path,\n \"--tensor-parallel-size\"\
,\n str(gpu_count),\n ]\n else:\n \
\ command = [\n sys.executable,\n \"\
-m\",\n \"vllm.entrypoints.openai.api_server\",\n \
\ \"--model\",\n model_path,\n ]\n\n \
\ subprocess.Popen(args=command)\n\n print(f\"Waiting for vLLM\
\ \"--model\",\n model_path,\n ]\n\n \
\ subprocess.Popen(args=command)\n\n print(f\"Waiting for vLLM\
\ server to start at {VLLM_SERVER}...\")\n\n for attempt in range(retries):\n\
\ try:\n response = requests.get(f\"{VLLM_SERVER}/models\"\
)\n if response.status_code == 200:\n \
@@ -1073,10 +1074,10 @@
\ repository\n def branch_eval_summary_to_json(\n improvements:\
\ list[tuple[str, float, float, float]],\n regressions: list[tuple[str,\
\ float, float, float]],\n no_changes: list[tuple[str, float]],\n\
\ new=None,\n ) -> str:\n \"\"\"Generates a JSON object\
\ from the _branch benchmark evaluations\"\"\"\n\n import json\n\n\
\ summary = {\"improvements\": [], \"regressions\": [], \"no_changes\"\
: [], \"new\": []}\n\n if len(improvements) > 0:\n improvements.sort(key=sort_score,\
\ new=None,\n ) -> str:\n # Generates a JSON object from\
\ the _branch benchmark evaluations\n\n import json\n\n summary\
\ = {\"improvements\": [], \"regressions\": [], \"no_changes\": [], \"new\"\
: []}\n\n if len(improvements) > 0:\n improvements.sort(key=sort_score,\
\ reverse=True)\n for improvement in improvements:\n \
\ task, delta, base_score, new_score = improvement\n \
\ summary[\"improvements\"].append(\n {\n \
@@ -1098,23 +1099,23 @@
\ summary[\"new\"].append(\n {\"qna\"\
: qna, \"average_score\": round(avg_score, 2)}\n )\n\n \
\ return json.dumps(summary, indent=4)\n\n ######################################################################\n\
\n gpu_available = torch.cuda.is_available()\n gpu_name = (\n \
\ torch.cuda.get_device_name(torch.cuda.current_device())\n if\
\ gpu_available\n else \"No GPU available\"\n )\n gpu_count\
\ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\
\ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MMLU_BRANCH\n\
\n # find_node_dataset_directories to find sdg output node_datasets_*\n\
\ def find_node_dataset_directories(base_directory: str):\n import\
\ os\n import re\n\n # This is specific to ilab/eval output\n\
\ pattern = r\"node_datasets_\"\n matching_dirs = []\n \
\ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\
\ for directory in dirs:\n if regex.search(directory):\n\
\ matching_dirs.append(os.path.join(root, directory))\n\
\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \
\ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n # This\
\ assumes generated filesystem from ilab sdg, which\n # generates a node_datasets_\
\ directory for MMLU custom tasks data\n if node_dataset_dirs:\n \
\ tasks_dir = node_dataset_dirs[0]\n\n mmlu_branch_evaluators\
\ print(\"Checking GPUs...\")\n gpu_available = torch.cuda.is_available()\n\
\ gpu_name = (\n torch.cuda.get_device_name(torch.cuda.current_device())\n\
\ if gpu_available\n else \"No GPU available\"\n )\n \
\ gpu_count = torch.cuda.device_count() if gpu_available else 0\n\n \
\ print(f\"GPU Available: {gpu_available}, Using: {gpu_name}\")\n\n #\
\ MMLU_BRANCH\n\n # find_node_dataset_directories to find sdg output\
\ node_datasets_*\n def find_node_dataset_directories(base_directory:\
\ str):\n import os\n import re\n\n # This is specific\
\ to ilab/eval output\n pattern = r\"node_datasets_\"\n matching_dirs\
\ = []\n regex = re.compile(pattern)\n\n for root, dirs, files\
\ in os.walk(base_directory):\n for directory in dirs:\n \
\ if regex.search(directory):\n matching_dirs.append(os.path.join(root,\
\ directory))\n\n return matching_dirs\n\n print(\"Starting MMLU_Branch...\"\
)\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n node_dataset_dirs = find_node_dataset_directories(tasks.path)\n\
\ # This assumes generated filesystem from ilab sdg, which\n # generates\
\ a node_datasets_ directory for MMLU custom tasks data\n if node_dataset_dirs:\n\
\ tasks_dir = node_dataset_dirs[0]\n\n mmlu_branch_evaluators\
\ = [\n MMLUBranchEvaluator(\n model_path=candidate_model,\n\
\ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\
\ few_shots=few_shots,\n batch_size=batch_size,\n\
@@ -1124,37 +1125,38 @@
\ ),\n ]\n m_paths = [candidate_model, base_model_dir]\n\
\ overall_scores = []\n individual_scores_list = []\n \
\ for i, evaluator in enumerate(mmlu_branch_evaluators):\n \
\ m_path = m_paths[i]\n launch_vllm(m_path, gpu_count)\n \
\ overall_score, individual_scores = evaluator.run(VLLM_SERVER)\n\
\ overall_scores.append(overall_score)\n individual_scores_list.append(individual_scores)\n\
\ stop_vllm()\n\n # TODO: update instructlab/instructlab\
\ model/evaluate.py\n # so this logic can be imported outside of\
\ the CLI\n overall_score = overall_scores[0]\n base_overall_score\
\ = overall_scores[1]\n individual_scores = individual_scores_list[0]\n\
\ base_individual_scores = individual_scores_list[1]\n\n improvements,\
\ regressions, no_changes = [], [], []\n for task, score in individual_scores.items():\n\
\ base_score = base_individual_scores[task]\n s =\
\ score[\"score\"]\n b_s = base_score[\"score\"]\n \
\ d = round(s - b_s, 2)\n if s > b_s:\n improvements.append((task,\
\ d, b_s, s))\n elif b_s > s:\n regressions.append((task,\
\ d, b_s, s))\n else:\n no_changes.append((task,\
\ s))\n\n summary = branch_eval_summary_to_json(\n improvements,\n\
\ regressions,\n no_changes,\n )\n\n \
\ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\
\ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\
: candidate_model,\n \"model_score\": round(overall_score, 2),\n\
\ \"base_model\": base_model_dir,\n \"base_model_score\"\
: round(base_overall_score, 2),\n \"summary\": summary,\n \
\ }\n\n with open(mmlu_branch_output.path, \"w\") as f:\n \
\ json.dump(mmlu_branch_data, f, indent=4)\n else:\n print(\"\
No MMLU tasks directories found, skipping MMLU_branch evaluation.\")\n\n\
\ # MT_BENCH_BRANCH\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\
, \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\
\ = os.getenv(\"JUDGE_ENDPOINT\")\n\n output_dir = \"/tmp/eval_output\"\
\n\n # TODO: candidate_branch must be in same repo, not a fork, or, can\
\ compare main branch against candidate, base models\n base_branch =\
\ base_branch or \"main\"\n candidate_branch = candidate_branch or \"\
main\"\n\n ######################################################################\n\
\ m_path = m_paths[i]\n print(\"Launching Vllm...\")\n \
\ launch_vllm(m_path, gpu_count)\n overall_score, individual_scores\
\ = evaluator.run(VLLM_SERVER)\n overall_scores.append(overall_score)\n\
\ individual_scores_list.append(individual_scores)\n \
\ print(\"Stopping Vllm\")\n stop_vllm()\n\n # TODO:\
\ update instructlab/instructlab model/evaluate.py\n # so this logic\
\ can be imported outside of the CLI\n overall_score = overall_scores[0]\n\
\ base_overall_score = overall_scores[1]\n individual_scores\
\ = individual_scores_list[0]\n base_individual_scores = individual_scores_list[1]\n\
\n improvements, regressions, no_changes = [], [], []\n for\
\ task, score in individual_scores.items():\n base_score = base_individual_scores[task]\n\
\ s = score[\"score\"]\n b_s = base_score[\"score\"\
]\n d = round(s - b_s, 2)\n if s > b_s:\n \
\ improvements.append((task, d, b_s, s))\n elif b_s >\
\ s:\n regressions.append((task, d, b_s, s))\n \
\ else:\n no_changes.append((task, s))\n\n summary\
\ = branch_eval_summary_to_json(\n improvements,\n \
\ regressions,\n no_changes,\n )\n\n mmlu_branch_data\
\ = {\n \"report_title\": \"KNOWLEDGE EVALUATION REPORT\",\n\
\ \"max_score\": \"1.0\",\n \"model\": candidate_model,\n\
\ \"model_score\": round(overall_score, 2),\n \"base_model\"\
: base_model_dir,\n \"base_model_score\": round(base_overall_score,\
\ 2),\n \"summary\": summary,\n }\n\n with open(mmlu_branch_output.path,\
\ \"w\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \
\ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Strating MT_BENCH_BRANCH\
\ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \
\ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\
JUDGE_ENDPOINT\")\n\n output_dir = \"/tmp/eval_output\"\n\n # TODO:\
\ candidate_branch must be in same repo, not a fork, or, can compare main\
\ branch against candidate, base models\n base_branch = base_branch or\
\ \"main\"\n candidate_branch = candidate_branch or \"main\"\n\n ######################################################################\n\
\ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\
\ external judge model\n # and when that happens, much of this logic\
\ can be imported from the 'evaluate' definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
@@ -1814,7 +1816,7 @@ root:
componentInputParameter: repo_branch
base_model_dir:
runtimeValue:
constant: /model/model
constant: /data/model/
batch_size:
componentInputParameter: batch_size
candidate_branch:
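For readability, the per-task comparison embedded (escaped) in the YAML above reduces to this shape: each task's candidate score is compared against the base model's score, bucketed as an improvement, regression, or no change, and serialized to JSON. A simplified sketch with illustrative field names, not the literal component source:

import json

def summarize_branch_scores(individual_scores: dict, base_individual_scores: dict) -> str:
    # Compare the candidate model's per-task scores against the base model's and
    # bucket each task as an improvement, regression, or no change.
    improvements, regressions, no_changes = [], [], []
    for task, score in individual_scores.items():
        s = score["score"]
        b_s = base_individual_scores[task]["score"]
        d = round(s - b_s, 2)
        if s > b_s:
            improvements.append((task, d, b_s, s))
        elif b_s > s:
            regressions.append((task, d, b_s, s))
        else:
            no_changes.append((task, s))

    # Serialize the buckets, listing the largest deltas first in each category.
    summary = {
        "improvements": [
            {"task": t, "delta": d, "base_score": b, "new_score": n}
            for t, d, b, n in sorted(improvements, key=lambda x: x[1], reverse=True)
        ],
        "regressions": [
            {"task": t, "delta": d, "base_score": b, "new_score": n}
            for t, d, b, n in sorted(regressions, key=lambda x: x[1])
        ],
        "no_changes": [{"task": t, "score": s} for t, s in no_changes],
    }
    return json.dumps(summary, indent=4)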