
Commit bc44932

bulk: mt-bench eval, final eval and trained model push to S3

- do not print final eval scores in logs
- use the correct model location for final push
- fix job/cr watch

Signed-off-by: Sébastien Han <[email protected]>

1 parent: cec15e1

File tree: 3 files changed (+113, -66 lines)

- pipeline.py
- standalone/standalone.py
- standalone/standalone.tpl

pipeline.py (+1, -1)

@@ -447,7 +447,7 @@ def gen_standalone():
         "exec-git-clone-op": {},
         "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
         "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/phase_2/hf_format", models_path_prefix="/data/model/output/phase_2/hf_format", max_workers="auto", merge_system_user_message=False)',
-        "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
+        "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
     }
 
     details = {}

standalone/standalone.py (+57, -33)

@@ -61,7 +61,9 @@
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
 MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
-CANDIDATE_MODEL_PATH = path.join(DATA_PVC_OUTPUT_PATH, "hf_format/candidate_model")
+CANDIDATE_MODEL_PATH = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
+)
 SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
 KFP_MODEL_SERVER_CM = """
 # TODO: remove the following line and replace it with the actual ConfigMap/Secret
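
The candidate model constant now resolves to the phase-2 training output on the data PVC rather than the old DATA_PVC_OUTPUT_PATH location, matching the path the final-eval and upload steps use. A minimal sketch of the resulting value, assuming DATA_PVC_MOUNT_PATH is "/data" (an assumption inferred from the literal /data/... paths in the pipeline arguments, not shown in this hunk):

from os import path

# Assumed value; the hunk above does not show this constant's definition.
DATA_PVC_MOUNT_PATH = "/data"

CANDIDATE_MODEL_PATH = path.join(
    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
)
print(CANDIDATE_MODEL_PATH)  # /data/model/output/phase_2/hf_format/candidate_model
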
@@ -265,16 +267,18 @@
 fi
 
 if [ "$STRATEGY" == "upload" ]; then
-    export FINAL_DATA_TAR_FILE="final.$SDG_OBJECT_STORE_DATA_KEY"
+    export FINAL_DATA_TAR_FILE="$(date +"%Y-%m-%d_%H-%M-%S").$SDG_OBJECT_STORE_DATA_KEY"
     export FINAL_DATA_TAR_PATH="{data_pvc_mount_path}/$FINAL_DATA_TAR_FILE"
     echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
     echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
     echo "Archiving data before pushing to the object store"
+    # Use '--ignore-failed-read' to ignore missing files, needed when no MMLU tasks directories are found MMLU_branch is skipped
+    # So '{mmlu_branch_scores_path}' will not exist
     tar --create \
         --gzip \
         --verbose \
+        --ignore-failed-read \
         --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
-    # TODO: change model path for the final model!!!
 fi
 
 tmp=$(mktemp -d)
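
The upload strategy now names the tarball with a timestamp instead of a fixed "final." prefix, so a new run no longer overwrites the previous archive in the object store. A minimal Python sketch of the same naming scheme (the key "data.tar.gz" is only an illustrative placeholder for $SDG_OBJECT_STORE_DATA_KEY):

from datetime import datetime

def final_data_tar_file(sdg_object_store_data_key: str) -> str:
    # Mirrors the shell expression $(date +"%Y-%m-%d_%H-%M-%S").$SDG_OBJECT_STORE_DATA_KEY
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    return f"{stamp}.{sdg_object_store_data_key}"

print(final_data_tar_file("data.tar.gz"))  # e.g. 2024-11-28_09-41-07.data.tar.gz
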
@@ -844,11 +848,8 @@ def run(
 
     # Final evaluation
     ctx.obj["eval_type"] = "final-eval"
-    scores = ctx.invoke(evaluation)
-    scores = json.loads(scores)
-    logger.info("Best model: %s", scores.get("best_model"))
-    ctx.obj["candidate_model"] = scores.get("best_model")
-    logger.info("instructLab Training Finished!")
+    ctx.invoke(evaluation)
+    logger.info("InstructLab Training Finished!")
 
     # Push the best model to S3
     ctx.invoke(upload_trained_model)
@@ -2120,7 +2121,7 @@ def find_node_dataset_directories(base_dir: str):
         json.dump(mt_bench_branch_data, f, indent=4)
     """
     exec_run_final_eval_op_args = """
-run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
+run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
     """
 
     if eval_type == "mt-bench":
@@ -2324,6 +2325,7 @@ def run_job(namespace: str, job: kubernetes.client.V1Job) -> str:
                 name=pods.items[0].metadata.name, namespace=namespace
             )
             w.stop()
+            break
         elif job_event.status.failed == 1:
             logger.error("Job failed. Pod logs:")
             pods = core_v1.list_namespaced_pod(
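
The added break is what actually ends the loop: w.stop() only asks the watch to finish, so without it the for loop can keep yielding events that were already buffered. A minimal sketch of the pattern with the kubernetes Python client, simplified relative to run_job above (wait_for_job and its success/failure checks are illustrative, not the repository's code):

from kubernetes import client, config, watch

def wait_for_job(namespace: str, job_name: str) -> None:
    config.load_kube_config()
    batch_v1 = client.BatchV1Api()
    w = watch.Watch()
    for event in w.stream(batch_v1.list_namespaced_job, namespace=namespace):
        job = event["object"]
        if job.metadata.name != job_name:
            continue
        if job.status.succeeded == 1:
            w.stop()
            break  # leave the loop immediately instead of draining buffered events
        if job.status.failed == 1:
            w.stop()
            raise RuntimeError(f"Job {job_name} failed")
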
@@ -2739,23 +2741,29 @@ def train(
         namespace=namespace,
         plural="pytorchjobs",
     ):
-        job_event = event["object"]
+        pytorchjob_event = event["object"]
         if (
-            job_event["metadata"]["name"]
+            pytorchjob_event["metadata"]["name"]
             != pytorch_training_job_yaml["metadata"]["name"]
         ):
             continue
-        job_name = job_event["metadata"]["name"]
+        pytorchjob_name = pytorchjob_event["metadata"]["name"]
 
-        if "status" not in job_event or "conditions" not in job_event["status"]:
+        if (
+            "status" not in pytorchjob_event
+            or "conditions" not in pytorchjob_event["status"]
+        ):
             continue
         logger.info(
-            "Job: %s - %s",
-            job_name,
-            job_event["status"].get("conditions", "No conditions yet"),
+            "PytorchJob: %s - %s",
+            pytorchjob_name,
+            pytorchjob_event["status"].get("conditions", "No conditions yet"),
         )
 
-        for job_condition in job_event["status"]["conditions"]:
+        # Always start by the last condition so that if the job is completed, we can stop watching
+        # If we don't do this, we might get 'stuck' into the Running condition and never stop watching
+        for job_condition in reversed(pytorchjob_event["status"]["conditions"]):
+            print(job_condition)
             if job_condition["type"] == "Running":
                 # now watch for pod event
                 for event in w.stream(
@@ -2764,7 +2772,7 @@ def train(
                     label_selector=f"training.kubeflow.org/job-name=train-phase-{training_phase}",
                 ):
                     pod_event = event["object"]
-                    if pod_event.metadata.name.startswith(job_name):
+                    if pod_event.metadata.name.startswith(pytorchjob_name):
                         logger.info(
                             "Pod: %s - %s",
                             pod_event.metadata.name,
@@ -2786,15 +2794,25 @@ def train(
                         if pod_event.status.phase == "Failed":
                             log_pod_containers(pod_event, "init_containers", namespace)
                             log_pod_containers(pod_event, "containers", namespace)
-            if job_condition["type"] == "Succeeded":
+                            w.stop()
+                        if pod_event.status.phase == "Succeeded":
+                            continue
+            elif job_condition["type"] == "Succeeded":
                 logger.info(
-                    "Job '%s' completed successfully: %s",
-                    job_name,
+                    "PytorchJob '%s' completed successfully: %s",
+                    pytorchjob_name,
                     job_condition["reason"],
                 )
+                logger.info("Training phase %s completed.", training_phase)
                 w.stop()
+                # Break here to avoid going into other conditions, we are done
+                break
             elif job_condition["type"] == "Failed":
-                logger.error("Job' %s' failed: %s", job_name, job_condition["reason"])
+                logger.error(
+                    "PytorchJob' %s' failed: %s",
+                    pytorchjob_name,
+                    job_condition["reason"],
+                )
                 w.stop()
                 raise RuntimeError("Job failed.")
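
Walking the conditions in reverse is what lets the watch terminate: PyTorchJob conditions are appended in chronological order, so the newest entry is checked first and a terminal Succeeded or Failed condition is handled before the loop re-enters the Running branch. A minimal sketch of that idea with a hypothetical helper, not the repository's code:

from typing import Optional

def latest_terminal_condition(conditions: list) -> Optional[dict]:
    # Conditions are appended chronologically, so scan from newest to oldest
    # and return the first terminal one, if any.
    for condition in reversed(conditions):
        if condition["type"] in ("Succeeded", "Failed"):
            return condition
    return None

conditions = [
    {"type": "Created", "status": "True"},
    {"type": "Running", "status": "True"},
    {"type": "Succeeded", "status": "True", "reason": "JobSucceeded"},
]
print(latest_terminal_condition(conditions))  # the Succeeded condition
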

@@ -2816,7 +2834,9 @@ def evaluation(ctx: click.Context) -> str:
     eval_type = ctx.obj["eval_type"]
 
     if eval_type is None:
-        raise ValueError("Evaluation type must be provided with --eval-type=[mt-bench]")
+        raise ValueError(
+            "Evaluation type must be provided with --eval-type=[mt-bench|final-eval]"
+        )
 
     logger.info("Running %s evaluation.", eval_type)

@@ -2825,17 +2845,21 @@ def evaluation(ctx: click.Context) -> str:
         namespace=namespace, job_name=f"eval-{eval_type}", eval_type=eval_type
     )
     scores = run_job(namespace, job)
-    scores = scores.replace("'", '"')
 
-    try:
-        scores_data = json.loads(scores)
-        if isinstance(scores_data, dict):
-            scores = json.dumps(scores_data)
-        else:
-            raise ValueError("Unexpected format for scores data")
-    except json.JSONDecodeError as e:
-        logger.error("Failed to parse scores: %s", e)
-        raise
+    if eval_type == "mt-bench":
+        scores = scores.replace("'", '"')
+
+        try:
+            scores_data = json.loads(scores)
+            if isinstance(scores_data, dict):
+                scores = json.dumps(scores_data)
+            else:
+                raise ValueError("Unexpected format for scores data")
+        except json.JSONDecodeError as e:
+            logger.error("Failed to parse scores: %s", e)
+            raise
+
+        return scores
 
     logger.info("Evaluation scores: %s", scores)
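
With this change only the mt-bench job's output is treated as parseable scores: the eval pod prints a Python-style dict that is coerced into valid JSON and returned, while final-eval output is only logged. A minimal sketch of the mt-bench parsing step, assuming a payload shaped like the one the old code read best_model from (the overall_score key is illustrative):

import json

def parse_mt_bench_scores(raw: str) -> str:
    # The eval pod prints a Python-style dict (single quotes), so swap quotes
    # before json.loads() and reject anything that is not a dict.
    raw = raw.replace("'", '"')
    data = json.loads(raw)
    if not isinstance(data, dict):
        raise ValueError("Unexpected format for scores data")
    return json.dumps(data)

raw = "{'best_model': '/data/model/output/phase_2/hf_format/candidate_model', 'overall_score': 7.1}"
print(parse_mt_bench_scores(raw))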

standalone/standalone.tpl (+55, -32)

@@ -61,7 +61,9 @@ MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
 MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
-CANDIDATE_MODEL_PATH = path.join(DATA_PVC_OUTPUT_PATH, "hf_format/candidate_model")
+CANDIDATE_MODEL_PATH = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
+)
 SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
 KFP_MODEL_SERVER_CM = """
 # TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -250,16 +252,18 @@ if [ "$STRATEGY" == "download" ]; then
 fi
 
 if [ "$STRATEGY" == "upload" ]; then
-    export FINAL_DATA_TAR_FILE="final.$SDG_OBJECT_STORE_DATA_KEY"
+    export FINAL_DATA_TAR_FILE="$(date +"%Y-%m-%d_%H-%M-%S").$SDG_OBJECT_STORE_DATA_KEY"
     export FINAL_DATA_TAR_PATH="{data_pvc_mount_path}/$FINAL_DATA_TAR_FILE"
     echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
     echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
     echo "Archiving data before pushing to the object store"
+    # Use '--ignore-failed-read' to ignore missing files, needed when no MMLU tasks directories are found MMLU_branch is skipped
+    # So '{mmlu_branch_scores_path}' will not exist
     tar --create \
         --gzip \
         --verbose \
+        --ignore-failed-read \
         --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
-    # TODO: change model path for the final model!!!
 fi
 
 tmp=$(mktemp -d)
@@ -829,11 +833,8 @@ def run(
 
     # Final evaluation
    ctx.obj["eval_type"] = "final-eval"
-    scores = ctx.invoke(evaluation)
-    scores = json.loads(scores)
-    logger.info("Best model: %s", scores.get("best_model"))
-    ctx.obj["candidate_model"] = scores.get("best_model")
-    logger.info("instructLab Training Finished!")
+    ctx.invoke(evaluation)
+    logger.info("InstructLab Training Finished!")
 
     # Push the best model to S3
     ctx.invoke(upload_trained_model)
@@ -1481,6 +1482,7 @@ def run_job(namespace: str, job: kubernetes.client.V1Job) -> str:
                 name=pods.items[0].metadata.name, namespace=namespace
             )
             w.stop()
+            break
         elif job_event.status.failed == 1:
             logger.error("Job failed. Pod logs:")
             pods = core_v1.list_namespaced_pod(
@@ -1896,23 +1898,29 @@ def train(
         namespace=namespace,
         plural="pytorchjobs",
     ):
-        job_event = event["object"]
+        pytorchjob_event = event["object"]
         if (
-            job_event["metadata"]["name"]
+            pytorchjob_event["metadata"]["name"]
             != pytorch_training_job_yaml["metadata"]["name"]
         ):
             continue
-        job_name = job_event["metadata"]["name"]
+        pytorchjob_name = pytorchjob_event["metadata"]["name"]
 
-        if "status" not in job_event or "conditions" not in job_event["status"]:
+        if (
+            "status" not in pytorchjob_event
+            or "conditions" not in pytorchjob_event["status"]
+        ):
             continue
         logger.info(
-            "Job: %s - %s",
-            job_name,
-            job_event["status"].get("conditions", "No conditions yet"),
+            "PytorchJob: %s - %s",
+            pytorchjob_name,
+            pytorchjob_event["status"].get("conditions", "No conditions yet"),
        )
 
-        for job_condition in job_event["status"]["conditions"]:
+        # Always start by the last condition so that if the job is completed, we can stop watching
+        # If we don't do this, we might get 'stuck' into the Running condition and never stop watching
+        for job_condition in reversed(pytorchjob_event["status"]["conditions"]):
+            print(job_condition)
             if job_condition["type"] == "Running":
                 # now watch for pod event
                 for event in w.stream(
@@ -1921,7 +1929,7 @@ def train(
                     label_selector=f"training.kubeflow.org/job-name=train-phase-{training_phase}",
                 ):
                     pod_event = event["object"]
-                    if pod_event.metadata.name.startswith(job_name):
+                    if pod_event.metadata.name.startswith(pytorchjob_name):
                         logger.info(
                             "Pod: %s - %s",
                             pod_event.metadata.name,
@@ -1943,15 +1951,25 @@ def train(
                         if pod_event.status.phase == "Failed":
                             log_pod_containers(pod_event, "init_containers", namespace)
                             log_pod_containers(pod_event, "containers", namespace)
-            if job_condition["type"] == "Succeeded":
+                            w.stop()
+                        if pod_event.status.phase == "Succeeded":
+                            continue
+            elif job_condition["type"] == "Succeeded":
                 logger.info(
-                    "Job '%s' completed successfully: %s",
-                    job_name,
+                    "PytorchJob '%s' completed successfully: %s",
+                    pytorchjob_name,
                     job_condition["reason"],
                 )
+                logger.info("Training phase %s completed.", training_phase)
                 w.stop()
+                # Break here to avoid going into other conditions, we are done
+                break
             elif job_condition["type"] == "Failed":
-                logger.error("Job' %s' failed: %s", job_name, job_condition["reason"])
+                logger.error(
+                    "PytorchJob' %s' failed: %s",
+                    pytorchjob_name,
+                    job_condition["reason"],
+                )
                 w.stop()
                 raise RuntimeError("Job failed.")

@@ -1973,7 +1991,9 @@ def evaluation(ctx: click.Context) -> str:
     eval_type = ctx.obj["eval_type"]
 
     if eval_type is None:
-        raise ValueError("Evaluation type must be provided with --eval-type=[mt-bench]")
+        raise ValueError(
+            "Evaluation type must be provided with --eval-type=[mt-bench|final-eval]"
+        )
 
     logger.info("Running %s evaluation.", eval_type)

@@ -1982,17 +2002,20 @@ def evaluation(ctx: click.Context) -> str:
         namespace=namespace, job_name=f"eval-{eval_type}", eval_type=eval_type
     )
     scores = run_job(namespace, job)
-    scores = scores.replace("'", '"')
 
-    try:
-        scores_data = json.loads(scores)
-        if isinstance(scores_data, dict):
-            scores = json.dumps(scores_data)
-        else:
-            raise ValueError("Unexpected format for scores data")
-    except json.JSONDecodeError as e:
-        logger.error("Failed to parse scores: %s", e)
-        raise
+    if eval_type == "mt-bench":
+        scores = scores.replace("'", '"')
+        try:
+            scores_data = json.loads(scores)
+            if isinstance(scores_data, dict):
+                scores = json.dumps(scores_data)
+            else:
+                raise ValueError("Unexpected format for scores data")
+        except json.JSONDecodeError as e:
+            logger.error("Failed to parse scores: %s", e)
+            raise
+
+        return scores
 
     logger.info("Evaluation scores: %s", scores)
