bulk: mt-bench eval, final eval and trained model push to S3

leseb · leseb · commit bc44932a9a0e · 2024-10-14T23:03:12.000+02:00
- do not print final eval scores in logs
- use the correct model location for final push
- fix job/cr watch

Signed-off-by: Sébastien Han <seb@redhat.com>
diff --git a/pipeline.py b/pipeline.py
@@ -447,7 +447,7 @@ def gen_standalone():
         "exec-git-clone-op": {},
         "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
         "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/phase_2/hf_format", models_path_prefix="/data/model/output/phase_2/hf_format", max_workers="auto", merge_system_user_message=False)',
-        "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
+        "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
     }
 
     details = {}
diff --git a/standalone/standalone.py b/standalone/standalone.py
@@ -61,7 +61,9 @@
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
 MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
-CANDIDATE_MODEL_PATH = path.join(DATA_PVC_OUTPUT_PATH, "hf_format/candidate_model")
+CANDIDATE_MODEL_PATH = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
+)
 SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
 KFP_MODEL_SERVER_CM = """
 # TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -265,16 +267,18 @@
 fi
 
 if [ "$STRATEGY" == "upload" ]; then
-    export FINAL_DATA_TAR_FILE="final.$SDG_OBJECT_STORE_DATA_KEY"
+    export FINAL_DATA_TAR_FILE="$(date +"%Y-%m-%d_%H-%M-%S").$SDG_OBJECT_STORE_DATA_KEY"
     export FINAL_DATA_TAR_PATH="{data_pvc_mount_path}/$FINAL_DATA_TAR_FILE"
     echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
     echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
     echo "Archiving data before pushing to the object store"
+    # Use '--ignore-failed-read' to ignore missing files, needed when no MMLU tasks directories are found MMLU_branch is skipped
+    # So '{mmlu_branch_scores_path}' will not exist
     tar --create \
       --gzip \
       --verbose \
+      --ignore-failed-read \
       --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
-    # TODO: change model path for the final model!!!
 fi
 
 tmp=$(mktemp -d)
@@ -844,11 +848,8 @@ def run(
 
         # Final evaluation
         ctx.obj["eval_type"] = "final-eval"
-        scores = ctx.invoke(evaluation)
-        scores = json.loads(scores)
-        logger.info("Best model: %s", scores.get("best_model"))
-        ctx.obj["candidate_model"] = scores.get("best_model")
-        logger.info("instructLab Training Finished!")
+        ctx.invoke(evaluation)
+        logger.info("InstructLab Training Finished!")
 
         # Push the best model to S3
         ctx.invoke(upload_trained_model)
@@ -2120,7 +2121,7 @@ def find_node_dataset_directories(base_dir: str):
         json.dump(mt_bench_branch_data, f, indent=4)
 """
     exec_run_final_eval_op_args = """
-run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
+run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
 """
 
     if eval_type == "mt-bench":
@@ -2324,6 +2325,7 @@ def run_job(namespace: str, job: kubernetes.client.V1Job) -> str:
                 name=pods.items[0].metadata.name, namespace=namespace
             )
             w.stop()
+            break
         elif job_event.status.failed == 1:
             logger.error("Job failed. Pod logs:")
             pods = core_v1.list_namespaced_pod(
@@ -2739,23 +2741,29 @@ def train(
         namespace=namespace,
         plural="pytorchjobs",
     ):
-        job_event = event["object"]
+        pytorchjob_event = event["object"]
         if (
-            job_event["metadata"]["name"]
+            pytorchjob_event["metadata"]["name"]
             != pytorch_training_job_yaml["metadata"]["name"]
         ):
             continue
-        job_name = job_event["metadata"]["name"]
+        pytorchjob_name = pytorchjob_event["metadata"]["name"]
 
-        if "status" not in job_event or "conditions" not in job_event["status"]:
+        if (
+            "status" not in pytorchjob_event
+            or "conditions" not in pytorchjob_event["status"]
+        ):
             continue
         logger.info(
-            "Job: %s - %s",
-            job_name,
-            job_event["status"].get("conditions", "No conditions yet"),
+            "PytorchJob: %s - %s",
+            pytorchjob_name,
+            pytorchjob_event["status"].get("conditions", "No conditions yet"),
         )
 
-        for job_condition in job_event["status"]["conditions"]:
+        # Always start by the last condition so that if the job is completed, we can stop watching
+        # If we don't do this, we might get 'stuck' into the Running condition and never stop watching
+        for job_condition in reversed(pytorchjob_event["status"]["conditions"]):
+            print(job_condition)
             if job_condition["type"] == "Running":
                 # now watch for pod event
                 for event in w.stream(
@@ -2764,7 +2772,7 @@ def train(
                     label_selector=f"training.kubeflow.org/job-name=train-phase-{training_phase}",
                 ):
                     pod_event = event["object"]
-                    if pod_event.metadata.name.startswith(job_name):
+                    if pod_event.metadata.name.startswith(pytorchjob_name):
                         logger.info(
                             "Pod: %s - %s",
                             pod_event.metadata.name,
@@ -2786,15 +2794,25 @@ def train(
                         if pod_event.status.phase == "Failed":
                             log_pod_containers(pod_event, "init_containers", namespace)
                             log_pod_containers(pod_event, "containers", namespace)
-            if job_condition["type"] == "Succeeded":
+                            w.stop()
+                        if pod_event.status.phase == "Succeeded":
+                            continue
+            elif job_condition["type"] == "Succeeded":
                 logger.info(
-                    "Job '%s' completed successfully: %s",
-                    job_name,
+                    "PytorchJob '%s' completed successfully: %s",
+                    pytorchjob_name,
                     job_condition["reason"],
                 )
+                logger.info("Training phase %s completed.", training_phase)
                 w.stop()
+                # Break here to avoid going into other conditions, we are done
+                break
             elif job_condition["type"] == "Failed":
-                logger.error("Job' %s' failed: %s", job_name, job_condition["reason"])
+                logger.error(
+                    "PytorchJob' %s' failed: %s",
+                    pytorchjob_name,
+                    job_condition["reason"],
+                )
                 w.stop()
                 raise RuntimeError("Job failed.")
 
@@ -2816,7 +2834,9 @@ def evaluation(ctx: click.Context) -> str:
     eval_type = ctx.obj["eval_type"]
 
     if eval_type is None:
-        raise ValueError("Evaluation type must be provided with --eval-type=[mt-bench]")
+        raise ValueError(
+            "Evaluation type must be provided with --eval-type=[mt-bench|final-eval]"
+        )
 
     logger.info("Running %s evaluation.", eval_type)
 
@@ -2825,17 +2845,21 @@ def evaluation(ctx: click.Context) -> str:
         namespace=namespace, job_name=f"eval-{eval_type}", eval_type=eval_type
     )
     scores = run_job(namespace, job)
-    scores = scores.replace("'", '"')
 
-    try:
-        scores_data = json.loads(scores)
-        if isinstance(scores_data, dict):
-            scores = json.dumps(scores_data)
-        else:
-            raise ValueError("Unexpected format for scores data")
-    except json.JSONDecodeError as e:
-        logger.error("Failed to parse scores: %s", e)
-        raise
+    if eval_type == "mt-bench":
+        scores = scores.replace("'", '"')
+
+        try:
+            scores_data = json.loads(scores)
+            if isinstance(scores_data, dict):
+                scores = json.dumps(scores_data)
+            else:
+                raise ValueError("Unexpected format for scores data")
+        except json.JSONDecodeError as e:
+            logger.error("Failed to parse scores: %s", e)
+            raise
+
+        return scores
 
     logger.info("Evaluation scores: %s", scores)
 
diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl
@@ -61,7 +61,9 @@ MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
 MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
-CANDIDATE_MODEL_PATH = path.join(DATA_PVC_OUTPUT_PATH, "hf_format/candidate_model")
+CANDIDATE_MODEL_PATH = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
+)
 SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
 KFP_MODEL_SERVER_CM = """
 # TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -250,16 +252,18 @@ if [ "$STRATEGY" == "download" ]; then
 fi
 
 if [ "$STRATEGY" == "upload" ]; then
-    export FINAL_DATA_TAR_FILE="final.$SDG_OBJECT_STORE_DATA_KEY"
+    export FINAL_DATA_TAR_FILE="$(date +"%Y-%m-%d_%H-%M-%S").$SDG_OBJECT_STORE_DATA_KEY"
     export FINAL_DATA_TAR_PATH="{data_pvc_mount_path}/$FINAL_DATA_TAR_FILE"
     echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
     echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
     echo "Archiving data before pushing to the object store"
+    # Use '--ignore-failed-read' to ignore missing files, needed when no MMLU tasks directories are found MMLU_branch is skipped
+    # So '{mmlu_branch_scores_path}' will not exist
     tar --create \
       --gzip \
       --verbose \
+      --ignore-failed-read \
       --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
-    # TODO: change model path for the final model!!!
 fi
 
 tmp=$(mktemp -d)
@@ -829,11 +833,8 @@ def run(
 
         # Final evaluation
         ctx.obj["eval_type"] = "final-eval"
-        scores = ctx.invoke(evaluation)
-        scores = json.loads(scores)
-        logger.info("Best model: %s", scores.get("best_model"))
-        ctx.obj["candidate_model"] = scores.get("best_model")
-        logger.info("instructLab Training Finished!")
+        ctx.invoke(evaluation)
+        logger.info("InstructLab Training Finished!")
 
         # Push the best model to S3
         ctx.invoke(upload_trained_model)
@@ -1481,6 +1482,7 @@ def run_job(namespace: str, job: kubernetes.client.V1Job) -> str:
                 name=pods.items[0].metadata.name, namespace=namespace
             )
             w.stop()
+            break
         elif job_event.status.failed == 1:
             logger.error("Job failed. Pod logs:")
             pods = core_v1.list_namespaced_pod(
@@ -1896,23 +1898,29 @@ def train(
         namespace=namespace,
         plural="pytorchjobs",
     ):
-        job_event = event["object"]
+        pytorchjob_event = event["object"]
         if (
-            job_event["metadata"]["name"]
+            pytorchjob_event["metadata"]["name"]
             != pytorch_training_job_yaml["metadata"]["name"]
         ):
             continue
-        job_name = job_event["metadata"]["name"]
+        pytorchjob_name = pytorchjob_event["metadata"]["name"]
 
-        if "status" not in job_event or "conditions" not in job_event["status"]:
+        if (
+            "status" not in pytorchjob_event
+            or "conditions" not in pytorchjob_event["status"]
+        ):
             continue
         logger.info(
-            "Job: %s - %s",
-            job_name,
-            job_event["status"].get("conditions", "No conditions yet"),
+            "PytorchJob: %s - %s",
+            pytorchjob_name,
+            pytorchjob_event["status"].get("conditions", "No conditions yet"),
         )
 
-        for job_condition in job_event["status"]["conditions"]:
+        # Always start by the last condition so that if the job is completed, we can stop watching
+        # If we don't do this, we might get 'stuck' into the Running condition and never stop watching
+        for job_condition in reversed(pytorchjob_event["status"]["conditions"]):
+            print(job_condition)
             if job_condition["type"] == "Running":
                 # now watch for pod event
                 for event in w.stream(
@@ -1921,7 +1929,7 @@ def train(
                     label_selector=f"training.kubeflow.org/job-name=train-phase-{training_phase}",
                 ):
                     pod_event = event["object"]
-                    if pod_event.metadata.name.startswith(job_name):
+                    if pod_event.metadata.name.startswith(pytorchjob_name):
                         logger.info(
                             "Pod: %s - %s",
                             pod_event.metadata.name,
@@ -1943,15 +1951,25 @@ def train(
                         if pod_event.status.phase == "Failed":
                             log_pod_containers(pod_event, "init_containers", namespace)
                             log_pod_containers(pod_event, "containers", namespace)
-            if job_condition["type"] == "Succeeded":
+                            w.stop()
+                        if pod_event.status.phase == "Succeeded":
+                            continue
+            elif job_condition["type"] == "Succeeded":
                 logger.info(
-                    "Job '%s' completed successfully: %s",
-                    job_name,
+                    "PytorchJob '%s' completed successfully: %s",
+                    pytorchjob_name,
                     job_condition["reason"],
                 )
+                logger.info("Training phase %s completed.", training_phase)
                 w.stop()
+                # Break here to avoid going into other conditions, we are done
+                break
             elif job_condition["type"] == "Failed":
-                logger.error("Job' %s' failed: %s", job_name, job_condition["reason"])
+                logger.error(
+                    "PytorchJob' %s' failed: %s",
+                    pytorchjob_name,
+                    job_condition["reason"],
+                )
                 w.stop()
                 raise RuntimeError("Job failed.")
 
@@ -1973,7 +1991,9 @@ def evaluation(ctx: click.Context) -> str:
     eval_type = ctx.obj["eval_type"]
 
     if eval_type is None:
-        raise ValueError("Evaluation type must be provided with --eval-type=[mt-bench]")
+        raise ValueError(
+            "Evaluation type must be provided with --eval-type=[mt-bench|final-eval]"
+        )
 
     logger.info("Running %s evaluation.", eval_type)
 
@@ -1982,17 +2002,20 @@ def evaluation(ctx: click.Context) -> str:
         namespace=namespace, job_name=f"eval-{eval_type}", eval_type=eval_type
     )
     scores = run_job(namespace, job)
-    scores = scores.replace("'", '"')
 
-    try:
-        scores_data = json.loads(scores)
-        if isinstance(scores_data, dict):
-            scores = json.dumps(scores_data)
-        else:
-            raise ValueError("Unexpected format for scores data")
-    except json.JSONDecodeError as e:
-        logger.error("Failed to parse scores: %s", e)
-        raise
+    if eval_type == "mt-bench":
+        scores = scores.replace("'", '"')
+        try:
+            scores_data = json.loads(scores)
+            if isinstance(scores_data, dict):
+                scores = json.dumps(scores_data)
+            else:
+                raise ValueError("Unexpected format for scores data")
+        except json.JSONDecodeError as e:
+            logger.error("Failed to parse scores: %s", e)
+            raise
+
+        return scores
 
     logger.info("Evaluation scores: %s", scores)
 

Original file line number	Diff line number	Diff line change
`@@ -447,7 +447,7 @@ def gen_standalone():`
`447`	`447`	`"exec-git-clone-op": {},`
`448`	`448`	`"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',`
`449`	`449`	`"exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/phase_2/hf_format", models_path_prefix="/data/model/output/phase_2/hf_format", max_workers="auto", merge_system_user_message=False)',`
`450`		`- "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",`
	`450`	`+ "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",`
`451`	`451`	`}`
`452`	`452`
`453`	`453`	`details = {}`