fix: add final details to push model to S3

leseb · leseb · commit 8f23bbfac818 · 2024-10-14T11:25:02.000+02:00
Signed-off-by: Sébastien Han <seb@redhat.com>
diff --git a/pipeline.yaml b/pipeline.yaml
@@ -1104,7 +1104,7 @@ deploymentSpec:
           \        if gpu_available\n        else \"No GPU available\"\n    )\n  \
           \  gpu_count = torch.cuda.device_count() if gpu_available else 0\n\n   \
           \ print(f\"GPU Available: {gpu_available}, Using: {gpu_name}\")\n\n    #\
-          \ MMLU_BRANCH\n\n    # This is very specific to `ilab generate`, necessary\
+          \ MMLU_BRANCH\n\n    # This is very specific to 'ilab generate', necessary\
           \ because the data generation and\n    # model evaluation are taking place\
           \ in separate environments.\n    def update_test_lines_in_files(base_dir):\n\
           \        import os\n        import re\n\n        # Define the regex to match\
@@ -1129,7 +1129,7 @@ deploymentSpec:
           \ = []\n        regex = re.compile(pattern)\n\n        for root, dirs, files\
           \ in os.walk(base_dir):\n            for directory in dirs:\n          \
           \      if regex.search(directory):\n                    matching_dirs.append(os.path.join(root,\
-          \ directory))\n\n        # From `ilab sdg` the knowledge_*_task.yaml files\
+          \ directory))\n\n        # From 'ilab sdg' the knowledge_*_task.yaml files\
           \ have a line that references where the SDG took place.\n        # This\
           \ needs to be updated to run elsewhere.\n        # The line is:\n      \
           \  #    test: /path/to/where/sdg/occured/node_datasets_*\n        # TODO:\
diff --git a/standalone/standalone.py b/standalone/standalone.py
@@ -60,6 +60,10 @@
 MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
+MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
+CANDIDATE_MODEL_PATH = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/hf_format/candidate_model"
+)
 SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
 KFP_MODEL_SERVER_CM = """
 # TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -268,7 +272,10 @@
     echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
     echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
     echo "Archiving data before pushing to the object store"
-    tar --create --gzip --verbose --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {data_pvc_mount_path}/model
+    tar --create \
+      --gzip \
+      --verbose \
+      --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
     # TODO: change model path for the final model!!!
 fi
 
@@ -843,11 +850,7 @@ def run(
         scores = json.loads(scores)
         logger.info("Best model: %s", scores.get("best_model"))
         ctx.obj["candidate_model"] = scores.get("best_model")
-
-        # Push the best model to S3
-        # TODO
         logger.info("instructLab Training Finished!")
-        return 0
 
         # Push the best model to S3
         ctx.invoke(upload_trained_model)
@@ -1271,6 +1274,8 @@ def data_processing(train_args: TrainingArgs) -> None:
                 mt_bench_output_path=MT_BENCH_OUTPUT_PATH,
                 mt_bench_scores_path=MT_BENCH_SCORES_PATH,
                 mt_bench_branch_scores_path=MT_BENCH_BRANCH_SCORES_PATH,
+                mmlu_branch_scores_path=MMLU_BRANCH_SCORES_PATH,
+                candidate_model_path=CANDIDATE_MODEL_PATH,
             )
         ],
         volume_mounts=get_vol_mount(),
@@ -1843,7 +1848,7 @@ def branch_eval_summary_to_json(
 
     # MMLU_BRANCH
 
-    # This is very specific to `ilab generate`, necessary because the data generation and
+    # This is very specific to 'ilab generate', necessary because the data generation and
     # model evaluation are taking place in separate environments.
     def update_test_lines_in_files(base_dir):
         import os
@@ -1889,7 +1894,7 @@ def find_node_dataset_directories(base_dir: str):
                 if regex.search(directory):
                     matching_dirs.append(os.path.join(root, directory))
 
-        # From `ilab sdg` the knowledge_*_task.yaml files have a line that references where the SDG took place.
+        # From 'ilab sdg' the knowledge_*_task.yaml files have a line that references where the SDG took place.
         # This needs to be updated to run elsewhere.
         # The line is:
         #    test: /path/to/where/sdg/occured/node_datasets_*
@@ -2117,7 +2122,7 @@ def find_node_dataset_directories(base_dir: str):
         json.dump(mt_bench_branch_data, f, indent=4)
 """
     exec_run_final_eval_op_args = """
-run_final_eval_op(candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/model/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
+run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generate', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
 """
 
     if eval_type == "mt-bench":
diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl
@@ -60,6 +60,8 @@ PYTORCH_NNODES = 2
 MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
+MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
+CANDIDATE_MODEL_PATH = path.join(DATA_PVC_OUTPUT_PATH, "hf_format/candidate_model")
 SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
 KFP_MODEL_SERVER_CM = """
 # TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -253,7 +255,10 @@ if [ "$STRATEGY" == "upload" ]; then
     echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
     echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
     echo "Archiving data before pushing to the object store"
-    tar --create --gzip --verbose --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {data_pvc_mount_path}/model
+    tar --create \
+      --gzip \
+      --verbose \
+      --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
     # TODO: change model path for the final model!!!
 fi
 
@@ -828,11 +833,7 @@ def run(
         scores = json.loads(scores)
         logger.info("Best model: %s", scores.get("best_model"))
         ctx.obj["candidate_model"] = scores.get("best_model")
-
-        # Push the best model to S3
-        # TODO
         logger.info("instructLab Training Finished!")
-        return 0
 
         # Push the best model to S3
         ctx.invoke(upload_trained_model)
@@ -1083,6 +1084,8 @@ def create_data_job(
                 mt_bench_output_path=MT_BENCH_OUTPUT_PATH,
                 mt_bench_scores_path=MT_BENCH_SCORES_PATH,
                 mt_bench_branch_scores_path=MT_BENCH_BRANCH_SCORES_PATH,
+                mmlu_branch_scores_path=MMLU_BRANCH_SCORES_PATH,
+                candidate_model_path=CANDIDATE_MODEL_PATH,
             )
         ],
         volume_mounts=get_vol_mount(),