Skip to content

Commit 8f23bbf

Browse files
committed
fix: add final details to push model to S3
Signed-off-by: Sébastien Han <[email protected]>
1 parent 8210917 commit 8f23bbf

File tree

3 files changed

+23
-15
lines changed

3 files changed

+23
-15
lines changed

pipeline.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1104,7 +1104,7 @@ deploymentSpec:
11041104
\ if gpu_available\n else \"No GPU available\"\n )\n \
11051105
\ gpu_count = torch.cuda.device_count() if gpu_available else 0\n\n \
11061106
\ print(f\"GPU Available: {gpu_available}, Using: {gpu_name}\")\n\n #\
1107-
\ MMLU_BRANCH\n\n # This is very specific to `ilab generate`, necessary\
1107+
\ MMLU_BRANCH\n\n # This is very specific to 'ilab generate', necessary\
11081108
\ because the data generation and\n # model evaluation are taking place\
11091109
\ in separate environments.\n def update_test_lines_in_files(base_dir):\n\
11101110
\ import os\n import re\n\n # Define the regex to match\
@@ -1129,7 +1129,7 @@ deploymentSpec:
11291129
\ = []\n regex = re.compile(pattern)\n\n for root, dirs, files\
11301130
\ in os.walk(base_dir):\n for directory in dirs:\n \
11311131
\ if regex.search(directory):\n matching_dirs.append(os.path.join(root,\
1132-
\ directory))\n\n # From `ilab sdg` the knowledge_*_task.yaml files\
1132+
\ directory))\n\n # From 'ilab sdg' the knowledge_*_task.yaml files\
11331133
\ have a line that references where the SDG took place.\n # This\
11341134
\ needs to be updated to run elsewhere.\n # The line is:\n \
11351135
\ # test: /path/to/where/sdg/occured/node_datasets_*\n # TODO:\

standalone/standalone.py

+13-8
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@
6060
MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
6161
MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
6262
MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
63+
MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
64+
CANDIDATE_MODEL_PATH = path.join(
65+
DATA_PVC_MOUNT_PATH, "model/output/hf_format/candidate_model"
66+
)
6367
SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
6468
KFP_MODEL_SERVER_CM = """
6569
# TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -268,7 +272,10 @@
268272
echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
269273
echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
270274
echo "Archiving data before pushing to the object store"
271-
tar --create --gzip --verbose --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {data_pvc_mount_path}/model
275+
tar --create \
276+
--gzip \
277+
--verbose \
278+
--file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
272279
# TODO: change model path for the final model!!!
273280
fi
274281
@@ -843,11 +850,7 @@ def run(
843850
scores = json.loads(scores)
844851
logger.info("Best model: %s", scores.get("best_model"))
845852
ctx.obj["candidate_model"] = scores.get("best_model")
846-
847-
# Push the best model to S3
848-
# TODO
849853
logger.info("instructLab Training Finished!")
850-
return 0
851854

852855
# Push the best model to S3
853856
ctx.invoke(upload_trained_model)
@@ -1271,6 +1274,8 @@ def data_processing(train_args: TrainingArgs) -> None:
12711274
mt_bench_output_path=MT_BENCH_OUTPUT_PATH,
12721275
mt_bench_scores_path=MT_BENCH_SCORES_PATH,
12731276
mt_bench_branch_scores_path=MT_BENCH_BRANCH_SCORES_PATH,
1277+
mmlu_branch_scores_path=MMLU_BRANCH_SCORES_PATH,
1278+
candidate_model=CANDIDATE_MODEL_PATH,
12741279
)
12751280
],
12761281
volume_mounts=get_vol_mount(),
@@ -1843,7 +1848,7 @@ def branch_eval_summary_to_json(
18431848
18441849
# MMLU_BRANCH
18451850
1846-
# This is very specific to `ilab generate`, necessary because the data generation and
1851+
# This is very specific to 'ilab generate', necessary because the data generation and
18471852
# model evaluation are taking place in separate environments.
18481853
def update_test_lines_in_files(base_dir):
18491854
import os
@@ -1889,7 +1894,7 @@ def find_node_dataset_directories(base_dir: str):
18891894
if regex.search(directory):
18901895
matching_dirs.append(os.path.join(root, directory))
18911896
1892-
# From `ilab sdg` the knowledge_*_task.yaml files have a line that references where the SDG took place.
1897+
# From 'ilab sdg' the knowledge_*_task.yaml files have a line that references where the SDG took place.
18931898
# This needs to be updated to run elsewhere.
18941899
# The line is:
18951900
# test: /path/to/where/sdg/occurred/node_datasets_*
@@ -2117,7 +2122,7 @@ def find_node_dataset_directories(base_dir: str):
21172122
json.dump(mt_bench_branch_data, f, indent=4)
21182123
"""
21192124
exec_run_final_eval_op_args = """
2120-
run_final_eval_op(candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/model/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
2125+
run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generate', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
21212126
"""
21222127

21232128
if eval_type == "mt-bench":

standalone/standalone.tpl

+8-5
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ PYTORCH_NNODES = 2
6060
MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
6161
MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
6262
MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
63+
MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
64+
CANDIDATE_MODEL_PATH = path.join(DATA_PVC_OUTPUT_PATH, "hf_format/candidate_model")
6365
SDG_OBJECT_STORE_SECRET_NAME = "sdg-object-store-credentials"
6466
KFP_MODEL_SERVER_CM = """
6567
# TODO: remove the following line and replace it with the actual ConfigMap/Secret
@@ -253,7 +255,10 @@ if [ "$STRATEGY" == "upload" ]; then
253255
echo "Final data tarball path: $FINAL_DATA_TAR_PATH"
254256
echo "Final data tarball file: $FINAL_DATA_TAR_FILE"
255257
echo "Archiving data before pushing to the object store"
256-
tar --create --gzip --verbose --file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {data_pvc_mount_path}/model
258+
tar --create \
259+
--gzip \
260+
--verbose \
261+
--file "$FINAL_DATA_TAR_PATH" {mt_bench_output_path} {mt_bench_scores_path} {mt_bench_branch_scores_path} {mmlu_branch_scores_path} {candidate_model_path}
257262
# TODO: change model path for the final model!!!
258263
fi
259264
@@ -828,11 +833,7 @@ def run(
828833
scores = json.loads(scores)
829834
logger.info("Best model: %s", scores.get("best_model"))
830835
ctx.obj["candidate_model"] = scores.get("best_model")
831-
832-
# Push the best model to S3
833-
# TODO
834836
logger.info("instructLab Training Finished!")
835-
return 0
836837

837838
# Push the best model to S3
838839
ctx.invoke(upload_trained_model)
@@ -1083,6 +1084,8 @@ def create_data_job(
10831084
mt_bench_output_path=MT_BENCH_OUTPUT_PATH,
10841085
mt_bench_scores_path=MT_BENCH_SCORES_PATH,
10851086
mt_bench_branch_scores_path=MT_BENCH_BRANCH_SCORES_PATH,
1087+
mmlu_branch_scores_path=MMLU_BRANCH_SCORES_PATH,
1088+
candidate_model=CANDIDATE_MODEL_PATH,
10861089
)
10871090
],
10881091
volume_mounts=get_vol_mount(),

0 commit comments

Comments
 (0)