From b188ce4c19f75b809f2f9411ad8a0f22fed4703c Mon Sep 17 00:00:00 2001
From: sallyom
Date: Thu, 17 Oct 2024 16:50:08 -0400
Subject: [PATCH 1/3] fix data-processing-op args in pipeline.py

Signed-off-by: sallyom
---
 pipeline.py               | 2 +-
 standalone/standalone.py  | 4 ++--
 standalone/standalone.tpl | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipeline.py b/pipeline.py
index 1c390621..c162b97d 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -442,7 +442,7 @@ def gen_standalone():
 
     # The list of executor names to extract details from to generate the standalone script
     executors = {
-        "exec-data-processing-op": 'data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg="{DATA_PVC_SDG_PATH}", model="{DATA_PVC_SDG_PATH}", processed_data="{PREPROCESSED_DATA_PATH}")',
+        "exec-data-processing-op": 'data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg="{DATA_PVC_SDG_PATH}", model="{DATA_PVC_MODEL_PATH}", processed_data="{PREPROCESSED_DATA_PATH}")',
         "exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, repo_branch="{exec_git_clone_op_repo_branch}", repo_pr={exec_git_clone_op_repo_pr}, taxonomy="{TAXONOMY_DATA_PATH}", sdg="{SDG_GENERATED_DATA_PATH}")',
         "exec-git-clone-op": {},
         "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="{REPO_GRANITE_7B_IMAGE}", model="{DATA_PVC_MODEL_PATH}")',
diff --git a/standalone/standalone.py b/standalone/standalone.py
index c132d694..919d991f 100755
--- a/standalone/standalone.py
+++ b/standalone/standalone.py
@@ -1166,7 +1166,7 @@ def data_processing(train_args: TrainingArgs) -> None:
 data_processing(train_args=training_args)
 """
     exec_data_processing_op_args = f"""
-data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg="{DATA_PVC_SDG_PATH}", model="{DATA_PVC_SDG_PATH}", processed_data="{PREPROCESSED_DATA_PATH}")
+data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg="{DATA_PVC_SDG_PATH}", model="{DATA_PVC_MODEL_PATH}", processed_data="{PREPROCESSED_DATA_PATH}")
 """
 
     init_containers = [
@@ -1383,7 +1383,7 @@ def data_processing(train_args: TrainingArgs) -> None:
 data_processing(train_args=training_args)
 """
     exec_data_processing_op_args = f"""
-data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg={DATA_PVC_SDG_PATH}, model={DATA_PVC_SDG_PATH}, processed_data={PREPROCESSED_DATA_PATH})
+data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg="{DATA_PVC_SDG_PATH}", model="{DATA_PVC_MODEL_PATH}", processed_data="{PREPROCESSED_DATA_PATH}")
 """
 
     data_container = kubernetes.client.V1Container(
diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl
index 5f94cf36..4948794e 100755
--- a/standalone/standalone.tpl
+++ b/standalone/standalone.tpl
@@ -1195,7 +1195,7 @@ def create_data_job(
 {{exec_data_processing_op_command}}
 """
     exec_data_processing_op_args = f"""
-data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg={DATA_PVC_SDG_PATH}, model={DATA_PVC_SDG_PATH}, processed_data={PREPROCESSED_DATA_PATH})
+data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg="{DATA_PVC_SDG_PATH}", model="{DATA_PVC_MODEL_PATH}", processed_data="{PREPROCESSED_DATA_PATH}")
 """
 
     data_container = kubernetes.client.V1Container(

From 56e42248cffdc2127ac1f0c3655eb0fbe81bcc2d Mon Sep 17 00:00:00 2001
From: sallyom
Date: Thu, 17 Oct 2024 17:05:49 -0400
Subject: [PATCH 2/3] update data_path

Signed-off-by: sallyom
---
 standalone/standalone.py  | 4 ++--
 standalone/standalone.tpl | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/standalone/standalone.py b/standalone/standalone.py
index 919d991f..bdcde821 100755
--- a/standalone/standalone.py
+++ b/standalone/standalone.py
@@ -149,7 +149,7 @@
                 --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) \
                 -m instructlab.training.main_ds \
                 --model_name_or_path="$PATH_TO_MODEL" \
-                --data_path=/data/processed_data/data.jsonl \
+                --data_path=/data/data/processed_data/data.jsonl \
                 --output_dir={path_to_model}/output/phase_{phase_num} \
                 --num_epochs={epoch_num} \
                 --effective_batch_size=3840 \
@@ -221,7 +221,7 @@
                 --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) \
                 -m instructlab.training.main_ds \
                 --model_name_or_path="$PATH_TO_MODEL" \
-                --data_path=/data/processed_data/data.jsonl \
+                --data_path=/data/data/processed_data/data.jsonl \
                 --output_dir="$tmp_model" \
                 --num_epochs={epoch_num} \
                 --effective_batch_size=3840 \
diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl
index 4948794e..a9a2b669 100755
--- a/standalone/standalone.tpl
+++ b/standalone/standalone.tpl
@@ -134,7 +134,7 @@ spec:
                 --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) \
                 -m instructlab.training.main_ds \
                 --model_name_or_path="$PATH_TO_MODEL" \
-                --data_path=/data/processed_data/data.jsonl \
+                --data_path=/data/data/processed_data/data.jsonl \
                 --output_dir={path_to_model}/output/phase_{phase_num} \
                 --num_epochs={epoch_num} \
                 --effective_batch_size=3840 \
@@ -206,7 +206,7 @@ spec:
                 --rdzv_endpoint $(MASTER_ADDR):$(MASTER_PORT) \
                 -m instructlab.training.main_ds \
                 --model_name_or_path="$PATH_TO_MODEL" \
-                --data_path=/data/processed_data/data.jsonl \
+                --data_path=/data/data/processed_data/data.jsonl \
                 --output_dir="$tmp_model" \
                 --num_epochs={epoch_num} \
                 --effective_batch_size=3840 \

From 08bcb4ca0b85b01d1f7cdfb8a75a1a66a8cb6a26 Mon Sep 17 00:00:00 2001
From: sallyom
Date: Thu, 17 Oct 2024 18:10:37 -0400
Subject: [PATCH 3/3] fix mt_bench candidate_model path prefix

Signed-off-by: sallyom
---
 pipeline.py               | 2 +-
 standalone/standalone.py  | 7 ++++---
 standalone/standalone.tpl | 5 +++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pipeline.py b/pipeline.py
index c162b97d..3cfc14a9 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -446,7 +446,7 @@ def gen_standalone():
         "exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, repo_branch="{exec_git_clone_op_repo_branch}", repo_pr={exec_git_clone_op_repo_pr}, taxonomy="{TAXONOMY_DATA_PATH}", sdg="{SDG_GENERATED_DATA_PATH}")',
         "exec-git-clone-op": {},
         "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="{REPO_GRANITE_7B_IMAGE}", model="{DATA_PVC_MODEL_PATH}")',
-        "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",mt_bench_output="{MT_BENCH_OUTPUT_PATH}", models_folder="{CANDIDATE_MODEL_PATH}", models_path_prefix="{CANDIDATE_MODEL_PATH}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})',
+        "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",mt_bench_output="{MT_BENCH_OUTPUT_PATH}",models_folder="{CANDIDATE_MODEL_PATH_PREFIX}",models_path_prefix="{CANDIDATE_MODEL_PATH_PREFIX}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})',
         "exec-run-final-eval-op": 'run_final_eval_op(mmlu_branch_output="{MMLU_BRANCH_SCORES_PATH}", mt_bench_branch_output="{MT_BENCH_OUTPUT_PATH}", candidate_model="{CANDIDATE_MODEL_PATH}", taxonomy="{TAXONOMY_PATH}", tasks="{DATA_PVC_SDG_PATH}", base_branch="", candidate_branch="", device=None, base_model_dir="{DATA_PVC_MODEL_PATH}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE}, model_dtype="{MODEL_DTYPE}", few_shots={FEW_SHOTS}, batch_size={BATCH_SIZE})',
     }
 
diff --git a/standalone/standalone.py b/standalone/standalone.py
index bdcde821..9ac0fc7a 100755
--- a/standalone/standalone.py
+++ b/standalone/standalone.py
@@ -75,9 +75,10 @@
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
 MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
-CANDIDATE_MODEL_PATH = path.join(
-    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
+CANDIDATE_MODEL_PATH_PREFIX = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format"
 )
+CANDIDATE_MODEL_PATH = path.join(CANDIDATE_MODEL_PATH_PREFIX, "candidate_model")
 SDG_GENERATED_DATA_PATH = path.join(DATA_PVC_MOUNT_PATH, "generated")
 TAXONOMY_DATA_PATH = path.join(DATA_PVC_MOUNT_PATH, "taxonomy")
 # MMLU_SCORES_PATH = "/output/mmlu-results.txt" - after training phase 1 is done MMLU is not performed anymore
@@ -1776,7 +1777,7 @@ def stop_vllm():
     return outputs(best_model=best_model, best_score=best_score)
 """
     exec_run_mt_bench_op_args = f"""
-run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",mt_bench_output="{MT_BENCH_OUTPUT_PATH}", models_folder="{CANDIDATE_MODEL_PATH}", models_path_prefix="{CANDIDATE_MODEL_PATH}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})
+run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",mt_bench_output="{MT_BENCH_OUTPUT_PATH}",models_folder="{CANDIDATE_MODEL_PATH_PREFIX}",models_path_prefix="{CANDIDATE_MODEL_PATH_PREFIX}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})
 """
     exec_run_final_eval_op_command = """
 from typing import *
diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl
index a9a2b669..092da696 100755
--- a/standalone/standalone.tpl
+++ b/standalone/standalone.tpl
@@ -75,9 +75,10 @@ MT_BENCH_OUTPUT_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-results.txt")
 MT_BENCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-best.txt")
 MT_BENCH_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mt-bench-branch-best.txt")
 MMLU_BRANCH_SCORES_PATH = path.join(DATA_PVC_MOUNT_PATH, "mmlu-branch-best.txt")
-CANDIDATE_MODEL_PATH = path.join(
-    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format/candidate_model"
+CANDIDATE_MODEL_PATH_PREFIX = path.join(
+    DATA_PVC_MOUNT_PATH, "model/output/phase_2/hf_format"
 )
+CANDIDATE_MODEL_PATH = path.join(CANDIDATE_MODEL_PATH_PREFIX, "candidate_model")
 SDG_GENERATED_DATA_PATH = path.join(DATA_PVC_MOUNT_PATH, "generated")
 TAXONOMY_DATA_PATH = path.join(DATA_PVC_MOUNT_PATH, "taxonomy")
 # MMLU_SCORES_PATH = "/output/mmlu-results.txt" - after training phase 1 is done MMLU is not performed anymore