Skip to content

Commit 3e5d1fc

Browse files
committed
bulk commit
Sorry, I went really far with this one but I can confirm that: * sdg-data-fetch is working * data processing works * training phase 1 is stuck when launched, needs investigation Also: * remove backtick from the code since it breaks the shell that runs the python executor * only use a single PVC for everything: sdg data, model, trained model * --force-pull: to force pulling from the object store again if the data are already present Signed-off-by: Sébastien Han <[email protected]>
1 parent ca03343 commit 3e5d1fc

File tree

10 files changed

+640
-531
lines changed

10 files changed

+640
-531
lines changed

eval/final/components.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ def find_node_dataset_directories(base_directory: str):
221221

222222
######################################################################
223223
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
224-
# and when that happens, much of this logic can be imported from the `evaluate` definition:
224+
# and when that happens, much of this logic can be imported from the 'evaluate' definition:
225225
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
226226
#
227227
# With instructlab, model_name is synonymous with model_path
@@ -244,8 +244,8 @@ def find_node_dataset_directories(base_directory: str):
244244
),
245245
]
246246

247-
# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
248-
# with `auto`, number of gpus allocated for serving is calculated based on environment
247+
# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
248+
# with 'auto', number of gpus allocated for serving is calculated based on environment
249249
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
250250
if max_workers == "auto":
251251
try:

eval/mt_bench/components.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ def run_mt_bench_op(
1212
models_path_prefix: str,
1313
mt_bench_output: Output[Artifact],
1414
merge_system_user_message: bool,
15-
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
16-
# with `auto`, number of gpus allocated for serving is calculated based on environment
15+
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
16+
# with 'auto', number of gpus allocated for serving is calculated based on environment
1717
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
1818
max_workers: str,
1919
models_list: List[str] = None,
@@ -53,8 +53,8 @@ def run_mt_bench_op(
5353
scores = {}
5454
all_mt_bench_data = []
5555

56-
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
57-
# with `auto`, number of gpus allocated for serving is calculated based on environment
56+
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
57+
# with 'auto', number of gpus allocated for serving is calculated based on environment
5858
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
5959
if max_workers == "auto":
6060
try:

pipeline.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def pipeline(
348348
final_eval_task.set_accelerator_type("nvidia.com/gpu")
349349
final_eval_task.set_accelerator_limit(1)
350350

351-
# Technically `output_model_task` and `output_data_task` can happen before evaluation,
351+
# Technically 'output_model_task' and 'output_data_task' can happen before evaluation,
352352
# however the PVC can only be mounted once, so, setting these to _after_ so the eval proceeds.
353353
output_model_task = pvc_to_artifact_op(
354354
pvc_path="/output/data",
@@ -417,7 +417,7 @@ def gen_standalone():
417417
This function should be used when Kubeflow Pipelines are not available. It will generate a
418418
script that replicates the pipeline's functionality.
419419
420-
Example usage: ``` $ python pipeline.py gen-standalone ```
420+
Example usage: ''' $ python pipeline.py gen-standalone '''
421421
"""
422422
from os import path
423423

@@ -442,11 +442,11 @@ def gen_standalone():
442442

443443
# The list of executor names to extract details from to generate the standalone script
444444
executors = {
445-
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/input_data/generated", model="/input_model", processed_data="/input_data/processed_data")',
446-
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/input_data/taxonomy", sdg="/input_data/generated")',
445+
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/data/data", model="/data/model", processed_data="/data/processed_data")',
446+
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/data/taxonomy", sdg="/data/generated")',
447447
"exec-git-clone-op": {},
448-
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/input_model")',
449-
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/output/mt-bench-results.txt", models_list="/output/model/model/hf_format", models_path_prefix="/output/model/hf_format", max_workers="auto", merge_system_user_message=False)',
448+
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
449+
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_list="/data/model/model/hf_format", models_path_prefix="/data/model/hf_format", max_workers="auto", merge_system_user_message=False)',
450450
}
451451

452452
details = {}
@@ -621,9 +621,18 @@ def change_dsl_function_to_normal_function(rendered_code: list):
621621
"import kfp": "",
622622
"from kfp import dsl": "",
623623
"from kfp.dsl import *": "",
624-
".path": "", # super hacky, but works for now, the idea is that "taxonomy.path" is a string so we just remove the ".path" part
625624
}
626625

626+
import re
627+
628+
# Regular expression to match ".path" but not "os.path"
629+
path_pattern = re.compile(r"(?<!os)\.path")
630+
631+
def remove_path_not_os_path(line):
632+
return path_pattern.sub("", line)
633+
634+
rendered_code = [remove_path_not_os_path(line) for line in rendered_code]
635+
627636
for old, new in replacements.items():
628637
rendered_code = [line.replace(old, new) for line in rendered_code]
629638
return rendered_code[-1].strip()

pipeline.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ deploymentSpec:
589589
\ )\n\n def data_processing(train_args: TrainingArgs) -> None:\n \
590590
\ # early validation logic here\n if train_args.max_batch_len\
591591
\ < train_args.max_seq_len:\n raise ValueError(\n \
592-
\ f\"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=}\
592+
\ f\"the 'max_batch_len' cannot be less than 'max_seq_len': {train_args.max_batch_len=}\
593593
\ < {train_args.max_seq_len=}\"\n )\n\n # process\
594594
\ the training data\n if not os.path.exists(train_args.data_output_dir):\n\
595595
\ os.makedirs(train_args.data_output_dir, exist_ok=True)\n \
@@ -1107,7 +1107,7 @@ deploymentSpec:
11071107
main\"\n\n ######################################################################\n\
11081108
\ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\
11091109
\ external judge model\n # and when that happens, much of this logic\
1110-
\ can be imported from the `evaluate` definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
1110+
\ can be imported from the 'evaluate' definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
11111111
\ #\n # With instructlab, model_name is synonymous with model_path\n\
11121112
\ mt_bench_evaluators = [\n MTBenchBranchEvaluator(\n \
11131113
\ model_name=candidate_model,\n judge_model_name=judge_model_name,\n\
@@ -1118,7 +1118,7 @@ deploymentSpec:
11181118
\ branch=base_branch,\n output_dir=output_dir,\n \
11191119
\ merge_system_user_message=merge_system_user_message,\n \
11201120
\ ),\n ]\n\n # ilab/evaluate uses a magic word for its mt_bench\
1121-
\ evaluator - `auto`\n # with `auto`, number of gpus allocated for serving\
1121+
\ evaluator - 'auto'\n # with 'auto', number of gpus allocated for serving\
11221122
\ is calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
11231123
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
11241124
\ = len(os.sched_getaffinity(0)) // 2\n except AttributeError:\n\
@@ -1197,7 +1197,7 @@ deploymentSpec:
11971197
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
11981198
\ *\n\ndef run_mt_bench_op(\n models_path_prefix: str,\n mt_bench_output:\
11991199
\ Output[Artifact],\n merge_system_user_message: bool,\n # generate_answers,judgment\
1200-
\ uses a magic word for its mt_bench evaluator - `auto`\n # with `auto`,\
1200+
\ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\
12011201
\ number of gpus allocated for serving is calculated based on environment\n\
12021202
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
12031203
\ max_workers: str,\n models_list: List[str] = None,\n models_folder:\
@@ -1215,7 +1215,7 @@ deploymentSpec:
12151215
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
12161216
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
12171217
)\n\n scores = {}\n all_mt_bench_data = []\n\n # generate_answers,judgment\
1218-
\ uses a magic word for its mt_bench evaluator - `auto`\n # with `auto`,\
1218+
\ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\
12191219
\ number of gpus allocated for serving is calculated based on environment\n\
12201220
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
12211221
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
@@ -1286,7 +1286,7 @@ deploymentSpec:
12861286
\ > 0) else \"empty\"\n\n print(\"Generating syntetic dataset for:\"\
12871287
)\n print()\n print(read_taxonomy(taxonomy.path, taxonomy_base))\n\
12881288
\n # generate_data has a magic word for its taxonomy_base argument -\
1289-
\ `empty`\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
1289+
\ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
12901290
\ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\
12911291
\ output_dir=sdg.path,\n taxonomy=taxonomy.path,\n \
12921292
\ taxonomy_base=taxonomy_base,\n model_name=model,\n chunk_word_count=1000,\n\

sdg/components.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def sdg_op(
5252
print()
5353
print(read_taxonomy(taxonomy.path, taxonomy_base))
5454

55-
# generate_data has a magic word for its taxonomy_base argument - `empty`
55+
# generate_data has a magic word for its taxonomy_base argument - 'empty'
5656
# it allows generating from the whole repo, see:
5757
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
5858
generate_data(

standalone/README.md

+4
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ The script requires information regarding the location and method for accessing
9898
* `--eval-serving-model-name`: The name of the model to use for evaluation. **Required**
9999
* `--eval-serving-model-api-key`: The API key for the model to evaluate. `EVAL_SERVING_MODEL_API_KEY`
100100
environment variable can be used as well. **Required**
101+
* `--force-pull`: Force pull the data (sdg data and model) from the object store even if it already
102+
exists in the PVC. **Optional** - Default: false.
103+
* `--training-1-epoch-num`: The number of epochs to train the model for phase 1. **Optional** - Default: 7.
104+
* `--training-2-epoch-num`: The number of epochs to train the model for phase 2. **Optional** - Default: 10.
101105

102106

103107
## Example End-To-End Workflow

0 commit comments

Comments
 (0)