Skip to content

Commit 3e5d1fc

Browse files
committed
bulk commit
Sorry, I went really far with this one but I can confirm that: * sdg-data-fetch is working * data processing works * training phase 1 is stuck when launched, needs investigation Also: * remove backtick from the code since it breaks the shell that runs the python executor * only use a single PVC for everything: sdg data, model, trained model * --force-pull: to force pulling from the object store again if the data are already present Signed-off-by: Sébastien Han <[email protected]>
1 parent ca03343 commit 3e5d1fc

File tree

10 files changed

+640
-531
lines changed

10 files changed

+640
-531
lines changed

eval/final/components.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ def find_node_dataset_directories(base_directory: str):
221221

222222
######################################################################
223223
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
224-
# and when that happens, much of this logic can be imported from the `evaluate` definition:
224+
# and when that happens, much of this logic can be imported from the 'evaluate' definition:
225225
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
226226
#
227227
# With instructlab, model_name is synonymous with model_path
@@ -244,8 +244,8 @@ def find_node_dataset_directories(base_directory: str):
244244
),
245245
]
246246

247-
# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
248-
# with `auto`, number of gpus allocated for serving is calculated based on environment
247+
# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
248+
# with 'auto', number of gpus allocated for serving is calculated based on environment
249249
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
250250
if max_workers == "auto":
251251
try:

eval/mt_bench/components.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ def run_mt_bench_op(
1212
models_path_prefix: str,
1313
mt_bench_output: Output[Artifact],
1414
merge_system_user_message: bool,
15-
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
16-
# with `auto`, number of gpus allocated for serving is calculated based on environment
15+
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
16+
# with 'auto', number of gpus allocated for serving is calculated based on environment
1717
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
1818
max_workers: str,
1919
models_list: List[str] = None,
@@ -53,8 +53,8 @@ def run_mt_bench_op(
5353
scores = {}
5454
all_mt_bench_data = []
5555

56-
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
57-
# with `auto`, number of gpus allocated for serving is calculated based on environment
56+
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
57+
# with 'auto', number of gpus allocated for serving is calculated based on environment
5858
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
5959
if max_workers == "auto":
6060
try:

pipeline.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def pipeline(
348348
final_eval_task.set_accelerator_type("nvidia.com/gpu")
349349
final_eval_task.set_accelerator_limit(1)
350350

351-
# Technically `output_model_task` and `output_data_task` can happen before evaluation,
351+
# Technically 'output_model_task' and 'output_data_task' can happen before evaluation,
352352
# however the PVC can only be mounted once, so, setting these to _after_ so the eval proceeds.
353353
output_model_task = pvc_to_artifact_op(
354354
pvc_path="/output/data",
@@ -417,7 +417,7 @@ def gen_standalone():
417417
This function should be used when Kubeflow Pipelines are not available. It will generate a
418418
script that replicates the pipeline's functionality.
419419
420-
Example usage: ``` $ python pipeline.py gen-standalone ```
420+
Example usage: ''' $ python pipeline.py gen-standalone '''
421421
"""
422422
from os import path
423423

@@ -442,11 +442,11 @@ def gen_standalone():
442442

443443
# The list of executor names to extract details from to generate the standalone script
444444
executors = {
445-
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/input_data/generated", model="/input_model", processed_data="/input_data/processed_data")',
446-
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/input_data/taxonomy", sdg="/input_data/generated")',
445+
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/data/data", model="/data/model", processed_data="/data/processed_data")',
446+
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/data/taxonomy", sdg="/data/generated")',
447447
"exec-git-clone-op": {},
448-
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/input_model")',
449-
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/output/mt-bench-results.txt", models_list="/output/model/model/hf_format", models_path_prefix="/output/model/hf_format", max_workers="auto", merge_system_user_message=False)',
448+
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
449+
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_list="/data/model/model/hf_format", models_path_prefix="/data/model/hf_format", max_workers="auto", merge_system_user_message=False)',
450450
}
451451

452452
details = {}
@@ -621,9 +621,18 @@ def change_dsl_function_to_normal_function(rendered_code: list):
621621
"import kfp": "",
622622
"from kfp import dsl": "",
623623
"from kfp.dsl import *": "",
624-
".path": "", # super hacky, but works for now, the idea is that "taxonomy.path" is a string so we just remove the ".path" part
625624
}
626625

626+
import re
627+
628+
# Regular expression to match ".path" but not "os.path"
629+
path_pattern = re.compile(r"(?<!os)\.path")
630+
631+
def remove_path_not_os_path(line):
632+
return path_pattern.sub("", line)
633+
634+
rendered_code = [remove_path_not_os_path(line) for line in rendered_code]
635+
627636
for old, new in replacements.items():
628637
rendered_code = [line.replace(old, new) for line in rendered_code]
629638
return rendered_code[-1].strip()

pipeline.yaml

+6-6
Original file line numberDiff line numberDiff line change
@@ -589,7 +589,7 @@ deploymentSpec:
589589
\ )\n\n def data_processing(train_args: TrainingArgs) -> None:\n \
590590
\ # early validation logic here\n if train_args.max_batch_len\
591591
\ < train_args.max_seq_len:\n raise ValueError(\n \
592-
\ f\"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=}\
592+
\ f\"the 'max_batch_len' cannot be less than 'max_seq_len': {train_args.max_batch_len=}\
593593
\ < {train_args.max_seq_len=}\"\n )\n\n # process\
594594
\ the training data\n if not os.path.exists(train_args.data_output_dir):\n\
595595
\ os.makedirs(train_args.data_output_dir, exist_ok=True)\n \
@@ -1107,7 +1107,7 @@ deploymentSpec:
11071107
main\"\n\n ######################################################################\n\
11081108
\ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\
11091109
\ external judge model\n # and when that happens, much of this logic\
1110-
\ can be imported from the `evaluate` definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
1110+
\ can be imported from the 'evaluate' definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
11111111
\ #\n # With instructlab, model_name is synonymous with model_path\n\
11121112
\ mt_bench_evaluators = [\n MTBenchBranchEvaluator(\n \
11131113
\ model_name=candidate_model,\n judge_model_name=judge_model_name,\n\
@@ -1118,7 +1118,7 @@ deploymentSpec:
11181118
\ branch=base_branch,\n output_dir=output_dir,\n \
11191119
\ merge_system_user_message=merge_system_user_message,\n \
11201120
\ ),\n ]\n\n # ilab/evaluate uses a magic word for its mt_bench\
1121-
\ evaluator - `auto`\n # with `auto`, number of gpus allocated for serving\
1121+
\ evaluator - 'auto'\n # with 'auto', number of gpus allocated for serving\
11221122
\ is calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
11231123
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
11241124
\ = len(os.sched_getaffinity(0)) // 2\n except AttributeError:\n\
@@ -1197,7 +1197,7 @@ deploymentSpec:
11971197
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
11981198
\ *\n\ndef run_mt_bench_op(\n models_path_prefix: str,\n mt_bench_output:\
11991199
\ Output[Artifact],\n merge_system_user_message: bool,\n # generate_answers,judgment\
1200-
\ uses a magic word for its mt_bench evaluator - `auto`\n # with `auto`,\
1200+
\ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\
12011201
\ number of gpus allocated for serving is calculated based on environment\n\
12021202
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
12031203
\ max_workers: str,\n models_list: List[str] = None,\n models_folder:\
@@ -1215,7 +1215,7 @@ deploymentSpec:
12151215
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
12161216
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
12171217
)\n\n scores = {}\n all_mt_bench_data = []\n\n # generate_answers,judgment\
1218-
\ uses a magic word for its mt_bench evaluator - `auto`\n # with `auto`,\
1218+
\ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\
12191219
\ number of gpus allocated for serving is calculated based on environment\n\
12201220
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
12211221
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
@@ -1286,7 +1286,7 @@ deploymentSpec:
12861286
\ > 0) else \"empty\"\n\n print(\"Generating syntetic dataset for:\"\
12871287
)\n print()\n print(read_taxonomy(taxonomy.path, taxonomy_base))\n\
12881288
\n # generate_data has a magic word for its taxonomy_base argument -\
1289-
\ `empty`\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
1289+
\ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
12901290
\ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\
12911291
\ output_dir=sdg.path,\n taxonomy=taxonomy.path,\n \
12921292
\ taxonomy_base=taxonomy_base,\n model_name=model,\n chunk_word_count=1000,\n\

sdg/components.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def sdg_op(
5252
print()
5353
print(read_taxonomy(taxonomy.path, taxonomy_base))
5454

55-
# generate_data has a magic word for its taxonomy_base argument - `empty`
55+
# generate_data has a magic word for its taxonomy_base argument - 'empty'
5656
# it allows generating from the whole repo, see:
5757
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
5858
generate_data(

standalone/README.md

+4
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ The script requires information regarding the location and method for accessing
9898
* `--eval-serving-model-name`: The name of the model to use for evaluation. **Required**
9999
* `--eval-serving-model-api-key`: The API key for the model to evaluate. `EVAL_SERVING_MODEL_API_KEY`
100100
environment variable can be used as well. **Required**
101+
* `--force-pull`: Force pull the data (sdg data and model) from the object store even if it already
102+
exists in the PVC. **Optional** - Default: false.
103+
* `--training-1-epoch-num`: The number of epochs to train the model for phase 1. **Optional** - Default: 7.
104+
* `--training-2-epoch-num`: The number of epochs to train the model for phase 2. **Optional** - Default: 10.
101105

102106

103107
## Example End-To-End Workflow

0 commit comments

Comments
 (0)