Commit

update sdg generated line in knowledge-yaml
Signed-off-by: sallyom <[email protected]>
sallyom committed Oct 11, 2024
1 parent 8122f5b commit adaf720
Showing 3 changed files with 99 additions and 12 deletions.
33 changes: 33 additions & 0 deletions eval/final/components.py
@@ -199,6 +199,34 @@ def branch_eval_summary_to_json(

# MMLU_BRANCH

# This is very specific to `ilab generate`, necessary because the data generation and
# model evaluation are taking place in separate environments.
def update_test_lines_in_files(tasks_dir):
    # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*'
    regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")

    for root, dirs, files in os.walk(tasks_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            with open(file_path, "r") as file:
                lines = file.readlines()

            updated_lines = []
            changed = False

            for line in lines:
                # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact
                new_line = re.sub(regex, rf"\1{tasks_dir}/\2\3", line)
                if new_line != line:
                    changed = True # Only rewrite the file if there's a change
                updated_lines.append(new_line)

            if changed:
                with open(file_path, "w") as file:
                    file.writelines(updated_lines)
                print(f"Updated: {file_path}")

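For illustration, a minimal, self-contained sketch of what this substitution does to a single line; the sample path, file name, and replacement directory below are hypothetical, not taken from this commit:

import re

# Same pattern as in update_test_lines_in_files above.
regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")

# Hypothetical inputs: a 'test:' line written during SDG and the local tasks directory.
line = "  test: /tmp/sdg-run/node_datasets_2024-10-11/mmlu_pr_test.jsonl\n"
tasks_dir = "/mnt/eval/tasks"

new_line = re.sub(regex, rf"\1{tasks_dir}/\2\3", line)
# Leading whitespace, 'test:', the node_datasets_* component, and the trailing path
# are preserved; only the prefix before node_datasets_* is swapped for tasks_dir:
#   "  test: /mnt/eval/tasks/node_datasets_2024-10-11/mmlu_pr_test.jsonl\n"
print(new_line)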
# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_directory: str):
    import os
@@ -223,6 +251,11 @@ def find_node_dataset_directories(base_directory: str):
# generates a node_datasets_ directory for MMLU custom tasks data
if node_dataset_dirs:
    tasks_dir = node_dataset_dirs[0]
    # From `ilab sdg` the knowledge_*_task.yaml files have a line that references where the SDG took place.
    # This needs to be updated to run elsewhere.
    # The line is:
    # test: /path/to/where/sdg/occurred/node_datasets_*
    update_test_lines_in_files(tasks_dir)

    mmlu_branch_evaluators = [
        MMLUBranchEvaluator(
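For context, a rough, self-contained sketch of the discovery step that picks tasks_dir from the SDG output; the directory names are made up for the example, and the discovery loop simply mirrors find_node_dataset_directories above:

import os
import re
import tempfile

# Build a throwaway layout that loosely resembles the `ilab sdg` output described
# in the comments above: a node_datasets_* directory somewhere under the tasks path.
base = tempfile.mkdtemp()
os.makedirs(os.path.join(base, "node_datasets_2024-10-11"))
os.makedirs(os.path.join(base, "unrelated_dir"))

# Discovery equivalent to find_node_dataset_directories.
matching_dirs = []
for root, dirs, files in os.walk(base):
    for directory in dirs:
        if re.search(r"node_datasets_", directory):
            matching_dirs.append(os.path.join(root, directory))

# The evaluation code takes the first match as tasks_dir and then calls
# update_test_lines_in_files(tasks_dir) on it before building the MMLU branch evaluators.
tasks_dir = matching_dirs[0]
print(tasks_dir)  # e.g. /tmp/tmpXXXXXXXX/node_datasets_2024-10-11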
45 changes: 33 additions & 12 deletions pipeline.yaml
@@ -1103,18 +1103,39 @@ deploymentSpec:
\ gpu_available\n else \"No GPU available\"\n )\n gpu_count\
\ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\
\ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MMLU_BRANCH\n\
\n # find_node_dataset_directories to find sdg output node_datasets_*\n\
\ def find_node_dataset_directories(base_directory: str):\n import\
\ os\n import re\n\n # This is specific to ilab/eval output\n\
\ pattern = r\"node_datasets_\"\n matching_dirs = []\n \
\ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\
\ for directory in dirs:\n if regex.search(directory):\n\
\ matching_dirs.append(os.path.join(root, directory))\n\
\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \
\ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n # This\
\ assumes generated filesystem from ilab sdg, which\n # generates a node_datasets_\
\ directory for MMLU custom tasks data\n if node_dataset_dirs:\n \
\ tasks_dir = node_dataset_dirs[0]\n\n mmlu_branch_evaluators\
\n # This is very specific to `ilab generate`, necessary because the\
\ data generation and\n # model evaluation are taking place in separate\
\ environments.\n def update_test_lines_in_files(tasks_dir):\n \
\ # Define the regex to match lines starting with any indentation, 'test:',\
\ and containing 'node_datasets_*'\n regex = re.compile(r\"(\\s*test:\\\
s*).*/(node_datasets_[^/]*)(.*)\")\n\n for root, dirs, files in os.walk(tasks_dir):\n\
\ for file_name in files:\n file_path = os.path.join(root,\
\ file_name)\n\n with open(file_path, \"r\") as file:\n \
\ lines = file.readlines()\n\n updated_lines\
\ = []\n changed = False\n\n for line in lines:\n\
\ # Replace the matched line with the desired format,\
\ keeping 'test:' and leading whitespace intact\n new_line\
\ = re.sub(regex, rf\"\\1{tasks_dir}/\\2\\3\", line)\n \
\ if new_line != line:\n changed = True # Only\
\ rewrite the file if there's a change\n updated_lines.append(new_line)\n\
\n if changed:\n with open(file_path,\
\ \"w\") as file:\n file.writelines(updated_lines)\n\
\ print(f\"Updated: {file_path}\")\n\n # find_node_dataset_directories\
\ to find sdg output node_datasets_*\n def find_node_dataset_directories(base_directory:\
\ str):\n import os\n import re\n\n # This is specific\
\ to ilab/eval output\n pattern = r\"node_datasets_\"\n matching_dirs\
\ = []\n regex = re.compile(pattern)\n\n for root, dirs, files\
\ in os.walk(base_directory):\n for directory in dirs:\n \
\ if regex.search(directory):\n matching_dirs.append(os.path.join(root,\
\ directory))\n\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"\
]\n\n node_dataset_dirs = find_node_dataset_directories(tasks.path)\n\
\ # This assumes generated filesystem from ilab sdg, which\n # generates\
\ a node_datasets_ directory for MMLU custom tasks data\n if node_dataset_dirs:\n\
\ tasks_dir = node_dataset_dirs[0]\n # From `ilab sdg` the\
\ knowledge_*_task.yaml files have a line that references where the SDG\
\ took place.\n # This needs to be updated to run elsewhere.\n \
\ # The line is:\n # test: /path/to/where/sdg/occured/node_datasets_*\n\
\ update_test_lines_in_files(tasks_dir)\n\n mmlu_branch_evaluators\
\ = [\n MMLUBranchEvaluator(\n model_path=candidate_model,\n\
\ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\
\ few_shots=few_shots,\n batch_size=batch_size,\n\
33 changes: 33 additions & 0 deletions standalone/standalone.py
@@ -1767,6 +1767,34 @@ def branch_eval_summary_to_json(
# MMLU_BRANCH

# This is very specific to `ilab generate`, necessary because the data generation and
# model evaluation are taking place in separate environments.
def update_test_lines_in_files(tasks_dir):
    # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*'
    regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")

    for root, dirs, files in os.walk(tasks_dir):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            with open(file_path, "r") as file:
                lines = file.readlines()

            updated_lines = []
            changed = False

            for line in lines:
                # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact
                new_line = re.sub(regex, rf"\1{tasks_dir}/\2\3", line)
                if new_line != line:
                    changed = True # Only rewrite the file if there's a change
                updated_lines.append(new_line)

            if changed:
                with open(file_path, "w") as file:
                    file.writelines(updated_lines)
                print(f"Updated: {file_path}")
# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_directory: str):
    import os
@@ -1791,6 +1819,11 @@ def find_node_dataset_directories(base_directory: str):
# generates a node_datasets_ directory for MMLU custom tasks data
if node_dataset_dirs:
    tasks_dir = node_dataset_dirs[0]
    # From `ilab sdg` the knowledge_*_task.yaml files have a line that references where the SDG took place.
    # This needs to be updated to run elsewhere.
    # The line is:
    # test: /path/to/where/sdg/occurred/node_datasets_*
    update_test_lines_in_files(tasks_dir)

    mmlu_branch_evaluators = [
        MMLUBranchEvaluator(
