Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: remove phase mocking and faked packages #246

Merged
merged 1 commit into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Package initializer for `eval`: re-exports the two KFP evaluation
# components so callers can write `from eval import run_final_eval_op`
# instead of importing from the submodules directly.
from .final import run_final_eval_op
from .mt_bench import run_mt_bench_op

# Explicit public API of the package.
__all__ = ["run_final_eval_op", "run_mt_bench_op"]
2 changes: 1 addition & 1 deletion eval/final/components.py → eval/final.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
# pylint: disable=import-outside-toplevel,import-error

from kfp.dsl import Artifact, Output, component

Expand Down
5 changes: 0 additions & 5 deletions eval/final/__init__.py

This file was deleted.

3 changes: 2 additions & 1 deletion eval/mt_bench/components.py → eval/mt_bench.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error
# pylint: disable=import-outside-toplevel,import-error

from typing import NamedTuple, Optional

from kfp.dsl import component
Expand Down
5 changes: 0 additions & 5 deletions eval/mt_bench/__init__.py

This file was deleted.

879 changes: 413 additions & 466 deletions pipeline.py

Large diffs are not rendered by default.

83 changes: 41 additions & 42 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1528,24 +1528,24 @@ deploymentSpec:
\ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\
\ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \
\ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\
,\n sdg_sampling_size: float = 1.0,\n):\n import os\n from os import\
\ getenv, path\n\n import instructlab.sdg\n import openai\n import\
\ yaml\n\n api_key = getenv(\"api_key\")\n model = getenv(\"model\"\
)\n endpoint = getenv(\"endpoint\")\n\n sdg_ca_cert_path = getenv(\"\
SDG_CA_CERT_PATH\")\n use_tls = os.path.exists(sdg_ca_cert_path) and\
\ (\n os.path.getsize(sdg_ca_cert_path) > 0\n )\n if use_tls:\n\
\ import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert_path)\n\
\ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\
\ http_client=custom_http_client\n )\n else:\n client =\
\ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\
\ = \"main\" if repo_branch or (repo_pr and int(repo_pr) > 0) else \"empty\"\
\n\n print(\"Generating synthetic dataset for:\")\n print()\n print(\n\
\ instructlab.sdg.utils.taxonomy.read_taxonomy(\n taxonomy_path,\
\ taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\n \
\ )\n )\n\n # Generate synthetic dataset\n # 1.0 is the default\
\ size\n if sdg_sampling_size == 1.0:\n # generate_data has a\
\ magic word for its taxonomy_base argument - 'empty'\n # it allows\
\ generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
,\n sdg_sampling_size: float = 1.0,\n):\n import os\n import shutil\n\
\ import tempfile\n\n import instructlab.sdg\n import openai\n\
\ import xdg_base_dirs\n import yaml\n\n api_key = os.getenv(\"\
api_key\")\n model = os.getenv(\"model\")\n endpoint = os.getenv(\"\
endpoint\")\n\n sdg_ca_cert_path = os.getenv(\"SDG_CA_CERT_PATH\")\n\
\ use_tls = os.path.exists(sdg_ca_cert_path) and (\n os.path.getsize(sdg_ca_cert_path)\
\ > 0\n )\n if use_tls:\n import httpx\n\n custom_http_client\
\ = httpx.Client(verify=sdg_ca_cert_path)\n client = openai.OpenAI(\n\
\ base_url=endpoint, api_key=api_key, http_client=custom_http_client\n\
\ )\n else:\n client = openai.OpenAI(base_url=endpoint,\
\ api_key=api_key)\n\n taxonomy_base = \"main\" if repo_branch or (repo_pr\
\ and int(repo_pr) > 0) else \"empty\"\n\n print(\"Generating synthetic\
\ dataset for:\")\n print()\n print(\n instructlab.sdg.utils.taxonomy.read_taxonomy(\n\
\ taxonomy_path, taxonomy_base, document_output_dir=f\"{sdg_path}/documents\"\
\n )\n )\n\n # Generate synthetic dataset\n # 1.0 is the\
\ default size\n if sdg_sampling_size == 1.0:\n # generate_data\
\ has a magic word for its taxonomy_base argument - 'empty'\n # it\
\ allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ instructlab.sdg.generate_data(\n client=client,\n \
\ num_instructions_to_generate=num_instructions_to_generate,\n\
\ output_dir=sdg_path,\n taxonomy=taxonomy_path,\n\
Expand All @@ -1554,40 +1554,39 @@ deploymentSpec:
\ server_ctx_size=4096,\n )\n # Tweak precomputed skills\
\ data ratio if needed\n else:\n skills_recipe = \"/usr/share/instructlab/sdg/default_data_recipes/skills.yaml\"\
\n\n def set_precomputed_skills_data_ratio(sampling_size: float,\
\ skills_recipe: str):\n if path.exists(skills_recipe):\n \
\ with open(skills_recipe, \"r\", encoding=\"utf-8\") as file:\n\
\ skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\ skills_recipe: str):\n if os.path.exists(skills_recipe):\n\
\ with open(skills_recipe, \"r\", encoding=\"utf-8\") as\
\ file:\n skills_yaml = yaml.load(file, Loader=yaml.Loader)\n\
\n skills_yaml[\"datasets\"][0][\"sampling_size\"] = sampling_size\n\
\n with open(skills_recipe, \"w\", encoding=\"utf-8\") as\
\ file:\n yaml.dump(skills_yaml, file)\n\n try:\n\
\ set_precomputed_skills_data_ratio(\n sampling_size=sdg_sampling_size,\
\ skills_recipe=skills_recipe\n )\n except PermissionError:\n\
\ print(\"Failed to set precomputed skills data ratio: Permission\
\ denied\")\n print(\"Attempting to move default data recipes\
\ to temporary directory\")\n import os\n import shutil\n\
\ import tempfile\n\n import xdg_base_dirs\n\n \
\ # Create a temporary directory\n with tempfile.TemporaryDirectory()\
\ as temp_dir:\n # Create a default_data_recipes directory\n\
\ temp_dir = path.join(temp_dir, \"default_data_recipes\"\
)\n os.mkdir(temp_dir)\n\n # Copy default_data_recipes/skills.yaml\
\ to the temporary directory\n shutil.copy(skills_recipe,\
\ temp_dir)\n\n # Also copy the current pipeline directory\
\ to the temporary directory - it's a small\n # directory\
\ like 28KB\n # This isn't needed if the pipeline is either\
\ \"full\" or \"simple\" but it's future-proofing\n data_dirs\
\ = [\n os.path.join(str(dir), \"instructlab\", \"sdg\"\
)\n for dir in xdg_base_dirs.xdg_data_dirs()\n \
\ ]\n temp_pipeline_dir = path.join(temp_dir, \"\
pipeline\")\n os.mkdir(temp_pipeline_dir)\n \
\ for d in data_dirs:\n pipeline_path = os.path.join(d,\
\ \"pipelines\", pipeline)\n if os.path.exists(pipeline_path):\n\
\ shutil.copytree(\n pipeline_path,\n\
\ to temporary directory\")\n\n # Create a temporary directory\n\
\ with tempfile.TemporaryDirectory() as temp_dir:\n \
\ # Create a default_data_recipes directory\n temp_dir\
\ = os.path.join(temp_dir, \"default_data_recipes\")\n os.mkdir(temp_dir)\n\
\n # Copy default_data_recipes/skills.yaml to the temporary\
\ directory\n shutil.copy(skills_recipe, temp_dir)\n\n \
\ # Also copy the current pipeline directory to the temporary\
\ directory - it's a small\n # directory like 28KB\n \
\ # This isn't needed if the pipeline is either \"full\" or \"\
simple\" but it's future-proofing\n data_dirs = [\n \
\ os.path.join(str(dir), \"instructlab\", \"sdg\")\n \
\ for dir in xdg_base_dirs.xdg_data_dirs()\n \
\ ]\n temp_pipeline_dir = os.path.join(temp_dir, \"pipeline\"\
)\n os.mkdir(temp_pipeline_dir)\n for d in\
\ data_dirs:\n pipeline_path = os.path.join(d, \"pipelines\"\
, pipeline)\n if os.path.exists(pipeline_path):\n \
\ shutil.copytree(\n pipeline_path,\n\
\ temp_pipeline_dir,\n \
\ dirs_exist_ok=True,\n )\n \
\ break\n\n # Build new skills.yaml path\n \
\ new_skills_recipe = path.join(temp_dir, \"skills.yaml\")\n \
\ print(f\"New skills recipe path: {new_skills_recipe}\")\n\n\
\ # Override XDG_DATA_DIRS with the temporary directory\n\
\ new_skills_recipe = os.path.join(temp_dir, \"skills.yaml\")\n\
\ print(f\"New skills recipe path: {new_skills_recipe}\"\
)\n\n # Override XDG_DATA_DIRS with the temporary directory\n\
\ # This allows SDG to read the new skills.yaml since it's\
\ looking into XDG_DATA_DIRS\n # and looks for a default_data_recipes\
\ directory with a skills.yaml file\n os.environ[\"XDG_DATA_DIRS\"\
Expand Down
2 changes: 0 additions & 2 deletions sdg/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from . import faked
from .components import (
git_clone_op,
sdg_op,
Expand All @@ -11,5 +10,4 @@
"sdg_op",
"taxonomy_to_artifact_op",
"sdg_to_artifact_op",
"faked",
]
27 changes: 12 additions & 15 deletions sdg/components.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# type: ignore
# pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member
# pylint: disable=import-outside-toplevel,import-error
from typing import Optional

from kfp import dsl
Expand Down Expand Up @@ -38,17 +38,19 @@ def sdg_op(
sdg_sampling_size: float = 1.0,
):
import os
from os import getenv, path
import shutil
import tempfile

import instructlab.sdg
import openai
import xdg_base_dirs
import yaml

api_key = getenv("api_key")
model = getenv("model")
endpoint = getenv("endpoint")
api_key = os.getenv("api_key")
model = os.getenv("model")
endpoint = os.getenv("endpoint")

sdg_ca_cert_path = getenv("SDG_CA_CERT_PATH")
sdg_ca_cert_path = os.getenv("SDG_CA_CERT_PATH")
use_tls = os.path.exists(sdg_ca_cert_path) and (
os.path.getsize(sdg_ca_cert_path) > 0
)
Expand Down Expand Up @@ -94,7 +96,7 @@ def sdg_op(
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"

def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
if os.path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

Expand All @@ -110,16 +112,11 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile

import xdg_base_dirs

# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
temp_dir = os.path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)

# Copy default_data_recipes/skills.yaml to the temporary directory
Expand All @@ -132,7 +129,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
temp_pipeline_dir = os.path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
Expand All @@ -145,7 +142,7 @@ def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
break

# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
new_skills_recipe = os.path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")

# Override XDG_DATA_DIRS with the temporary directory
Expand Down
8 changes: 0 additions & 8 deletions sdg/faked/__init__.py

This file was deleted.

68 changes: 0 additions & 68 deletions sdg/faked/components.py

This file was deleted.

Loading
Loading