instructlab · leseb · Dec 9, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## v0.7.0
+
+### Features
+
+* Ability to configure skills recipe. This is managed through the `skills_recipe_path` option, which specifies the file path to a YAML file. This feature is helpful for adjusting the sample size.
+
 ## v0.6.1
 
 ### Fixes
@@ -6,7 +12,7 @@

 ## v0.6.0

 ### Features

 * Small knowledge datasets will automatically get upsampled during final data mixing based on the length of any precomputed skills datasets used during data mixing. This avoids issues where very large precomputed skills datasets were swamping the comparatively minor number of knowledge samples, resulting in lower than optimal knowledge retention during multiphase training. If a large precomputed dataset isn't in use during mixing (which is how things operate by default), this change is a no-op.
 * When chunking PDF documents, we'll now look for the docling models on-disk in `$XDG_DATA_HOME/instructlab/sdg/models` (as well as `$XDG_DATA_DIRS` with the same `instructlab/sdg/models` subdirectory). If they are not found on disk, they'll automatically be downloaded from HuggingFace.

diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
@@ -604,6 +604,7 @@ def __init__(
         sys_prompt,
         num_procs,
         auxiliary_inst=None,
+        skills_recipe_path=None,
     ):
         # HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
         #                far exceeds the size of our knowledge samples. This will be removed in the future
@@ -616,19 +617,32 @@ def __init__(
         self.num_procs = num_procs
         self.auxiliary_inst = auxiliary_inst
 
-        self.knowledge_recipe = self._load_default_recipe("knowledge.yaml")
-        self.skills_recipe = self._load_default_recipe("skills.yaml")
+        self.knowledge_recipe = self._load_recipe("knowledge.yaml")
+        self.skills_recipe = self._load_recipe("skills.yaml", skills_recipe_path)
 
         self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml"
         self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml"
         self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl"
         self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl"
 
-    def _load_default_recipe(self, yaml_basename):
+    def _load_recipe(self, yaml_basename, recipe_path=None):
         """
-        Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes
-        if it exists, otherwise return an empty recipe.
+        Load a recipe from the specified file path or fallback to the default recipe. If the default
+        recipe is not present in any of the data directories, an empty recipe is returned.
+
+        Args:
+            yaml_basename (str): The base name of the default recipe YAML file.
+            recipe_path (str, optional): Full path to a custom recipe file. Defaults to None.
+
+        Returns:
+            Recipe: Loaded recipe object.
+
+        Raises:
+            GenerateException: If the provided path is a directory or does not exist.
         """
+        if recipe_path:
+            _validate_yaml_file_path(recipe_path)
+            return Recipe(recipe_path=recipe_path, sys_prompt=self.sys_prompt)
         for d in self.data_dirs:
             default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
             if os.path.exists(default_recipe_path):
@@ -750,3 +764,24 @@ def generate(self):
             self.output_file_skills_recipe,
             self.output_file_mixed_skills,
         )
+
+
+def _validate_yaml_file_path(file_path):
+    try:
+        # Existing checks
+        with open(file_path, "r", encoding="utf-8") as f:
+            yaml.safe_load(f)  # Ensure valid YAML format
+    except IsADirectoryError as e:
+        raise GenerateException(
+            f"Recipe path is a directory, not a file: {file_path}"
+        ) from e
+    except PermissionError as e:
+        raise GenerateException(
+            f"Permission denied for recipe file: {file_path}"
+        ) from e
+    except FileNotFoundError as e:
+        raise GenerateException(f"Recipe file not found: {file_path}") from e
+    except yaml.YAMLError as e:
+        raise GenerateException(
+            f"Invalid YAML format in recipe file: {file_path}"
+        ) from e
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
@@ -271,6 +271,7 @@ def _mixer_init(
     date_suffix,
     knowledge_auxiliary_inst,
     system_prompt,
+    skills_recipe_path=None,
 ):
     data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
     data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
@@ -282,6 +283,7 @@ def _mixer_init(
         system_prompt,
         ctx.dataset_num_procs,
         knowledge_auxiliary_inst,
+        skills_recipe_path,
     )
 
 
@@ -308,6 +310,7 @@ def generate_data(
     batch_size: Optional[int] = None,
     checkpoint_dir: Optional[str] = None,
     max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+    skills_recipe_path: Optional[str] = None,
 ) -> None:
     """Generate data for training and testing a model.
 
@@ -322,6 +325,7 @@ def generate_data(
                   or an absolute path to a directory containing the pipeline YAML files.
                   We expect three files to be present in this directory: "knowledge.yaml",
                     "freeform_skills.yaml", and "grounded_skills.yaml".
+        skills_recipe_path: Path to a YAML file containing the skills recipe.
     """
     generate_start = time.time()
 
@@ -390,6 +394,7 @@ def generate_data(
         date_suffix,
         knowledge_pipe.auxiliary_inst,
         system_prompt,
+        skills_recipe_path,
     )
 
     if console_output:

diff --git a/tests/test_datamixing.py b/tests/test_datamixing.py
@@ -8,12 +8,16 @@
 from importlib import resources
 from unittest.mock import patch
 import os
+import pathlib
 
 # Third Party
 from datasets import Dataset
+import pytest
+import yaml
 
 # First Party
 from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples
+from instructlab.sdg.utils import GenerateException
 
 # We mock out the actual things that use num_procs anyway, but just
 # for a consistent value in the tests...
@@ -55,6 +59,79 @@ def test_datamixer_can_load_default_recipes():
     assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"
 
 
+@pytest.mark.parametrize(
+    "skills_recipe_case", ["directory", "non_existent", "invalid_yaml", "valid"]
+)
+def test_datamixer_can_load_specifix_recipe_file(
+    tmp_path: pathlib.Path, skills_recipe_case
+):
+    """
+    Test that DataMixer can load a given recipe file by pointing
+    it at a simple set of test recipe file.
+    This test checks when the skills_recipe_path is a directory,
+    does not exist, or is a valid file.
+    """
+    date_suffix = "2024-07-25T15_52_10"
+    prompt = "You are a useful AI assistant."
+
+    if skills_recipe_case == "valid":
+        skills_recipe_path = tmp_path / "skills_recipe_path.yaml"
+        skills_recipe_path.write_text("""
+        datasets:
+          - path: "test/skills.jsonl"
+            sampling_size: 1.0
+        """)
+    elif skills_recipe_case == "directory":
+        skills_recipe_path = tmp_path / "skills_recipe_dir"
+        skills_recipe_path.mkdir()
+    elif skills_recipe_case == "invalid_yaml":
+        skills_recipe_path = tmp_path / "invalid_yaml_recipe.yaml"
+        skills_recipe_path.write_text("[invalid_yaml")
+    else:  # non_existent
+        skills_recipe_path = tmp_path / "non_existent_recipe.yaml"
+
+    if skills_recipe_case == "valid":
+        mixer = DataMixer(
+            [TEST_DATA_DIR],
+            TEST_DATA_DIR,
+            date_suffix,
+            prompt,
+            TEST_NUM_PROCS,
+            skills_recipe_path=str(skills_recipe_path),
+        )
+        assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"
+    elif skills_recipe_case == "directory":
+        with pytest.raises(GenerateException, match="Recipe path is a directory"):
+            DataMixer(
+                [TEST_DATA_DIR],
+                TEST_DATA_DIR,
+                date_suffix,
+                prompt,
+                TEST_NUM_PROCS,
+                skills_recipe_path=str(skills_recipe_path),
+            )
+    elif skills_recipe_case == "invalid_yaml":
+        with pytest.raises(GenerateException, match="Invalid YAML format"):
+            DataMixer(
+                [TEST_DATA_DIR],
+                TEST_DATA_DIR,
+                date_suffix,
+                prompt,
+                TEST_NUM_PROCS,
+                skills_recipe_path=str(skills_recipe_path),
+            )
+    else:  # non_existent
+        with pytest.raises(GenerateException, match="Recipe file not found"):
+            DataMixer(
+                [TEST_DATA_DIR],
+                TEST_DATA_DIR,
+                date_suffix,
+                prompt,
+                TEST_NUM_PROCS,
+                skills_recipe_path=str(skills_recipe_path),
+            )
+
+
 def test_recipe_init_with_empty_params_adds_dataset():
     """
     Test that an empty-initialized recipe can add datasets