fix: upsample the phase10 knowledge dataset
When we mix the knowledge dataset with skills today, we do not account for the potential discrepancy
in size between the generated knowledge data and the skills data. This can lead to the model
forgetting the data it was trained on in the knowledge phase. As a simple workaround, we
upsample the knowledge samples before mixing them in with the generated skills dataset.
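
Below is a minimal sketch of the rebalancing rule this change applies. The function name and the
numbers are illustrative only; the actual implementation lives in datamixing.py in the diff below.

MIN_UPSAMPLE_THRESHOLD = 0.03  # knowledge/skills ratio below which we upsample


def choose_knowledge_sampling_size(num_knowledge: int, num_skills: int) -> int | float:
    """Pick the sampling size for the knowledge dataset before mixing (illustrative)."""
    ratio = num_knowledge / num_skills
    if ratio < MIN_UPSAMPLE_THRESHOLD:
        # upsample knowledge to roughly 3% of the pre-computed skills dataset
        return int(num_skills * MIN_UPSAMPLE_THRESHOLD)
    return 1.0  # otherwise keep the default 1:1 sampling ratio

Returning the float 1.0 in the default case mirrors how sampling sizes are interpreted downstream:
a float is treated as a ratio of a dataset's length, while an int is an absolute sample count.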

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Nov 15, 2024
1 parent b6f07a8 commit 6fb7222
Showing 2 changed files with 104 additions and 5 deletions.
95 changes: 92 additions & 3 deletions src/instructlab/sdg/datamixing.py
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from typing import Dict, List, Optional
from pathlib import Path
from typing import Dict, List, Optional, TypedDict
import json
import logging
import os.path
@@ -16,10 +17,24 @@
from instructlab.sdg.utils import GenerateException, pandas
from instructlab.sdg.utils.pandas import dataset_from_pandas_dataframe

# XXX(osilkin): This value represents the ratio of knowledge to skills data below which
#               we upsample the knowledge samples. This only applies when
#               |knowledge| << |skills|.
MIN_UPSAMPLE_THRESHOLD = 0.03
ALLOWED_COLS = ["id", "messages", "metadata"]
logger = logging.getLogger(__name__)


class DatasetListing(TypedDict):
"""
TypedDict class that represents the dataset listings passed around in the
`.datasets` key of each recipe.
"""

sampling_size: float
path: str


def _adjust_train_sample_size(ds: Dataset, num_samples: int):
"""
Return a dataset with num_samples random samples selected from the
@@ -99,7 +114,7 @@ def __init__(

        # Defaults if no recipe path given or these values don't
        # exist in the given recipe file
        self.datasets = []
        self.datasets: List[DatasetListing] = []
        if recipe_path is not None:
            recipe = self._load_recipe()
            if "datasets" in recipe:
@@ -507,6 +522,47 @@ def _create_phase07_ds(
    return phase07


def _total_length_of_datasets(datasets: List[DatasetListing]) -> int:
"""
Iterate through the datasets and return the total number of samples
per the the sampling ratio.
Args:
datasets (List[DatasetListing]): List containing `DatasetListing` entries.
Returns:
int: Combined length of all datasets in the given list.
"""
    total_length = 0
    for dataset in datasets:
        if "path" not in dataset:
            # this really shouldn't happen because it'd be weird for a dataset
            # listing to not have a path
            continue

        ds_path = Path(dataset["path"])
        if not ds_path.exists():
            # we should ideally error out here, but this was the existing functionality
            # so we should not introduce this type of error boundary for a hackaround
            continue

        # Calculate the length of the dataset by reading the JSONL file and counting one
        # sample per non-empty line. An empty line (normally only the last one) is not
        # counted as a sample.
        unscaled_length = sum(
            1 for l in ds_path.read_text("utf-8").splitlines() if l.strip()
        )
        sampling_size = dataset["sampling_size"]
        if isinstance(sampling_size, float):
            total_length += int(unscaled_length * sampling_size)
        elif isinstance(sampling_size, int):
            total_length += sampling_size
        else:
            # maybe we should do nothing instead?
            raise ValueError(f"invalid type for `sampling_size`: {type(sampling_size)}")

    return total_length
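
# Example (illustrative, not part of this commit): given two hypothetical listings
#   [{"path": "knowledge.jsonl", "sampling_size": 1.0},    # 1,000 non-empty lines
#    {"path": "skills.jsonl", "sampling_size": 25_000}]    # int = absolute sample count
# the function returns int(1_000 * 1.0) + 25_000 == 26_000.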


def _convert_to_leaf_node_messages(sample: dict, sys_prompt: str):
"""
Convert a sample dictionary to contain a 'messages' column required
@@ -549,6 +605,10 @@ def __init__(
        num_procs,
        auxiliary_inst=None,
    ):
        # HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
        #                far exceeds the size of our knowledge samples. This will be removed in the future
        #                in favor of a smarter way to do the upsampling.
        self._precomputed_skills_length: int | None = None
        self.data_dirs = data_dirs
        self.output_dir = output_dir
        self.sys_prompt = sys_prompt
@@ -572,9 +632,17 @@ def _load_default_recipe(self, yaml_basename):
        for d in self.data_dirs:
            default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
            if os.path.exists(default_recipe_path):
                return Recipe(
                recipe = Recipe(
                    recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
                )
if "skills" in yaml_basename and recipe.datasets:
# HACK(osilkin): we need to balance out the knowledge such that it doesn't
# get drowned out in the skills dataset. This workaround allows us
# to re-balance such that skills consists of at least 3% skills
self._precomputed_skills_length = _total_length_of_datasets(
recipe.datasets
)
return recipe
return Recipe(sys_prompt=self.sys_prompt)

def _gen_leaf_node_data(
@@ -615,10 +683,31 @@ def collect(
            output_file_leaf_skills = (
                f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"
            )
            # HACK(osilkin): `sampling_size` is currently used when the generated knowledge data
            #                is orders of magnitude smaller (approx. < 3%) than the skills dataset.
            #                It is used to upsample that dataset so that the model doesn't forget it in training.
            #
            #                This workaround is currently hacky as we lack insight into the size of both datasets
            #                when we generate this data, and it may vary across different scenarios.
            sampling_size: int | float = 1.0
            if self._precomputed_skills_length:
                knowledge_to_skills_ratio = (
                    len(skills_phase_data) / self._precomputed_skills_length
                )
                if knowledge_to_skills_ratio < MIN_UPSAMPLE_THRESHOLD:
                    sampling_size = int(self._precomputed_skills_length * 0.03)

                    print(
                        "\033[93m"
                        + f"Knowledge detected to be less than {MIN_UPSAMPLE_THRESHOLD * 100:.2f}% of skills ({knowledge_to_skills_ratio * 100:.2f}%), upsampling to: {sampling_size}"
                        + "\033[0m"
                    )
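
                    # Illustrative numbers (not part of this commit): with 900 generated knowledge
                    # samples (already formatted for phase 10) and 100,000 pre-computed skills
                    # samples, the ratio is 0.9% < 3%, so sampling_size becomes
                    # int(100_000 * 0.03) == 3_000 and the knowledge data is upsampled to
                    # 3,000 samples before mixing.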

            self._gen_leaf_node_data(
                skills_phase_data,
                self.skills_recipe,
                output_file_leaf_skills,
                sampling_size=sampling_size,
            )
        else:
            messages = new_generated_data.map(
14 changes: 12 additions & 2 deletions src/instructlab/sdg/generate_data.py
@@ -254,7 +254,13 @@ def load_pipeline(yaml_basename):
    )


def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst, system_prompt):
def _mixer_init(
    ctx,
    output_dir,
    date_suffix,
    knowledge_auxiliary_inst,
    system_prompt,
):
    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

@@ -372,7 +378,11 @@ def generate_data(
    mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)

    mixer = _mixer_init(
        ctx, output_dir, date_suffix, knowledge_pipe.auxiliary_inst, system_prompt
        ctx,
        output_dir,
        date_suffix,
        knowledge_pipe.auxiliary_inst,
        system_prompt,
    )

    if console_output:
