From 18e7e428ac1885f4957c8fefebdb80be188141e0 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Wed, 13 Nov 2024 17:20:53 -0500
Subject: [PATCH] fix: upsample the phase10 knowledge dataset

When we mix the knowledge dataset with skills today, we do not account
for the potential discrepancy in size between the generated knowledge
data and the skills data. This leads to the model potentially forgetting
the data it was trained on in the knowledge phase. As a simple
workaround, we upsample the knowledge samples before mixing them in
with the generated skills dataset.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/sdg/datamixing.py    | 96 +++++++++++++++++++++++++++-
 src/instructlab/sdg/generate_data.py | 14 +++-
 2 files changed, 105 insertions(+), 5 deletions(-)

diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
index 5172fdfb..e6ca8675 100644
--- a/src/instructlab/sdg/datamixing.py
+++ b/src/instructlab/sdg/datamixing.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0

 # Standard
-from typing import Dict, List, Optional
+from pathlib import Path
+from typing import Dict, List, Optional, TypedDict
 import json
 import logging
 import os.path
@@ -16,10 +17,24 @@
 from instructlab.sdg.utils import GenerateException, pandas
 from instructlab.sdg.utils.pandas import dataset_from_pandas_dataframe

+# XXX(osilkin): This value represents the ratio of knowledge to skills data below
+#               which we upsample the knowledge samples. This only applies
+#               when |knowledge| << |skills|.
+MIN_UPSAMPLE_THRESHOLD = 0.03
 ALLOWED_COLS = ["id", "messages", "metadata"]
 logger = logging.getLogger(__name__)


+class DatasetListing(TypedDict):
+    """
+    TypedDict class that represents the dataset listings passed around in the
+    `.datasets` key of each recipe.
+    """
+
+    sampling_size: float
+    path: str
+
+
 def _adjust_train_sample_size(ds: Dataset, num_samples: int):
     """
     Return a dataset with num_samples random samples selected from the
@@ -99,7 +114,7 @@ def __init__(

         # Defaults if no recipe path given or these values don't
         # exist in the given recipe file
-        self.datasets = []
+        self.datasets: List[DatasetListing] = []
         if recipe_path is not None:
             recipe = self._load_recipe()
             if "datasets" in recipe:
@@ -507,6 +522,47 @@ def _create_phase07_ds(
     return phase07


+def _total_length_of_datasets(datasets: List[DatasetListing]) -> int:
+    """
+    Iterate through the datasets and return the total number of samples
+    per the sampling ratio.
+
+    Args:
+        datasets (List[DatasetListing]): List containing `DatasetListing` entries.
+
+    Returns:
+        int: Combined length of all datasets in the given list.
+    """
+    total_length = 0
+    for dataset in datasets:
+        if "path" not in dataset:
+            # this really shouldn't happen because it'd be weird for a dataset
+            # listing to not have a path
+            continue
+
+        ds_path = Path(dataset["path"])
+        if not ds_path.exists():
+            # we should ideally error out here, but this was the existing functionality,
+            # so we should not introduce this type of error boundary as part of a workaround
+            continue
+
+        # Calculate the length of the dataset by reading the JSONL file, assuming every
+        # sample sits on its own line. Empty lines are not counted as samples (only the last line should be empty).
+        unscaled_length = sum(
+            1 for line in ds_path.read_text("utf-8").splitlines() if line.strip()
+        )
+        sampling_size = dataset["sampling_size"]
+        if isinstance(sampling_size, float):
+            total_length += int(unscaled_length * sampling_size)
+        elif isinstance(sampling_size, int):
+            total_length += sampling_size
+        else:
+            # maybe we should do nothing instead?
+            raise ValueError(f"invalid type for `sampling_size`: {type(sampling_size)}")
+
+    return total_length
+
+
 def _convert_to_leaf_node_messages(sample: dict, sys_prompt: str):
     """
     Convert a sample dictionary to contain a 'messages' column required
@@ -549,6 +605,10 @@ def __init__(
         num_procs,
         auxiliary_inst=None,
     ):
+        # HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
+        #                far exceeds the size of our knowledge samples. This will be removed in the future
+        #                in favor of a smarter way to do the upsampling.
+        self._precomputed_skills_length: int | None = None
         self.data_dirs = data_dirs
         self.output_dir = output_dir
         self.sys_prompt = sys_prompt
@@ -572,9 +632,17 @@ def _load_default_recipe(self, yaml_basename):
         for d in self.data_dirs:
             default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
             if os.path.exists(default_recipe_path):
-                return Recipe(
+                recipe = Recipe(
                     recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
                 )
+                if "skills" in yaml_basename and recipe.datasets:
+                    # HACK(osilkin): we need to balance out the knowledge so that it doesn't
+                    #                get drowned out in the skills dataset. This workaround lets us
+                    #                re-balance so that knowledge makes up at least 3% of the skills dataset.
+                    self._precomputed_skills_length = _total_length_of_datasets(
+                        recipe.datasets
+                    )
+                return recipe
         return Recipe(sys_prompt=self.sys_prompt)

     def _gen_leaf_node_data(
@@ -615,10 +683,32 @@ def collect(
             output_file_leaf_skills = (
                 f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"
            )
+            # HACK(osilkin): `sampling_size` is raised above 1.0 only when the generated knowledge data
+            #                is orders of magnitude smaller (approx. < 3%) than the skills dataset.
+            #                It upsamples the knowledge data so that the model doesn't forget it in training.
+            #
+            # This workaround is currently hacky because we lack insight into the size of both
+            # datasets when we generate this data, and it may vary across different scenarios.
+            sampling_size: int | float = 1.0
+            if self._precomputed_skills_length:
+                knowledge_to_skills_ratio = (
+                    len(skills_phase_data) / self._precomputed_skills_length
+                )
+                if knowledge_to_skills_ratio < MIN_UPSAMPLE_THRESHOLD:
+                    sampling_size = int(self._precomputed_skills_length * 0.03)
+
+                    logger.info(
+                        "\033[93mKnowledge detected to be less than %.2f%% of skills (%.2f%%), upsampling to: %d\033[0m",
+                        MIN_UPSAMPLE_THRESHOLD * 100,
+                        knowledge_to_skills_ratio * 100,
+                        sampling_size,
+                    )
+
             self._gen_leaf_node_data(
                 skills_phase_data,
                 self.skills_recipe,
                 output_file_leaf_skills,
+                sampling_size=sampling_size,
             )
         else:
             messages = new_generated_data.map(
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index cf65ae14..60969ca6 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -254,7 +254,13 @@ def load_pipeline(yaml_basename):
     )


-def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst, system_prompt):
+def _mixer_init(
+    ctx,
+    output_dir,
+    date_suffix,
+    knowledge_auxiliary_inst,
+    system_prompt,
+):
     data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
     data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

@@ -372,7 +378,11 @@ def generate_data(
         mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)

     mixer = _mixer_init(
-        ctx, output_dir, date_suffix, knowledge_pipe.auxiliary_inst, system_prompt
+        ctx,
+        output_dir,
+        date_suffix,
+        knowledge_pipe.auxiliary_inst,
+        system_prompt,
     )

     if console_output: