Skip to content

Commit

Permalink
fix: upsample the phase10 knowledge dataset
Browse files Browse the repository at this point in the history
When we mix the knowledge dataset with skills today, we do not account for the potential discrepancy
in size between the generated knowledge data and skills data. This leads to the models potentially
forgetting the data it was trained on in the knowledge phase. As a simple workaround, we simply
upsample the knowledge samples before mixing them in with the generated skills dataset.

Signed-off-by: Oleg S <[email protected]>
  • Loading branch information
RobotSail committed Nov 15, 2024
1 parent b6f07a8 commit 18e7e42
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 5 deletions.
96 changes: 93 additions & 3 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from typing import Dict, List, Optional
from pathlib import Path
from typing import Dict, List, Optional, TypedDict
import json
import logging
import os.path
Expand All @@ -16,10 +17,24 @@
from instructlab.sdg.utils import GenerateException, pandas
from instructlab.sdg.utils.pandas import dataset_from_pandas_dataframe

# XXX(osilkin): This value represents the ratio between knowledge & skills data
# below which we upsample knowledge samples from. This only applies
# when |knowledge| << |skills|
MIN_UPSAMPLE_THRESHOLD = 0.03
ALLOWED_COLS = ["id", "messages", "metadata"]
logger = logging.getLogger(__name__)


class DatasetListing(TypedDict):
"""
TypedDict class that represents the dataset listings passed around in the
`.datasets` key of each recipe.
"""

sampling_size: float
path: str


def _adjust_train_sample_size(ds: Dataset, num_samples: int):
"""
Return a dataset with num_samples random samples selected from the
Expand Down Expand Up @@ -99,7 +114,7 @@ def __init__(

# Defaults if no recipe path given or these values don't
# exist in the given recipe file
self.datasets = []
self.datasets: List[DatasetListing] = []
if recipe_path is not None:
recipe = self._load_recipe()
if "datasets" in recipe:
Expand Down Expand Up @@ -507,6 +522,47 @@ def _create_phase07_ds(
return phase07


def _total_length_of_datasets(datasets: List[DatasetListing]) -> int:
"""
Iterate through the datasets and return the total number of samples
per the the sampling ratio.
Args:
datasets (List[DatasetListing]): List containing `DatasetListing` entries.
Returns:
int: Combined length of all datasets in the given list.
"""
total_length = 0
for dataset in datasets:
if "path" not in dataset:
# this really shouldn't happen because it'd be weird for a dataset
# listing to not have a path
continue

ds_path = Path(dataset["path"])
if not ds_path.exists():
# we should ideally error out here, but this was the existing functionality
# so we should not introduce this type of error boundary for a hackaround
continue

# Calculate the length of dataset by reading in the JSONL file and assuming every sample
# is on a single line. Assume also if a line is empty then it is not a sample (should only be the last line).
unscaled_length = sum(
1 for l in ds_path.read_text("utf-8").splitlines() if l.strip()
)
sampling_size = dataset["sampling_size"]
if isinstance(sampling_size, float):
total_length += int(unscaled_length * sampling_size)
elif isinstance(sampling_size, int):
total_length += sampling_size
else:
# maybe we should do nothing instead?
raise ValueError(f"invalid type for `sampling_size`: {type(sampling_size)}")

return total_length


def _convert_to_leaf_node_messages(sample: dict, sys_prompt: str):
"""
Convert a sample dictionary to contain a 'messages' column required
Expand Down Expand Up @@ -549,6 +605,10 @@ def __init__(
num_procs,
auxiliary_inst=None,
):
# HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
# far exceeds the size of our knowledge samples. This will be removed in the future
# in favor of a smarter way to do the upsampling.
self._precomputed_skills_length: int | None = None
self.data_dirs = data_dirs
self.output_dir = output_dir
self.sys_prompt = sys_prompt
Expand All @@ -572,9 +632,17 @@ def _load_default_recipe(self, yaml_basename):
for d in self.data_dirs:
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
if os.path.exists(default_recipe_path):
return Recipe(
recipe = Recipe(
recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
)
if "skills" in yaml_basename and recipe.datasets:
# HACK(osilkin): we need to balance out the knowledge such that it doesn't
# get drowned out in the skills dataset. This workaround allows us
# to re-balance such that skills consists of at least 3% skills
self._precomputed_skills_length = _total_length_of_datasets(
recipe.datasets
)
return recipe
return Recipe(sys_prompt=self.sys_prompt)

def _gen_leaf_node_data(
Expand Down Expand Up @@ -615,10 +683,32 @@ def collect(
output_file_leaf_skills = (
f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"
)
# HACK(osilkin): `knowledge_upsample_amount` is currently used when the generated knowledge data
# is orders of magnitude smaller (approx. < 3%) than the skills dataset.
# It is used to upsample that dataset so that the model doesn't forget it in training.
#
# This work around is currently hacky as we lack insight into the size of both datasets
# when we generate this data, and it may vary across different scenarios
sampling_size: int | float = 1.0
if self._precomputed_skills_length:
knowledge_to_skills_ratio = (
len(skills_phase_data) / self._precomputed_skills_length
)
if knowledge_to_skills_ratio < MIN_UPSAMPLE_THRESHOLD:
sampling_size = int(self._precomputed_skills_length * 0.03)

logger.info(
"\033[93mKnowledge detected to be less than %.2f%% of skills (%.2f%%), upsampling to: %d\033[0m",
MIN_UPSAMPLE_THRESHOLD * 100,
knowledge_to_skills_ratio * 100,
sampling_size,
)

self._gen_leaf_node_data(
skills_phase_data,
self.skills_recipe,
output_file_leaf_skills,
sampling_size=sampling_size,
)
else:
messages = new_generated_data.map(
Expand Down
14 changes: 12 additions & 2 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,13 @@ def load_pipeline(yaml_basename):
)


def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst, system_prompt):
def _mixer_init(
ctx,
output_dir,
date_suffix,
knowledge_auxiliary_inst,
system_prompt,
):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

Expand Down Expand Up @@ -372,7 +378,11 @@ def generate_data(
mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)

mixer = _mixer_init(
ctx, output_dir, date_suffix, knowledge_pipe.auxiliary_inst, system_prompt
ctx,
output_dir,
date_suffix,
knowledge_pipe.auxiliary_inst,
system_prompt,
)

if console_output:
Expand Down

0 comments on commit 18e7e42

Please sign in to comment.