
Commit

Merge pull request #377 from RobotSail/upscale-factor
fix: upsample the phase10 knowledge dataset
mergify[bot] authored Nov 15, 2024
2 parents e8bbd06 + 18e7e42 commit f42ea19
Showing 2 changed files with 105 additions and 5 deletions.
96 changes: 93 additions & 3 deletions src/instructlab/sdg/datamixing.py
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from typing import Dict, List, Optional
from pathlib import Path
from typing import Dict, List, Optional, TypedDict
import json
import logging
import os.path
@@ -16,10 +17,24 @@
from instructlab.sdg.utils import GenerateException, pandas
from instructlab.sdg.utils.pandas import dataset_from_pandas_dataframe

# XXX(osilkin): This value is the knowledge-to-skills ratio below which we
# upsample the knowledge samples. It only applies when |knowledge| << |skills|.
MIN_UPSAMPLE_THRESHOLD = 0.03
ALLOWED_COLS = ["id", "messages", "metadata"]
logger = logging.getLogger(__name__)


class DatasetListing(TypedDict):
"""
TypedDict class that represents the dataset listings passed around in the
`.datasets` key of each recipe.
"""

sampling_size: float
path: str


def _adjust_train_sample_size(ds: Dataset, num_samples: int):
"""
Return a dataset with num_samples random samples selected from the
@@ -99,7 +114,7 @@ def __init__(

# Defaults if no recipe path given or these values don't
# exist in the given recipe file
self.datasets = []
self.datasets: List[DatasetListing] = []
if recipe_path is not None:
recipe = self._load_recipe()
if "datasets" in recipe:
@@ -507,6 +522,47 @@ def _create_phase07_ds(
return phase07


def _total_length_of_datasets(datasets: List[DatasetListing]) -> int:
"""
Iterate through the datasets and return the total number of samples
per the sampling ratio.
Args:
datasets (List[DatasetListing]): List containing `DatasetListing` entries.
Returns:
int: Combined length of all datasets in the given list.
"""
total_length = 0
for dataset in datasets:
if "path" not in dataset:
# this really shouldn't happen because it'd be weird for a dataset
# listing to not have a path
continue

ds_path = Path(dataset["path"])
if not ds_path.exists():
# we should ideally error out here, but this matches the existing behavior,
# so we avoid introducing a new error boundary as part of this workaround
continue

# Calculate the length of the dataset by reading the JSONL file, assuming every sample
# is on its own line. Empty lines are not counted as samples (only the trailing line should be empty).
unscaled_length = sum(
1 for l in ds_path.read_text("utf-8").splitlines() if l.strip()
)
sampling_size = dataset["sampling_size"]
if isinstance(sampling_size, float):
total_length += int(unscaled_length * sampling_size)
elif isinstance(sampling_size, int):
total_length += sampling_size
else:
# maybe we should do nothing instead?
raise ValueError(f"invalid type for `sampling_size`: {type(sampling_size)}")

return total_length


def _convert_to_leaf_node_messages(sample: dict, sys_prompt: str):
"""
Convert a sample dictionary to contain a 'messages' column required
@@ -549,6 +605,10 @@ def __init__(
num_procs,
auxiliary_inst=None,
):
# HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
# far exceeds the size of our knowledge samples. This will be removed in the future
# in favor of a smarter way to do the upsampling.
self._precomputed_skills_length: int | None = None
self.data_dirs = data_dirs
self.output_dir = output_dir
self.sys_prompt = sys_prompt
@@ -572,9 +632,17 @@ def _load_default_recipe(self, yaml_basename):
for d in self.data_dirs:
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
if os.path.exists(default_recipe_path):
return Recipe(
recipe = Recipe(
recipe_path=default_recipe_path, sys_prompt=self.sys_prompt
)
if "skills" in yaml_basename and recipe.datasets:
# HACK(osilkin): we need to balance the knowledge data so that it doesn't
# get drowned out by the skills dataset. This workaround re-balances the mix
# so that knowledge makes up at least 3% of the skills data.
self._precomputed_skills_length = _total_length_of_datasets(
recipe.datasets
)
return recipe
return Recipe(sys_prompt=self.sys_prompt)

def _gen_leaf_node_data(
@@ -615,10 +683,32 @@ def collect(
output_file_leaf_skills = (
f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"
)
# HACK(osilkin): `sampling_size` is used to upsample the knowledge dataset when the generated
# knowledge data is orders of magnitude smaller (approx. < 3%) than the skills dataset,
# so that the model doesn't forget it in training.
#
# This workaround is hacky because we lack insight into the size of both datasets
# when we generate this data, and it may vary across different scenarios.
sampling_size: int | float = 1.0
if self._precomputed_skills_length:
knowledge_to_skills_ratio = (
len(skills_phase_data) / self._precomputed_skills_length
)
if knowledge_to_skills_ratio < MIN_UPSAMPLE_THRESHOLD:
sampling_size = int(self._precomputed_skills_length * MIN_UPSAMPLE_THRESHOLD)

logger.info(
"\033[93mKnowledge detected to be less than %.2f%% of skills (%.2f%%), upsampling to: %d\033[0m",
MIN_UPSAMPLE_THRESHOLD * 100,
knowledge_to_skills_ratio * 100,
sampling_size,
)

self._gen_leaf_node_data(
skills_phase_data,
self.skills_recipe,
output_file_leaf_skills,
sampling_size=sampling_size,
)
else:
messages = new_generated_data.map(
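For illustration, a minimal runnable sketch of the upsampling rule that the collect() change above applies to the phase10 knowledge data. The sample counts are hypothetical and only the 3% threshold (MIN_UPSAMPLE_THRESHOLD) comes from the commit itself.

# Sketch of the phase10 upsampling rule; sample counts are hypothetical.
MIN_UPSAMPLE_THRESHOLD = 0.03


def knowledge_sampling_size(num_knowledge: int, num_precomputed_skills: int) -> int | float:
    """Mirror of the sampling_size decision in DataMixer.collect().

    If knowledge is less than 3% of the pre-computed skills data, upsample it
    to an absolute count of 3% of the skills size; otherwise keep the default
    sampling ratio of 1.0.
    """
    ratio = num_knowledge / num_precomputed_skills
    if ratio < MIN_UPSAMPLE_THRESHOLD:
        return int(num_precomputed_skills * MIN_UPSAMPLE_THRESHOLD)
    return 1.0


# 500 knowledge samples vs. 100,000 pre-computed skills samples is 0.5% < 3%,
# so the knowledge set is upsampled to an absolute size of 3,000 samples.
print(knowledge_sampling_size(500, 100_000))  # 3000
# 10,000 vs. 100,000 is 10%, above the threshold, so the ratio stays at 1.0.
print(knowledge_sampling_size(10_000, 100_000))  # 1.0
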
14 changes: 12 additions & 2 deletions src/instructlab/sdg/generate_data.py
@@ -273,7 +273,13 @@ def load_pipeline(yaml_basename)
)


def _mixer_init(ctx, output_dir, date_suffix, knowledge_auxiliary_inst, system_prompt):
def _mixer_init(
ctx,
output_dir,
date_suffix,
knowledge_auxiliary_inst,
system_prompt,
):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

@@ -391,7 +397,11 @@ def generate_data(
mmlu_bench_pipe = mmlubench_pipe_init(mmlu_ctx)

mixer = _mixer_init(
ctx, output_dir, date_suffix, knowledge_pipe.auxiliary_inst, system_prompt
ctx,
output_dir,
date_suffix,
knowledge_pipe.auxiliary_inst,
system_prompt,
)

if console_output:
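For reference, a runnable sketch of how the pre-computed skills length that drives the rule above is estimated from a recipe's dataset listings. The file name, contents, and sampling sizes below are hypothetical; the counting rule (non-empty JSONL lines, scaled by a float sampling_size or replaced outright by an int) follows _total_length_of_datasets in datamixing.py.

# Hypothetical example of counting pre-computed skills samples from dataset listings.
import json
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    skills_path = Path(tmp) / "precomputed_skills.jsonl"
    with skills_path.open("w", encoding="utf-8") as f:
        for i in range(10):  # 10 samples, one JSON object per line
            f.write(json.dumps({"id": i, "messages": [], "metadata": {}}) + "\n")

    listings = [
        {"path": str(skills_path), "sampling_size": 0.5},  # float: scale the count, yields 5
        {"path": str(skills_path), "sampling_size": 7},    # int: absolute count, yields 7
    ]

    total = 0
    for listing in listings:
        lines = Path(listing["path"]).read_text("utf-8").splitlines()
        unscaled = sum(1 for line in lines if line.strip())
        size = listing["sampling_size"]
        total += int(unscaled * size) if isinstance(size, float) else size

    print(total)  # 12, i.e. what DataMixer._precomputed_skills_length would hold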
