Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow configurable skills recipe #432

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## v0.7.0

### Features

* Ability to configure skills recipe. This is managed through the `skills_recipe_path` option, which specifies the file path to a YAML file. This feature is helpful for adjusting the sample size.

## v0.6.1

### Fixes
Expand All @@ -6,7 +12,7 @@

## v0.6.0

### Features

Check failure on line 15 in CHANGELOG.md

View workflow job for this annotation

GitHub Actions / markdown-lint

Multiple headings with the same content

CHANGELOG.md:15 MD024/no-duplicate-heading Multiple headings with the same content [Context: "Features"] https://github.com/DavidAnson/markdownlint/blob/v0.36.1/doc/md024.md

* Small knowledge datasets will automatically get upsampled during final data mixing based on the length of any precomputed skills datasets used during data mixing. This avoids issues where very large precomputed skills datasets were swamping the comparatively minor number of knowledge samples, resulting in lower than optimal knowledge retention during multiphase training. If a large precomputed dataset isn't in use during mixing (which is how things operate by default), this change is a no-op.
* When chunking PDF documents, we'll now look for the docling models on-disk in `$XDG_DATA_HOME/instructlab/sdg/models` (as well as `$XDG_DATA_DIRS` with the same `instructlab/sdg/models` subdirectory). If they are not found on disk, they'll automatically be downloaded from HuggingFace.
Expand Down
45 changes: 40 additions & 5 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ def __init__(
sys_prompt,
num_procs,
auxiliary_inst=None,
skills_recipe_path=None,
):
# HACK(osilkin): This is used to upsample the knowledge dataset when the **pre-computed** skills dataset
# far exceeds the size of our knowledge samples. This will be removed in the future
Expand All @@ -616,19 +617,32 @@ def __init__(
self.num_procs = num_procs
self.auxiliary_inst = auxiliary_inst

self.knowledge_recipe = self._load_default_recipe("knowledge.yaml")
self.skills_recipe = self._load_default_recipe("skills.yaml")
self.knowledge_recipe = self._load_recipe("knowledge.yaml")
self.skills_recipe = self._load_recipe("skills.yaml", skills_recipe_path)

self.output_file_knowledge_recipe = f"knowledge_recipe_{date_suffix}.yaml"
self.output_file_skills_recipe = f"skills_recipe_{date_suffix}.yaml"
self.output_file_mixed_knowledge = f"knowledge_train_msgs_{date_suffix}.jsonl"
self.output_file_mixed_skills = f"skills_train_msgs_{date_suffix}.jsonl"

def _load_default_recipe(self, yaml_basename):
def _load_recipe(self, yaml_basename, recipe_path=None):
"""
Load a default system recipe from e.g. /usr/share/instructlab/sdg/default_data_recipes
if it exists, otherwise return an empty recipe.
Load a recipe from the specified file path or fallback to the default recipe. If the default
recipe is not present in any of the data directories, an empty recipe is returned.

Args:
yaml_basename (str): The base name of the default recipe YAML file.
recipe_path (str, optional): Full path to a custom recipe file. Defaults to None.

Returns:
Recipe: Loaded recipe object.

Raises:
GenerateException: If the provided path is a directory or does not exist.
"""
if recipe_path:
_validate_yaml_file_path(recipe_path)
return Recipe(recipe_path=recipe_path, sys_prompt=self.sys_prompt)
for d in self.data_dirs:
default_recipe_path = os.path.join(d, "default_data_recipes", yaml_basename)
if os.path.exists(default_recipe_path):
Expand Down Expand Up @@ -750,3 +764,24 @@ def generate(self):
self.output_file_skills_recipe,
self.output_file_mixed_skills,
)


def _validate_yaml_file_path(file_path):
try:
# Existing checks
with open(file_path, "r", encoding="utf-8") as f:
yaml.safe_load(f) # Ensure valid YAML format
except IsADirectoryError as e:
raise GenerateException(
f"Recipe path is a directory, not a file: {file_path}"
) from e
except PermissionError as e:
raise GenerateException(
f"Permission denied for recipe file: {file_path}"
) from e
except FileNotFoundError as e:
raise GenerateException(f"Recipe file not found: {file_path}") from e
except yaml.YAMLError as e:
raise GenerateException(
f"Invalid YAML format in recipe file: {file_path}"
) from e
5 changes: 5 additions & 0 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def _mixer_init(
date_suffix,
knowledge_auxiliary_inst,
system_prompt,
skills_recipe_path=None,
):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
Expand All @@ -282,6 +283,7 @@ def _mixer_init(
system_prompt,
ctx.dataset_num_procs,
knowledge_auxiliary_inst,
skills_recipe_path,
)


Expand All @@ -308,6 +310,7 @@ def generate_data(
batch_size: Optional[int] = None,
checkpoint_dir: Optional[str] = None,
max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
skills_recipe_path: Optional[str] = None,
) -> None:
"""Generate data for training and testing a model.

Expand All @@ -322,6 +325,7 @@ def generate_data(
or an absolute path to a directory containing the pipeline YAML files.
We expect three files to be present in this directory: "knowledge.yaml",
"freeform_skills.yaml", and "grounded_skills.yaml".
skills_recipe_path: Path to a YAML file containing the skills recipe.
"""
generate_start = time.time()

Expand Down Expand Up @@ -390,6 +394,7 @@ def generate_data(
date_suffix,
knowledge_pipe.auxiliary_inst,
system_prompt,
skills_recipe_path,
)

if console_output:
Expand Down
77 changes: 77 additions & 0 deletions tests/test_datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@
from importlib import resources
from unittest.mock import patch
import os
import pathlib

# Third Party
from datasets import Dataset
import pytest
import yaml

# First Party
from instructlab.sdg.datamixing import DataMixer, Recipe, _add_extra_contexts_to_samples
from instructlab.sdg.utils import GenerateException

# We mock out the actual things that use num_procs anyway, but just
# for a consistent value in the tests...
Expand Down Expand Up @@ -55,6 +59,79 @@ def test_datamixer_can_load_default_recipes():
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"


@pytest.mark.parametrize(
"skills_recipe_case", ["directory", "non_existent", "invalid_yaml", "valid"]
)
def test_datamixer_can_load_specifix_recipe_file(
tmp_path: pathlib.Path, skills_recipe_case
):
"""
Test that DataMixer can load a given recipe file by pointing
it at a simple set of test recipe file.
This test checks when the skills_recipe_path is a directory,
does not exist, or is a valid file.
"""
date_suffix = "2024-07-25T15_52_10"
prompt = "You are a useful AI assistant."

if skills_recipe_case == "valid":
skills_recipe_path = tmp_path / "skills_recipe_path.yaml"
skills_recipe_path.write_text("""
datasets:
- path: "test/skills.jsonl"
sampling_size: 1.0
""")
elif skills_recipe_case == "directory":
skills_recipe_path = tmp_path / "skills_recipe_dir"
skills_recipe_path.mkdir()
elif skills_recipe_case == "invalid_yaml":
skills_recipe_path = tmp_path / "invalid_yaml_recipe.yaml"
skills_recipe_path.write_text("[invalid_yaml")
else: # non_existent
skills_recipe_path = tmp_path / "non_existent_recipe.yaml"

if skills_recipe_case == "valid":
mixer = DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)
assert mixer.skills_recipe.datasets[0]["path"] == "test/skills.jsonl"
elif skills_recipe_case == "directory":
with pytest.raises(GenerateException, match="Recipe path is a directory"):
DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)
elif skills_recipe_case == "invalid_yaml":
with pytest.raises(GenerateException, match="Invalid YAML format"):
DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)
else: # non_existent
with pytest.raises(GenerateException, match="Recipe file not found"):
DataMixer(
[TEST_DATA_DIR],
TEST_DATA_DIR,
date_suffix,
prompt,
TEST_NUM_PROCS,
skills_recipe_path=str(skills_recipe_path),
)


def test_recipe_init_with_empty_params_adds_dataset():
"""
Test that an empty-initialized recipe can add datasets
Expand Down
Loading