Split generate_data into multiple discrete steps

This doesn't move things out into separate files yet, but it does split the existing functionality of `generate_data` into multiple discrete steps and changes `generate_data` to just call those steps. This is a step towards cleaner separation between the steps and creating top-level Python APIs for each discrete step for advanced use-cases that don't just want an entire single-step generation pipeline. Signed-off-by: Ben Browning <[email protected]>
instructlab · Dec 11, 2024 · b17b08d · b17b08d
1 parent f11b365
commit b17b08d
Show file tree

Hide file tree

Showing 8 changed files with 378 additions and 187 deletions.
diff --git a/src/instructlab/sdg/__init__.py b/src/instructlab/sdg/__init__.py
@@ -29,7 +29,7 @@
     "FULL_PIPELINES_PACKAGE",
     "SIMPLE_PIPELINES_PACKAGE",
     "generate_data",
-    "taxonomy_to_samples",
+    "preprocess_taxonomy",
 )
 
 # Local
@@ -62,6 +62,6 @@
     PipelineContext,
 )
 from .registry import BlockRegistry, PromptRegistry
-from .taxonomy import taxonomy_to_samples
+from .taxonomy import preprocess_taxonomy
 from .utils import GenerateException
 from .utils.taxonomy import TaxonomyReadingException
diff --git a/...nstructlab/sdg/cli/taxonomy_to_samples.py → ...nstructlab/sdg/cli/preprocess_taxonomy.py b/...nstructlab/sdg/cli/taxonomy_to_samples.py → ...nstructlab/sdg/cli/preprocess_taxonomy.py
@@ -8,7 +8,7 @@
     DEFAULT_CHUNK_WORD_COUNT,
     DEFAULT_SERVER_CTX_SIZE,
     DEFAULT_TAXONOMY_BASE,
-    taxonomy_to_samples,
+    preprocess_taxonomy,
 )
 from instructlab.sdg.utils.logging import setup_logger
 
@@ -68,7 +68,7 @@
 
     args = parser.parse_args()
     setup_logger(args.log_level)
-    taxonomy_to_samples(
+    preprocess_taxonomy(
         args.taxonomy_path,
         args.output_dir,
         chunk_word_count=args.chunk_word_count,
@@ -78,5 +78,5 @@
     )
 
 """
-python -m instructlab.sdg.cli.taxonomy_to_samples --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
+python -m instructlab.sdg.cli.preprocess_taxonomy --taxonomy-path /path/to/my/taxonomy --output-dir /path/to/my/output
 """
diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py
@@ -160,7 +160,7 @@ def _create_mixed_dataset(self, num_proc):
         Create the final mixed dataset by loading, sampling, and
         concatenating all datasets in this recipe
         """
-        if not self.dataset_added:
+        if not self.datasets:
             logger.error("No dataset added to the recipe")
 
         mixed_ds = self._load_and_sample_datasets(num_proc)
@@ -726,19 +726,36 @@ def collect(
                 sampling_size=self.NUM_SYNTH_SKILLS,
             )
 
+    def _write_mixed_recipe(self, recipe, output_file_recipe):
+        """
+        Write the recipes created during data mixing without writing the actual
+        mixed datasets to disk.
+        """
+        full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
+        recipe.save_recipe(full_recipe_path)
+
     def _gen_mixed_data(self, recipe, output_file_recipe, output_file_data):
         """
         Mix the generated leaf node data into a single dataset and write it to
         disk. The heavy lifting is delegated to the Recipe class.
         """
+        self._write_mixed_recipe(recipe, output_file_recipe)
         if recipe.dataset_added:
-            full_recipe_path = os.path.join(self.output_dir, output_file_recipe)
-            recipe.save_recipe(full_recipe_path)
             recipe.save_mixed_dataset(
                 os.path.join(self.output_dir, output_file_data),
                 self.num_procs,
             )
 
+    def write_recipes(self):
+        self._write_mixed_recipe(
+            self.knowledge_recipe,
+            self.output_file_knowledge_recipe,
+        )
+        self._write_mixed_recipe(
+            self.skills_recipe,
+            self.output_file_skills_recipe,
+        )
+
     def generate(self):
         self._gen_mixed_data(
             self.knowledge_recipe,