From c2be3eabb8573423f193f392e8df8ad10301e9f0 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Fri, 8 Nov 2024 17:01:57 -0500
Subject: [PATCH 01/12] Update generate_data.py to add docling model path

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 9b8fb0bb6d7bb5ff12d8760f5b40915eb07ce31b)
---
 src/instructlab/sdg/generate_data.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index cf65ae14..2235be5e 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -14,6 +14,7 @@
 # Third Party
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 import openai
 
 # First Party
@@ -41,7 +42,6 @@
 
 _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant."
 
-
 def _unescape(s):
     return bytes(s, "utf-8").decode("utf-8").strip()
 
@@ -210,15 +210,28 @@ def _context_init(
         **extra_kwargs,
     )
 
-
 def _sdg_init(ctx, pipeline):
     pipeline_pkg = None
 
     # Search for the pipeline in User and Site data directories
     # then for a package defined pipeline
     # and finally pipelines referenced by absolute path
-    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
-    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
+    data_dir = os.path.join(xdg_data_home(), "instructlab", "sdg", "models", "docling")
+    data_dirs = [data_dir]
+    data_dirs.extend(
+        os.path.join(dir, "instructlab", "sdg", "models", "docling") for dir in xdg_data_dirs()
+    )
+
+    # Set `docling_model_path` to consistently use `data_dir`
+    docling_model_path = Path(data_dir)
+    os.makedirs(docling_model_path, exist_ok=True)
+
+    if not os.listdir(docling_model_path):
+        # Download models if directory is empty
+        logger.info("Docling models for chunking not found locally. Downloading from Hugging Face...")
+        StandardPdfPipeline.download_models_hf()
+    else:
+        logger.info(f"Using existing Docling models from: {docling_model_path}")
 
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
@@ -295,6 +308,7 @@ def generate_data(
     batch_size: Optional[int] = None,
     checkpoint_dir: Optional[str] = None,
     max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
+    docling_model_path: Optional[str] = None,
 ) -> None:
     """Generate data for training and testing a model.
 
@@ -392,6 +406,7 @@ def generate_data(
             chunk_word_count,
             document_output_dir,
             model_name,
+            docling_model_path=docling_model_path,
         )
 
         if not samples:

From 46dc46a397dd00107fbc14ee7941e88534e2e990 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Fri, 8 Nov 2024 17:02:46 -0500
Subject: [PATCH 02/12] Update taxonomy.py to add docling model path

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 9de4d353631379829e1b14781ffac6091f379181)
---
 src/instructlab/sdg/utils/taxonomy.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index a6f9b381..39684f63 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -416,6 +416,7 @@ def _knowledge_leaf_node_to_samples(
     chunk_word_count,
     document_output_dir,
     model_name,
+    docling_model_path=None,
 ):
     chunker = DocumentChunker(
         leaf_node=leaf_node,
@@ -424,6 +425,7 @@ def _knowledge_leaf_node_to_samples(
         server_ctx_size=server_ctx_size,
         chunk_word_count=chunk_word_count,
         tokenizer_model_name=model_name,
+        docling_model_path=docling_model_path
     )
     chunks = chunker.chunk_documents()
 
@@ -453,6 +455,7 @@ def leaf_node_to_samples(
     chunk_word_count,
     document_output_dir,
     model_name,
+    docling_model_path=None,
 ):
     if not leaf_node:
         return []
@@ -464,5 +467,6 @@ def leaf_node_to_samples(
             chunk_word_count,
             document_output_dir,
             model_name,
+            docling_model_path=docling_model_path,
         )
     return _skill_leaf_node_to_samples(leaf_node)

From 09df1255f6fa20791ad2125892ecea45776bf99c Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 12 Nov 2024 20:51:18 -0500
Subject: [PATCH 03/12] Rebase

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 1b984e06ae669405a12b432a45cf381e99b243de)

# Conflicts:
#	src/instructlab/sdg/utils/chunkers.py
---
 src/instructlab/sdg/utils/chunkers.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 50fd692c..87063fdc 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -90,6 +90,7 @@ def __new__(
         server_ctx_size=4096,
         chunk_word_count=1024,
         tokenizer_model_name: str | None = None,
+        docling_model_path: str | None = None,
     ):
         """Insantiate the appropriate chunker for the provided document
 
@@ -145,6 +146,7 @@ def __new__(
                 output_dir,
                 chunk_word_count,
                 tokenizer_model_name,
+                docling_model_path=docling_model_path,
             )
 
     @staticmethod
@@ -219,6 +221,7 @@ def __init__(
         output_dir: Path,
         chunk_word_count: int,
         tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        docling_model_path=None
     ):
         self.document_paths = document_paths
         self.filepaths = filepaths
@@ -231,6 +234,7 @@ def __init__(
         )
 
         self.tokenizer = self.create_tokenizer(tokenizer_model_name)
+        self.docling_model_path = docling_model_path
 
     def chunk_documents(self) -> List:
         """Semantically chunk PDF documents.
@@ -247,6 +251,7 @@ def chunk_documents(self) -> List:
         if self.document_paths == []:
             return []
 
+<<<<<<< HEAD
         model_artifacts_path = StandardPdfPipeline.download_models_hf()
         pipeline_options = PdfPipelineOptions(
             artifacts_path=model_artifacts_path,
@@ -256,6 +261,15 @@ def chunk_documents(self) -> List:
         if ocr_options is not None:
             pipeline_options.do_ocr = True
             pipeline_options.ocr_options = ocr_options
+=======
+        if not self.docling_model_path.exists():
+            raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}")
+        print("docling_model_path", docling_model_path)
+        pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path)
+
+        # Keep OCR models on the CPU instead of GPU
+        pipeline_options.ocr_options.use_gpu = False
+>>>>>>> 1b984e0 (Rebase)
         converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)

From 821dd40531a50f8dfab9262042e8a21a0e407f40 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 12 Nov 2024 21:29:01 -0500
Subject: [PATCH 04/12] Update docling model path and move hf model download to
 chunkers

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit a6b6454947204db733ebb5955e4a7965e44ae1ad)
---
 src/instructlab/sdg/generate_data.py  | 24 +++++++++---------------
 src/instructlab/sdg/utils/chunkers.py | 17 +++++++++++++----
 src/instructlab/sdg/utils/taxonomy.py |  2 +-
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 2235be5e..7a6fe666 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -12,9 +12,10 @@
 import time
 
 # Third Party
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 import openai
 
 # First Party
@@ -42,6 +43,7 @@
 
 _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant."
 
+
 def _unescape(s):
     return bytes(s, "utf-8").decode("utf-8").strip()
 
@@ -210,29 +212,21 @@ def _context_init(
         **extra_kwargs,
     )
 
+
 def _sdg_init(ctx, pipeline):
     pipeline_pkg = None
 
     # Search for the pipeline in User and Site data directories
     # then for a package defined pipeline
     # and finally pipelines referenced by absolute path
-    data_dir = os.path.join(xdg_data_home(), "instructlab", "sdg", "models", "docling")
-    data_dirs = [data_dir]
-    data_dirs.extend(
-        os.path.join(dir, "instructlab", "sdg", "models", "docling") for dir in xdg_data_dirs()
-    )
+    data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
+    data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
 
-    # Set `docling_model_path` to consistently use `data_dir`
-    docling_model_path = Path(data_dir)
+    # Initialize docling model path
+    docling_model_path = os.path.join(xdg_data_home(), "models", "docling")
+    # Ensure the `docling_model_path` directory exists
     os.makedirs(docling_model_path, exist_ok=True)
 
-    if not os.listdir(docling_model_path):
-        # Download models if directory is empty
-        logger.info("Docling models for chunking not found locally. Downloading from Hugging Face...")
-        StandardPdfPipeline.download_models_hf()
-    else:
-        logger.info(f"Using existing Docling models from: {docling_model_path}")
-
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
         if os.path.exists(pipeline_path):
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 87063fdc..22cb5325 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -6,6 +6,7 @@
 from typing import DefaultDict, Iterable, List, Tuple
 import json
 import logging
+import os
 import re
 
 # Third Party
@@ -221,7 +222,7 @@ def __init__(
         output_dir: Path,
         chunk_word_count: int,
         tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
-        docling_model_path=None
+        docling_model_path=None,
     ):
         self.document_paths = document_paths
         self.filepaths = filepaths
@@ -263,9 +264,17 @@ def chunk_documents(self) -> List:
             pipeline_options.ocr_options = ocr_options
 =======
         if not self.docling_model_path.exists():
-            raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}")
-        print("docling_model_path", docling_model_path)
-        pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path)
+            logger.info(
+                f"Docling model path {self.docling_model_path} not found, downloading models..."
+            )
+            os.makedirs(self.docling_model_path, exist_ok=True)
+            StandardPdfPipeline.download_models_hf(
+                destination_path=self.docling_model_path
+            )
+        else:
+            logger.info("Found the docling models")
+
+        pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path)
 
         # Keep OCR models on the CPU instead of GPU
         pipeline_options.ocr_options.use_gpu = False
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 39684f63..ab45b5df 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -425,7 +425,7 @@ def _knowledge_leaf_node_to_samples(
         server_ctx_size=server_ctx_size,
         chunk_word_count=chunk_word_count,
         tokenizer_model_name=model_name,
-        docling_model_path=docling_model_path
+        docling_model_path=docling_model_path,
     )
     chunks = chunker.chunk_documents()
 

From 3b467a9a58223dcc1ec5b1a61f4128c81ae61d3c Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 12 Nov 2024 21:30:49 -0500
Subject: [PATCH 05/12] Remove extra import

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit ec4ff80e1cfad6b0b3c3f9358d1766abe778f7ec)
---
 src/instructlab/sdg/generate_data.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 7a6fe666..6b9f7adb 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -12,8 +12,6 @@
 import time
 
 # Third Party
-from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
-
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import openai

From 0c1995eca82465d273b049d29300caf38229a57b Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 12 Nov 2024 22:26:40 -0500
Subject: [PATCH 06/12] Update artifacts path

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 1da59becff4426ac81c803a95d45ee9adf72811d)
---
 src/instructlab/sdg/utils/chunkers.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 22cb5325..02a4fe15 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -268,9 +268,7 @@ def chunk_documents(self) -> List:
                 f"Docling model path {self.docling_model_path} not found, downloading models..."
             )
             os.makedirs(self.docling_model_path, exist_ok=True)
-            StandardPdfPipeline.download_models_hf(
-                destination_path=self.docling_model_path
-            )
+            self.docling_model_path = StandardPdfPipeline.download_models_hf()
         else:
             logger.info("Found the docling models")
 

From 0f7139977446f0bba6b135062aeaf73d45fc08d5 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 13 Nov 2024 08:37:10 -0500
Subject: [PATCH 07/12] Update src/instructlab/sdg/generate_data.py

Co-authored-by: Jaideep Rao <jaideep.r97@gmail.com>
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 01041bb3a1e3a03ace1f31e05a55d2152ff8d4da)
---
 src/instructlab/sdg/generate_data.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 6b9f7adb..4bd76c01 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -220,11 +220,19 @@ def _sdg_init(ctx, pipeline):
     data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
     data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
 
-    # Initialize docling model path
-    docling_model_path = os.path.join(xdg_data_home(), "models", "docling")
-    # Ensure the `docling_model_path` directory exists
-    os.makedirs(docling_model_path, exist_ok=True)
-
+    sdg_models_path = docling_models_path = None 
+    for d in data_dirs:
+        if os.path.exists(os.path.join(d, "models")):
+          sdg_models_path = os.path.join(d, "models")
+          break
+     
+     if sdg_models_path is not None:
+       try:
+          with open(os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8") as file:
+              config = yaml.safe_load(file)
+              docling_models_path = config['models'][0]['path']
+        except (FileNotFoundError, NotADirectoryError, PermissionsError) as e:
+              log.warning(f"unable to read docling models path from config.yaml")
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
         if os.path.exists(pipeline_path):

From c37a37db5cedbe372ce711ecf2914d68aa0a7317 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 13 Nov 2024 08:37:21 -0500
Subject: [PATCH 08/12] Update src/instructlab/sdg/utils/chunkers.py

Co-authored-by: Jaideep Rao <jaideep.r97@gmail.com>
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit f8f69591ac6070a7a67fef0195b328ce660bb76b)

# Conflicts:
#	src/instructlab/sdg/utils/chunkers.py
---
 src/instructlab/sdg/utils/chunkers.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 02a4fe15..6a6895d9 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -252,6 +252,7 @@ def chunk_documents(self) -> List:
         if self.document_paths == []:
             return []
 
+<<<<<<< HEAD
 <<<<<<< HEAD
         model_artifacts_path = StandardPdfPipeline.download_models_hf()
         pipeline_options = PdfPipelineOptions(
@@ -264,10 +265,12 @@ def chunk_documents(self) -> List:
             pipeline_options.ocr_options = ocr_options
 =======
         if not self.docling_model_path.exists():
+=======
+        if self.docling_model_path is None:
+>>>>>>> f8f6959 (Update src/instructlab/sdg/utils/chunkers.py)
             logger.info(
-                f"Docling model path {self.docling_model_path} not found, downloading models..."
+                f"Docling models not found on disk, downloading models..."
             )
-            os.makedirs(self.docling_model_path, exist_ok=True)
             self.docling_model_path = StandardPdfPipeline.download_models_hf()
         else:
             logger.info("Found the docling models")

From b72cbb0a7471457e2240fd6b1c234dcb9ac1d0f1 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 13 Nov 2024 08:38:46 -0500
Subject: [PATCH 09/12] Address mypy issues and small typos

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 16c6f45e89a357bca008aad90da09d67dcd8f5b1)

# Conflicts:
#	src/instructlab/sdg/utils/chunkers.py
---
 src/instructlab/sdg/generate_data.py  | 29 +++++++++++++++++----------
 src/instructlab/sdg/utils/chunkers.py |  5 ++++-
 src/instructlab/sdg/utils/taxonomy.py |  2 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 4bd76c01..bc5d81b2 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -15,6 +15,7 @@
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import openai
+import yaml
 
 # First Party
 # pylint: disable=ungrouped-imports
@@ -220,19 +221,25 @@ def _sdg_init(ctx, pipeline):
     data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
     data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
 
-    sdg_models_path = docling_models_path = None 
+    docling_model_path = None
+    sdg_models_path = docling_model_path
     for d in data_dirs:
         if os.path.exists(os.path.join(d, "models")):
-          sdg_models_path = os.path.join(d, "models")
-          break
-     
-     if sdg_models_path is not None:
-       try:
-          with open(os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8") as file:
-              config = yaml.safe_load(file)
-              docling_models_path = config['models'][0]['path']
-        except (FileNotFoundError, NotADirectoryError, PermissionsError) as e:
-              log.warning(f"unable to read docling models path from config.yaml")
+            sdg_models_path = os.path.join(d, "models")
+            break
+
+        if sdg_models_path is not None:
+            try:
+                with open(
+                    os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
+                ) as file:
+                    config = yaml.safe_load(file)
+                    docling_model_path = config["models"][0]["path"]
+            except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
+                logger.warning(
+                    f"unable to read docling models path from config.yaml {e}"
+                )
+
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
         if os.path.exists(pipeline_path):
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 6a6895d9..0a72e3c4 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -6,7 +6,6 @@
 from typing import DefaultDict, Iterable, List, Tuple
 import json
 import logging
-import os
 import re
 
 # Third Party
@@ -267,10 +266,14 @@ def chunk_documents(self) -> List:
         if not self.docling_model_path.exists():
 =======
         if self.docling_model_path is None:
+<<<<<<< HEAD
 >>>>>>> f8f6959 (Update src/instructlab/sdg/utils/chunkers.py)
             logger.info(
                 f"Docling models not found on disk, downloading models..."
             )
+=======
+            logger.info("Docling models not found on disk, downloading models...")
+>>>>>>> 16c6f45 (Address mypy issues and small typos)
             self.docling_model_path = StandardPdfPipeline.download_models_hf()
         else:
             logger.info("Found the docling models")
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index ab45b5df..00743d93 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -467,6 +467,6 @@ def leaf_node_to_samples(
             chunk_word_count,
             document_output_dir,
             model_name,
-            docling_model_path=docling_model_path,
+            docling_model_path,
         )
     return _skill_leaf_node_to_samples(leaf_node)

From 781207553a1c6497324264184f18af0ef793d1cf Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Thu, 14 Nov 2024 11:05:50 -0500
Subject: [PATCH 10/12] Update the way docling_model_path is passed to
 generate_data

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit 2a00cb3b33be5e38060697f205118ac39cecbc3d)
---
 src/instructlab/sdg/generate_data.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index bc5d81b2..583da5d3 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -228,17 +228,17 @@ def _sdg_init(ctx, pipeline):
             sdg_models_path = os.path.join(d, "models")
             break
 
-        if sdg_models_path is not None:
-            try:
-                with open(
-                    os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
-                ) as file:
-                    config = yaml.safe_load(file)
-                    docling_model_path = config["models"][0]["path"]
-            except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
-                logger.warning(
-                    f"unable to read docling models path from config.yaml {e}"
-                )
+    if sdg_models_path is not None:
+        try:
+            with open(
+                os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
+            ) as file:
+                config = yaml.safe_load(file)
+                docling_model_path = config["models"][0]["path"]
+        except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
+            logger.warning(
+                f"unable to read docling models path from config.yaml {e}"
+            )
 
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
@@ -271,6 +271,7 @@ def load_pipeline(yaml_basename):
         load_pipeline("knowledge.yaml"),
         load_pipeline("freeform_skills.yaml"),
         load_pipeline("grounded_skills.yaml"),
+        docling_model_path
     )
 
 
@@ -315,7 +316,6 @@ def generate_data(
     batch_size: Optional[int] = None,
     checkpoint_dir: Optional[str] = None,
     max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS,
-    docling_model_path: Optional[str] = None,
 ) -> None:
     """Generate data for training and testing a model.
 
@@ -384,7 +384,7 @@ def generate_data(
         max_num_tokens=max_num_tokens,
     )
 
-    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
+    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = _sdg_init(
         ctx, pipeline
     )
 

From 5244929ad719605e5ee049eab25489703a7a9f1c Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Thu, 14 Nov 2024 11:30:02 -0500
Subject: [PATCH 11/12] Fix ruff issues

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
(cherry picked from commit b5733ab2f853bb3e007d0a7ee93f0b740772fe09)
---
 src/instructlab/sdg/generate_data.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index 583da5d3..2aac5028 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -236,9 +236,7 @@ def _sdg_init(ctx, pipeline):
                 config = yaml.safe_load(file)
                 docling_model_path = config["models"][0]["path"]
         except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
-            logger.warning(
-                f"unable to read docling models path from config.yaml {e}"
-            )
+            logger.warning(f"unable to read docling models path from config.yaml {e}")
 
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
@@ -271,7 +269,7 @@ def load_pipeline(yaml_basename):
         load_pipeline("knowledge.yaml"),
         load_pipeline("freeform_skills.yaml"),
         load_pipeline("grounded_skills.yaml"),
-        docling_model_path
+        docling_model_path,
     )
 
 
@@ -384,8 +382,8 @@ def generate_data(
         max_num_tokens=max_num_tokens,
     )
 
-    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = _sdg_init(
-        ctx, pipeline
+    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = (
+        _sdg_init(ctx, pipeline)
     )
 
     # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)

From a6689e3984cbeb20353ff35679da503ebbe47066 Mon Sep 17 00:00:00 2001
From: Ben Browning <bbrownin@redhat.com>
Date: Thu, 14 Nov 2024 12:26:12 -0500
Subject: [PATCH 12/12] Add two unit tests for docling model path

These unit tests cover the two outcomes of `_sdg_init` when resolving the
docling model path: a models/config.yaml is found under XDG_DATA_HOME and the
path it lists is returned, and no config.yaml is found so the path is None.

Signed-off-by: Ben Browning <bbrownin@redhat.com>
(cherry picked from commit 0e9d75d6872ad469d6e5476a2c90d5546a80ed9b)
---
 tests/conftest.py                             | 10 ++++++
 tests/test_generate_data.py                   | 36 ++++++++++++++++++-
 .../instructlab/sdg/models/config.yaml        |  4 +++
 3 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml

diff --git a/tests/conftest.py b/tests/conftest.py
index 80d61903..ed3fd8c4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,8 @@
 
 # Standard
 from unittest import mock
+import pathlib
+import typing
 
 # Third Party
 from datasets import Dataset
@@ -17,6 +19,14 @@
 # Local
 from .taxonomy import MockTaxonomy
 
+TESTS_PATH = pathlib.Path(__file__).parent.absolute()
+
+
+@pytest.fixture
+def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
+    """Path to local test data directory"""
+    yield TESTS_PATH / "testdata"
+
 
 def get_ctx(**kwargs) -> PipelineContext:
     kwargs.setdefault("client", mock.MagicMock())
diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py
index f382a351..0d04a80f 100644
--- a/tests/test_generate_data.py
+++ b/tests/test_generate_data.py
@@ -20,7 +20,7 @@
 import yaml
 
 # First Party
-from instructlab.sdg.generate_data import _context_init, generate_data
+from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data
 from instructlab.sdg.llmblock import LLMBlock
 from instructlab.sdg.pipeline import PipelineContext
 
@@ -548,3 +548,37 @@ def test_context_init_batch_size_optional():
         batch_num_workers=32,
     )
     assert ctx.batch_size == 20
+
+
+def test_sdg_init_docling_path_config_found(testdata_path):
+    with patch.dict(os.environ):
+        os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir"))
+        ctx = _context_init(
+            None,
+            "mixtral",
+            "foo.bar",
+            1,
+            "/checkpoint/dir",
+            1,
+            batch_size=20,
+            batch_num_workers=32,
+        )
+        _, _, _, docling_model_path = _sdg_init(ctx, "full")
+        assert docling_model_path == "/mock/docling-models"
+
+
+def test_sdg_init_docling_path_config_not_found(testdata_path):
+    with patch.dict(os.environ):
+        os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir"))
+        ctx = _context_init(
+            None,
+            "mixtral",
+            "foo.bar",
+            1,
+            "/checkpoint/dir",
+            1,
+            batch_size=20,
+            batch_num_workers=32,
+        )
+        _, _, _, docling_model_path = _sdg_init(ctx, "full")
+        assert docling_model_path is None
diff --git a/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml b/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml
new file mode 100644
index 00000000..657cfdf3
--- /dev/null
+++ b/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml
@@ -0,0 +1,4 @@
+models:
+- path: /mock/docling-models
+  source: https://huggingface.co/ds4sd/docling-models
+  revision: main