From c2be3eabb8573423f193f392e8df8ad10301e9f0 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Fri, 8 Nov 2024 17:01:57 -0500 Subject: [PATCH 01/12] Update generate_data.py to add docling model path Signed-off-by: Aakanksha Duggal (cherry picked from commit 9b8fb0bb6d7bb5ff12d8760f5b40915eb07ce31b) --- src/instructlab/sdg/generate_data.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index cf65ae14..2235be5e 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -14,6 +14,7 @@ # Third Party # instructlab - All of these need to go away (other than sdg) - issue #6 from xdg_base_dirs import xdg_data_dirs, xdg_data_home +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline import openai # First Party @@ -41,7 +42,6 @@ _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." - def _unescape(s): return bytes(s, "utf-8").decode("utf-8").strip() @@ -210,15 +210,28 @@ def _context_init( **extra_kwargs, ) - def _sdg_init(ctx, pipeline): pipeline_pkg = None # Search for the pipeline in User and Site data directories # then for a package defined pipeline # and finally pipelines referenced by absolute path - data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] - data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) + data_dir = os.path.join(xdg_data_home(), "instructlab", "sdg", "models", "docling") + data_dirs = [data_dir] + data_dirs.extend( + os.path.join(dir, "instructlab", "sdg", "models", "docling") for dir in xdg_data_dirs() + ) + + # Set `docling_model_path` to consistently use `data_dir` + docling_model_path = Path(data_dir) + os.makedirs(docling_model_path, exist_ok=True) + + if not os.listdir(docling_model_path): + # Download models if directory is empty + logger.info("Docling models for chunking not found locally. Downloading from Hugging Face...") + StandardPdfPipeline.download_models_hf() + else: + logger.info(f"Using existing Docling models from: {docling_model_path}") for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) @@ -295,6 +308,7 @@ def generate_data( batch_size: Optional[int] = None, checkpoint_dir: Optional[str] = None, max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, + docling_model_path: Optional[str] = None, ) -> None: """Generate data for training and testing a model. @@ -392,6 +406,7 @@ def generate_data( chunk_word_count, document_output_dir, model_name, + docling_model_path=docling_model_path, ) if not samples: From 46dc46a397dd00107fbc14ee7941e88534e2e990 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Fri, 8 Nov 2024 17:02:46 -0500 Subject: [PATCH 02/12] Update taxonomy.py to add docling model path Signed-off-by: Aakanksha Duggal (cherry picked from commit 9de4d353631379829e1b14781ffac6091f379181) --- src/instructlab/sdg/utils/taxonomy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index a6f9b381..39684f63 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -416,6 +416,7 @@ def _knowledge_leaf_node_to_samples( chunk_word_count, document_output_dir, model_name, + docling_model_path=None, ): chunker = DocumentChunker( leaf_node=leaf_node, @@ -424,6 +425,7 @@ def _knowledge_leaf_node_to_samples( server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, tokenizer_model_name=model_name, + docling_model_path=docling_model_path ) chunks = chunker.chunk_documents() @@ -453,6 +455,7 @@ def leaf_node_to_samples( chunk_word_count, document_output_dir, model_name, + docling_model_path=None, ): if not leaf_node: return [] @@ -464,5 +467,6 @@ def leaf_node_to_samples( chunk_word_count, document_output_dir, model_name, + docling_model_path=docling_model_path, ) return _skill_leaf_node_to_samples(leaf_node) From 09df1255f6fa20791ad2125892ecea45776bf99c Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 12 Nov 2024 20:51:18 -0500 Subject: [PATCH 03/12] Rebase Signed-off-by: Aakanksha Duggal (cherry picked from commit 1b984e06ae669405a12b432a45cf381e99b243de) # Conflicts: # src/instructlab/sdg/utils/chunkers.py --- src/instructlab/sdg/utils/chunkers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 50fd692c..87063fdc 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -90,6 +90,7 @@ def __new__( server_ctx_size=4096, chunk_word_count=1024, tokenizer_model_name: str | None = None, + docling_model_path: str | None = None, ): """Insantiate the appropriate chunker for the provided document @@ -145,6 +146,7 @@ def __new__( output_dir, chunk_word_count, tokenizer_model_name, + docling_model_path=docling_model_path, ) @staticmethod @@ -219,6 +221,7 @@ def __init__( output_dir: Path, chunk_word_count: int, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", + docling_model_path=None ): self.document_paths = document_paths self.filepaths = filepaths @@ -231,6 +234,7 @@ def __init__( ) self.tokenizer = self.create_tokenizer(tokenizer_model_name) + self.docling_model_path = docling_model_path def chunk_documents(self) -> List: """Semantically chunk PDF documents. @@ -247,6 +251,7 @@ def chunk_documents(self) -> List: if self.document_paths == []: return [] +<<<<<<< HEAD model_artifacts_path = StandardPdfPipeline.download_models_hf() pipeline_options = PdfPipelineOptions( artifacts_path=model_artifacts_path, @@ -256,6 +261,15 @@ def chunk_documents(self) -> List: if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options +======= + if not self.docling_model_path.exists(): + raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}") + print("docling_model_path", docling_model_path) + pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path) + + # Keep OCR models on the CPU instead of GPU + pipeline_options.ocr_options.use_gpu = False +>>>>>>> 1b984e0 (Rebase) converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) From 821dd40531a50f8dfab9262042e8a21a0e407f40 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 12 Nov 2024 21:29:01 -0500 Subject: [PATCH 04/12] Update docling model path and move hf model download to chunkers Signed-off-by: Aakanksha Duggal (cherry picked from commit a6b6454947204db733ebb5955e4a7965e44ae1ad) --- src/instructlab/sdg/generate_data.py | 24 +++++++++--------------- src/instructlab/sdg/utils/chunkers.py | 17 +++++++++++++---- src/instructlab/sdg/utils/taxonomy.py | 2 +- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 2235be5e..7a6fe666 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -12,9 +12,10 @@ import time # Third Party +from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline + # instructlab - All of these need to go away (other than sdg) - issue #6 from xdg_base_dirs import xdg_data_dirs, xdg_data_home -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline import openai # First Party @@ -42,6 +43,7 @@ _SYS_PROMPT = "I am a Red Hat® Instruct Model, an AI language model developed by Red Hat and IBM Research based on the granite-3.0-8b-base model. My primary role is to serve as a chat assistant." + def _unescape(s): return bytes(s, "utf-8").decode("utf-8").strip() @@ -210,29 +212,21 @@ def _context_init( **extra_kwargs, ) + def _sdg_init(ctx, pipeline): pipeline_pkg = None # Search for the pipeline in User and Site data directories # then for a package defined pipeline # and finally pipelines referenced by absolute path - data_dir = os.path.join(xdg_data_home(), "instructlab", "sdg", "models", "docling") - data_dirs = [data_dir] - data_dirs.extend( - os.path.join(dir, "instructlab", "sdg", "models", "docling") for dir in xdg_data_dirs() - ) + data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] + data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - # Set `docling_model_path` to consistently use `data_dir` - docling_model_path = Path(data_dir) + # Initialize docling model path + docling_model_path = os.path.join(xdg_data_home(), "models", "docling") + # Ensure the `docling_model_path` directory exists os.makedirs(docling_model_path, exist_ok=True) - if not os.listdir(docling_model_path): - # Download models if directory is empty - logger.info("Docling models for chunking not found locally. Downloading from Hugging Face...") - StandardPdfPipeline.download_models_hf() - else: - logger.info(f"Using existing Docling models from: {docling_model_path}") - for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 87063fdc..22cb5325 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -6,6 +6,7 @@ from typing import DefaultDict, Iterable, List, Tuple import json import logging +import os import re # Third Party @@ -221,7 +222,7 @@ def __init__( output_dir: Path, chunk_word_count: int, tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1", - docling_model_path=None + docling_model_path=None, ): self.document_paths = document_paths self.filepaths = filepaths @@ -263,9 +264,17 @@ def chunk_documents(self) -> List: pipeline_options.ocr_options = ocr_options ======= if not self.docling_model_path.exists(): - raise FileNotFoundError(f"Docling model path not found: {self.docling_model_path}") - print("docling_model_path", docling_model_path) - pipeline_options = PdfPipelineOptions(artifacts_path=docling_model_path) + logger.info( + f"Docling model path {self.docling_model_path} not found, downloading models..." + ) + os.makedirs(self.docling_model_path, exist_ok=True) + StandardPdfPipeline.download_models_hf( + destination_path=self.docling_model_path + ) + else: + logger.info("Found the docling models") + + pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path) # Keep OCR models on the CPU instead of GPU pipeline_options.ocr_options.use_gpu = False diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 39684f63..ab45b5df 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -425,7 +425,7 @@ def _knowledge_leaf_node_to_samples( server_ctx_size=server_ctx_size, chunk_word_count=chunk_word_count, tokenizer_model_name=model_name, - docling_model_path=docling_model_path + docling_model_path=docling_model_path, ) chunks = chunker.chunk_documents() From 3b467a9a58223dcc1ec5b1a61f4128c81ae61d3c Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 12 Nov 2024 21:30:49 -0500 Subject: [PATCH 05/12] Remove extra import Signed-off-by: Aakanksha Duggal (cherry picked from commit ec4ff80e1cfad6b0b3c3f9358d1766abe778f7ec) --- src/instructlab/sdg/generate_data.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 7a6fe666..6b9f7adb 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -12,8 +12,6 @@ import time # Third Party -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - # instructlab - All of these need to go away (other than sdg) - issue #6 from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai From 0c1995eca82465d273b049d29300caf38229a57b Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 12 Nov 2024 22:26:40 -0500 Subject: [PATCH 06/12] Update artifacts path Signed-off-by: Aakanksha Duggal (cherry picked from commit 1da59becff4426ac81c803a95d45ee9adf72811d) --- src/instructlab/sdg/utils/chunkers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 22cb5325..02a4fe15 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -268,9 +268,7 @@ def chunk_documents(self) -> List: f"Docling model path {self.docling_model_path} not found, downloading models..." ) os.makedirs(self.docling_model_path, exist_ok=True) - StandardPdfPipeline.download_models_hf( - destination_path=self.docling_model_path - ) + self.docling_model_path = StandardPdfPipeline.download_models_hf() else: logger.info("Found the docling models") From 0f7139977446f0bba6b135062aeaf73d45fc08d5 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 13 Nov 2024 08:37:10 -0500 Subject: [PATCH 07/12] Update src/instructlab/sdg/generate_data.py Co-authored-by: Jaideep Rao Signed-off-by: Aakanksha Duggal (cherry picked from commit 01041bb3a1e3a03ace1f31e05a55d2152ff8d4da) --- src/instructlab/sdg/generate_data.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 6b9f7adb..4bd76c01 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -220,11 +220,19 @@ def _sdg_init(ctx, pipeline): data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - # Initialize docling model path - docling_model_path = os.path.join(xdg_data_home(), "models", "docling") - # Ensure the `docling_model_path` directory exists - os.makedirs(docling_model_path, exist_ok=True) - + sdg_models_path = docling_models_path = None + for d in data_dirs: + if os.path.exists(os.path.join(d, "models")): + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open(os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8") as file: + config = yaml.safe_load(file) + docling_models_path = config['models'][0]['path'] + except (FileNotFoundError, NotADirectoryError, PermissionsError) as e: + log.warning(f"unable to read docling models path from config.yaml") for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): From c37a37db5cedbe372ce711ecf2914d68aa0a7317 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 13 Nov 2024 08:37:21 -0500 Subject: [PATCH 08/12] Update src/instructlab/sdg/utils/chunkers.py Co-authored-by: Jaideep Rao Signed-off-by: Aakanksha Duggal (cherry picked from commit f8f69591ac6070a7a67fef0195b328ce660bb76b) # Conflicts: # src/instructlab/sdg/utils/chunkers.py --- src/instructlab/sdg/utils/chunkers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 02a4fe15..6a6895d9 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -252,6 +252,7 @@ def chunk_documents(self) -> List: if self.document_paths == []: return [] +<<<<<<< HEAD <<<<<<< HEAD model_artifacts_path = StandardPdfPipeline.download_models_hf() pipeline_options = PdfPipelineOptions( @@ -264,10 +265,12 @@ def chunk_documents(self) -> List: pipeline_options.ocr_options = ocr_options ======= if not self.docling_model_path.exists(): +======= + if self.docling_model_path is None: +>>>>>>> f8f6959 (Update src/instructlab/sdg/utils/chunkers.py) logger.info( - f"Docling model path {self.docling_model_path} not found, downloading models..." + f"Docling models not found on disk, downloading models..." ) - os.makedirs(self.docling_model_path, exist_ok=True) self.docling_model_path = StandardPdfPipeline.download_models_hf() else: logger.info("Found the docling models") From b72cbb0a7471457e2240fd6b1c234dcb9ac1d0f1 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 13 Nov 2024 08:38:46 -0500 Subject: [PATCH 09/12] Address mypy issues and small typos Signed-off-by: Aakanksha Duggal (cherry picked from commit 16c6f45e89a357bca008aad90da09d67dcd8f5b1) # Conflicts: # src/instructlab/sdg/utils/chunkers.py --- src/instructlab/sdg/generate_data.py | 29 +++++++++++++++++---------- src/instructlab/sdg/utils/chunkers.py | 5 ++++- src/instructlab/sdg/utils/taxonomy.py | 2 +- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 4bd76c01..bc5d81b2 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -15,6 +15,7 @@ # instructlab - All of these need to go away (other than sdg) - issue #6 from xdg_base_dirs import xdg_data_dirs, xdg_data_home import openai +import yaml # First Party # pylint: disable=ungrouped-imports @@ -220,19 +221,25 @@ def _sdg_init(ctx, pipeline): data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")] data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs()) - sdg_models_path = docling_models_path = None + docling_model_path = None + sdg_models_path = docling_model_path for d in data_dirs: if os.path.exists(os.path.join(d, "models")): - sdg_models_path = os.path.join(d, "models") - break - - if sdg_models_path is not None: - try: - with open(os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8") as file: - config = yaml.safe_load(file) - docling_models_path = config['models'][0]['path'] - except (FileNotFoundError, NotADirectoryError, PermissionsError) as e: - log.warning(f"unable to read docling models path from config.yaml") + sdg_models_path = os.path.join(d, "models") + break + + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_model_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionError) as e: + logger.warning( + f"unable to read docling models path from config.yaml {e}" + ) + for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) if os.path.exists(pipeline_path): diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 6a6895d9..0a72e3c4 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -6,7 +6,6 @@ from typing import DefaultDict, Iterable, List, Tuple import json import logging -import os import re # Third Party @@ -267,10 +266,14 @@ def chunk_documents(self) -> List: if not self.docling_model_path.exists(): ======= if self.docling_model_path is None: +<<<<<<< HEAD >>>>>>> f8f6959 (Update src/instructlab/sdg/utils/chunkers.py) logger.info( f"Docling models not found on disk, downloading models..." ) +======= + logger.info("Docling models not found on disk, downloading models...") +>>>>>>> 16c6f45 (Address mypy issues and small typos) self.docling_model_path = StandardPdfPipeline.download_models_hf() else: logger.info("Found the docling models") diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index ab45b5df..00743d93 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -467,6 +467,6 @@ def leaf_node_to_samples( chunk_word_count, document_output_dir, model_name, - docling_model_path=docling_model_path, + docling_model_path, ) return _skill_leaf_node_to_samples(leaf_node) From 781207553a1c6497324264184f18af0ef793d1cf Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 14 Nov 2024 11:05:50 -0500 Subject: [PATCH 10/12] Update the way docling_model_path is passed to generate_data Signed-off-by: Aakanksha Duggal (cherry picked from commit 2a00cb3b33be5e38060697f205118ac39cecbc3d) --- src/instructlab/sdg/generate_data.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index bc5d81b2..583da5d3 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -228,17 +228,17 @@ def _sdg_init(ctx, pipeline): sdg_models_path = os.path.join(d, "models") break - if sdg_models_path is not None: - try: - with open( - os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" - ) as file: - config = yaml.safe_load(file) - docling_model_path = config["models"][0]["path"] - except (FileNotFoundError, NotADirectoryError, PermissionError) as e: - logger.warning( - f"unable to read docling models path from config.yaml {e}" - ) + if sdg_models_path is not None: + try: + with open( + os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8" + ) as file: + config = yaml.safe_load(file) + docling_model_path = config["models"][0]["path"] + except (FileNotFoundError, NotADirectoryError, PermissionError) as e: + logger.warning( + f"unable to read docling models path from config.yaml {e}" + ) for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) @@ -271,6 +271,7 @@ def load_pipeline(yaml_basename): load_pipeline("knowledge.yaml"), load_pipeline("freeform_skills.yaml"), load_pipeline("grounded_skills.yaml"), + docling_model_path ) @@ -315,7 +316,6 @@ def generate_data( batch_size: Optional[int] = None, checkpoint_dir: Optional[str] = None, max_num_tokens: Optional[int] = DEFAULT_MAX_NUM_TOKENS, - docling_model_path: Optional[str] = None, ) -> None: """Generate data for training and testing a model. @@ -384,7 +384,7 @@ def generate_data( max_num_tokens=max_num_tokens, ) - knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init( + knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = _sdg_init( ctx, pipeline ) From 5244929ad719605e5ee049eab25489703a7a9f1c Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 14 Nov 2024 11:30:02 -0500 Subject: [PATCH 11/12] Fix ruff issues Signed-off-by: Aakanksha Duggal (cherry picked from commit b5733ab2f853bb3e007d0a7ee93f0b740772fe09) --- src/instructlab/sdg/generate_data.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py index 583da5d3..2aac5028 100644 --- a/src/instructlab/sdg/generate_data.py +++ b/src/instructlab/sdg/generate_data.py @@ -236,9 +236,7 @@ def _sdg_init(ctx, pipeline): config = yaml.safe_load(file) docling_model_path = config["models"][0]["path"] except (FileNotFoundError, NotADirectoryError, PermissionError) as e: - logger.warning( - f"unable to read docling models path from config.yaml {e}" - ) + logger.warning(f"unable to read docling models path from config.yaml {e}") for d in data_dirs: pipeline_path = os.path.join(d, "pipelines", pipeline) @@ -271,7 +269,7 @@ def load_pipeline(yaml_basename): load_pipeline("knowledge.yaml"), load_pipeline("freeform_skills.yaml"), load_pipeline("grounded_skills.yaml"), - docling_model_path + docling_model_path, ) @@ -384,8 +382,8 @@ def generate_data( max_num_tokens=max_num_tokens, ) - knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = _sdg_init( - ctx, pipeline + knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = ( + _sdg_init(ctx, pipeline) ) # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline) From a6689e3984cbeb20353ff35679da503ebbe47066 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 14 Nov 2024 12:26:12 -0500 Subject: [PATCH 12/12] Add two unit tests for docling model path These simple unit tests just test the cases where we found a config.yaml to parse for the docling model path and where we didn't. Signed-off-by: Ben Browning (cherry picked from commit 0e9d75d6872ad469d6e5476a2c90d5546a80ed9b) --- tests/conftest.py | 10 ++++++ tests/test_generate_data.py | 36 ++++++++++++++++++- .../instructlab/sdg/models/config.yaml | 4 +++ 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml diff --git a/tests/conftest.py b/tests/conftest.py index 80d61903..ed3fd8c4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,6 +6,8 @@ # Standard from unittest import mock +import pathlib +import typing # Third Party from datasets import Dataset @@ -17,6 +19,14 @@ # Local from .taxonomy import MockTaxonomy +TESTS_PATH = pathlib.Path(__file__).parent.absolute() + + +@pytest.fixture +def testdata_path() -> typing.Generator[pathlib.Path, None, None]: + """Path to local test data directory""" + yield TESTS_PATH / "testdata" + def get_ctx(**kwargs) -> PipelineContext: kwargs.setdefault("client", mock.MagicMock()) diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py index f382a351..0d04a80f 100644 --- a/tests/test_generate_data.py +++ b/tests/test_generate_data.py @@ -20,7 +20,7 @@ import yaml # First Party -from instructlab.sdg.generate_data import _context_init, generate_data +from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data from instructlab.sdg.llmblock import LLMBlock from instructlab.sdg.pipeline import PipelineContext @@ -548,3 +548,37 @@ def test_context_init_batch_size_optional(): batch_num_workers=32, ) assert ctx.batch_size == 20 + + +def test_sdg_init_docling_path_config_found(testdata_path): + with patch.dict(os.environ): + os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir")) + ctx = _context_init( + None, + "mixtral", + "foo.bar", + 1, + "/checkpoint/dir", + 1, + batch_size=20, + batch_num_workers=32, + ) + _, _, _, docling_model_path = _sdg_init(ctx, "full") + assert docling_model_path == "/mock/docling-models" + + +def test_sdg_init_docling_path_config_not_found(testdata_path): + with patch.dict(os.environ): + os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir")) + ctx = _context_init( + None, + "mixtral", + "foo.bar", + 1, + "/checkpoint/dir", + 1, + batch_size=20, + batch_num_workers=32, + ) + _, _, _, docling_model_path = _sdg_init(ctx, "full") + assert docling_model_path is None diff --git a/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml b/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml new file mode 100644 index 00000000..657cfdf3 --- /dev/null +++ b/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml @@ -0,0 +1,4 @@ +models: +- path: /mock/docling-models + source: https://huggingface.co/ds4sd/docling-models + revision: main