diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
index cf65ae14..2aac5028 100644
--- a/src/instructlab/sdg/generate_data.py
+++ b/src/instructlab/sdg/generate_data.py
@@ -15,6 +15,7 @@
 # instructlab - All of these need to go away (other than sdg) - issue #6
 from xdg_base_dirs import xdg_data_dirs, xdg_data_home
 import openai
+import yaml
 
 # First Party
 # pylint: disable=ungrouped-imports
@@ -220,6 +221,23 @@ def _sdg_init(ctx, pipeline):
     data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
     data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())
 
+    docling_model_path = None
+    sdg_models_path = docling_model_path
+    for d in data_dirs:
+        if os.path.exists(os.path.join(d, "models")):
+            sdg_models_path = os.path.join(d, "models")
+            break
+
+    if sdg_models_path is not None:
+        try:
+            with open(
+                os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
+            ) as file:
+                config = yaml.safe_load(file)
+                docling_model_path = config["models"][0]["path"]
+        except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
+            logger.warning(f"unable to read docling models path from config.yaml {e}")
+
     for d in data_dirs:
         pipeline_path = os.path.join(d, "pipelines", pipeline)
         if os.path.exists(pipeline_path):
@@ -251,6 +269,7 @@ def load_pipeline(yaml_basename):
         load_pipeline("knowledge.yaml"),
         load_pipeline("freeform_skills.yaml"),
         load_pipeline("grounded_skills.yaml"),
+        docling_model_path,
     )
 
 
@@ -363,8 +382,8 @@ def generate_data(
         max_num_tokens=max_num_tokens,
     )
 
-    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
-        ctx, pipeline
+    knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = (
+        _sdg_init(ctx, pipeline)
     )
 
     # Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
@@ -392,6 +411,7 @@ def generate_data(
                 chunk_word_count,
                 document_output_dir,
                 model_name,
+                docling_model_path=docling_model_path,
             )
 
             if not samples:
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 50fd692c..0a72e3c4 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -90,6 +90,7 @@ def __new__(
         server_ctx_size=4096,
         chunk_word_count=1024,
         tokenizer_model_name: str | None = None,
+        docling_model_path: str | None = None,
     ):
         """Insantiate the appropriate chunker for the provided document
 
@@ -145,6 +146,7 @@ def __new__(
             output_dir,
             chunk_word_count,
             tokenizer_model_name,
+            docling_model_path=docling_model_path,
         )
 
     @staticmethod
@@ -219,6 +221,7 @@ def __init__(
         output_dir: Path,
         chunk_word_count: int,
         tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
+        docling_model_path=None,
     ):
         self.document_paths = document_paths
         self.filepaths = filepaths
@@ -231,6 +234,7 @@ def __init__(
         )
 
         self.tokenizer = self.create_tokenizer(tokenizer_model_name)
+        self.docling_model_path = docling_model_path
 
     def chunk_documents(self) -> List:
         """Semantically chunk PDF documents.
@@ -247,15 +251,16 @@ def chunk_documents(self) -> List:
         if self.document_paths == []:
             return []
 
-        model_artifacts_path = StandardPdfPipeline.download_models_hf()
-        pipeline_options = PdfPipelineOptions(
-            artifacts_path=model_artifacts_path,
-            do_ocr=False,
-        )
-        ocr_options = resolve_ocr_options()
-        if ocr_options is not None:
-            pipeline_options.do_ocr = True
-            pipeline_options.ocr_options = ocr_options
+        if self.docling_model_path is None:
+            logger.info("Docling models not found on disk, downloading models...")
+            self.docling_model_path = StandardPdfPipeline.download_models_hf()
+        else:
+            logger.info("Found the docling models")
+
+        pipeline_options = PdfPipelineOptions(artifacts_path=self.docling_model_path)
+
+        # Keep OCR models on the CPU instead of GPU
+        pipeline_options.ocr_options.use_gpu = False
         converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index a6f9b381..00743d93 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -416,6 +416,7 @@ def _knowledge_leaf_node_to_samples(
     chunk_word_count,
     document_output_dir,
     model_name,
+    docling_model_path=None,
 ):
     chunker = DocumentChunker(
         leaf_node=leaf_node,
@@ -424,6 +425,7 @@ def _knowledge_leaf_node_to_samples(
         server_ctx_size=server_ctx_size,
         chunk_word_count=chunk_word_count,
         tokenizer_model_name=model_name,
+        docling_model_path=docling_model_path,
     )
     chunks = chunker.chunk_documents()
 
@@ -453,6 +455,7 @@ def leaf_node_to_samples(
     chunk_word_count,
     document_output_dir,
     model_name,
+    docling_model_path=None,
 ):
     if not leaf_node:
         return []
@@ -464,5 +467,6 @@ def leaf_node_to_samples(
             chunk_word_count,
             document_output_dir,
             model_name,
+            docling_model_path,
         )
     return _skill_leaf_node_to_samples(leaf_node)
diff --git a/tests/conftest.py b/tests/conftest.py
index 80d61903..ed3fd8c4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,8 @@
 
 # Standard
 from unittest import mock
+import pathlib
+import typing
 
 # Third Party
 from datasets import Dataset
@@ -17,6 +19,14 @@
 # Local
 from .taxonomy import MockTaxonomy
 
+TESTS_PATH = pathlib.Path(__file__).parent.absolute()
+
+
+@pytest.fixture
+def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
+    """Path to local test data directory"""
+    yield TESTS_PATH / "testdata"
+
 
 def get_ctx(**kwargs) -> PipelineContext:
     kwargs.setdefault("client", mock.MagicMock())
diff --git a/tests/test_generate_data.py b/tests/test_generate_data.py
index f382a351..0d04a80f 100644
--- a/tests/test_generate_data.py
+++ b/tests/test_generate_data.py
@@ -20,7 +20,7 @@
 import yaml
 
 # First Party
-from instructlab.sdg.generate_data import _context_init, generate_data
+from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data
 from instructlab.sdg.llmblock import LLMBlock
 from instructlab.sdg.pipeline import PipelineContext
 
@@ -548,3 +548,37 @@ def test_context_init_batch_size_optional():
         batch_num_workers=32,
     )
     assert ctx.batch_size == 20
+
+
+def test_sdg_init_docling_path_config_found(testdata_path):
+    with patch.dict(os.environ):
+        os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("mock_xdg_data_dir"))
+        ctx = _context_init(
+            None,
+            "mixtral",
+            "foo.bar",
+            1,
+            "/checkpoint/dir",
+            1,
+            batch_size=20,
+            batch_num_workers=32,
+        )
+        _, _, _, docling_model_path = _sdg_init(ctx, "full")
+        assert docling_model_path == "/mock/docling-models"
+
+
+def test_sdg_init_docling_path_config_not_found(testdata_path):
+    with patch.dict(os.environ):
+        os.environ["XDG_DATA_HOME"] = str(testdata_path.joinpath("nonexistent_dir"))
+        ctx = _context_init(
+            None,
+            "mixtral",
+            "foo.bar",
+            1,
+            "/checkpoint/dir",
+            1,
+            batch_size=20,
+            batch_num_workers=32,
+        )
+        _, _, _, docling_model_path = _sdg_init(ctx, "full")
+        assert docling_model_path is None
diff --git a/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml b/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml
new file mode 100644
index 00000000..657cfdf3
--- /dev/null
+++ b/tests/testdata/mock_xdg_data_dir/instructlab/sdg/models/config.yaml
@@ -0,0 +1,4 @@
+models:
+- path: /mock/docling-models
+  source: https://huggingface.co/ds4sd/docling-models
+  revision: main
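
Reviewer note: for anyone who wants to exercise the new lookup locally, the sketch below (not part of the patch above) pre-seeds the config that _sdg_init() now reads from $XDG_DATA_HOME/instructlab/sdg/models/config.yaml. The /srv/docling-models path is a hypothetical local checkout of the docling models; substitute your own directory.

# Sketch only, assuming a local docling-models checkout at /srv/docling-models.
# Writes the config file that _sdg_init() looks for under the XDG data home.
import pathlib

import yaml
from xdg_base_dirs import xdg_data_home

models_dir = pathlib.Path(xdg_data_home()) / "instructlab" / "sdg" / "models"
models_dir.mkdir(parents=True, exist_ok=True)

config = {
    "models": [
        {
            "path": "/srv/docling-models",  # hypothetical local model directory
            "source": "https://huggingface.co/ds4sd/docling-models",
            "revision": "main",
        }
    ]
}

with open(models_dir / "config.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(config, f)

With that file in place, _sdg_init() resolves docling_model_path to the configured directory and chunk_documents() skips the StandardPdfPipeline.download_models_hf() download.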