Skip to content

Commit

Permalink
Merge pull request #362 from aakankshaduggal/docling-model-path
Browse files Browse the repository at this point in the history
Docling models path
  • Loading branch information
mergify[bot] authored Nov 14, 2024
2 parents 49ef4f1 + 0e9d75d commit 9d4ed74
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 5 deletions.
24 changes: 22 additions & 2 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# instructlab - All of these need to go away (other than sdg) - issue #6
from xdg_base_dirs import xdg_data_dirs, xdg_data_home
import openai
import yaml

# First Party
# pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -220,6 +221,23 @@ def _sdg_init(ctx, pipeline):
data_dirs = [os.path.join(xdg_data_home(), "instructlab", "sdg")]
data_dirs.extend(os.path.join(dir, "instructlab", "sdg") for dir in xdg_data_dirs())

docling_model_path = None
sdg_models_path = docling_model_path
for d in data_dirs:
if os.path.exists(os.path.join(d, "models")):
sdg_models_path = os.path.join(d, "models")
break

if sdg_models_path is not None:
try:
with open(
os.path.join(sdg_models_path, "config.yaml"), "r", encoding="utf-8"
) as file:
config = yaml.safe_load(file)
docling_model_path = config["models"][0]["path"]
except (FileNotFoundError, NotADirectoryError, PermissionError) as e:
logger.warning(f"unable to read docling models path from config.yaml {e}")

for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
Expand Down Expand Up @@ -251,6 +269,7 @@ def load_pipeline(yaml_basename):
load_pipeline("knowledge.yaml"),
load_pipeline("freeform_skills.yaml"),
load_pipeline("grounded_skills.yaml"),
docling_model_path,
)


Expand Down Expand Up @@ -363,8 +382,8 @@ def generate_data(
max_num_tokens=max_num_tokens,
)

knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe = _sdg_init(
ctx, pipeline
knowledge_pipe, freeform_skills_pipe, grounded_skills_pipe, docling_model_path = (
_sdg_init(ctx, pipeline)
)

# Make sure checkpointing is disabled (we don't want this pipeline to load checkpoints from the main pipeline)
Expand Down Expand Up @@ -392,6 +411,7 @@ def generate_data(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=docling_model_path,
)

if not samples:
Expand Down
14 changes: 12 additions & 2 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def __new__(
server_ctx_size=4096,
chunk_word_count=1024,
tokenizer_model_name: str | None = None,
docling_model_path: str | None = None,
):
"""Insantiate the appropriate chunker for the provided document
Expand Down Expand Up @@ -145,6 +146,7 @@ def __new__(
output_dir,
chunk_word_count,
tokenizer_model_name,
docling_model_path=docling_model_path,
)

@staticmethod
Expand Down Expand Up @@ -219,6 +221,7 @@ def __init__(
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
docling_model_path=None,
):
self.document_paths = document_paths
self.filepaths = filepaths
Expand All @@ -231,6 +234,7 @@ def __init__(
)

self.tokenizer = self.create_tokenizer(tokenizer_model_name)
self.docling_model_path = docling_model_path

def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Expand All @@ -247,15 +251,21 @@ def chunk_documents(self) -> List:
if self.document_paths == []:
return []

model_artifacts_path = StandardPdfPipeline.download_models_hf()
if self.docling_model_path is None:
logger.info("Docling models not found on disk, downloading models...")
self.docling_model_path = StandardPdfPipeline.download_models_hf()
else:
logger.info("Found the docling models")

pipeline_options = PdfPipelineOptions(
artifacts_path=model_artifacts_path,
artifacts_path=self.docling_model_path,
do_ocr=False,
)
ocr_options = resolve_ocr_options()
if ocr_options is not None:
pipeline_options.do_ocr = True
pipeline_options.ocr_options = ocr_options

converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
Expand Down
4 changes: 4 additions & 0 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ def _knowledge_leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=None,
):
chunker = DocumentChunker(
leaf_node=leaf_node,
Expand All @@ -424,6 +425,7 @@ def _knowledge_leaf_node_to_samples(
server_ctx_size=server_ctx_size,
chunk_word_count=chunk_word_count,
tokenizer_model_name=model_name,
docling_model_path=docling_model_path,
)
chunks = chunker.chunk_documents()

Expand Down Expand Up @@ -453,6 +455,7 @@ def leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path=None,
):
if not leaf_node:
return []
Expand All @@ -464,5 +467,6 @@ def leaf_node_to_samples(
chunk_word_count,
document_output_dir,
model_name,
docling_model_path,
)
return _skill_leaf_node_to_samples(leaf_node)
10 changes: 10 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

# Standard
from unittest import mock
import pathlib
import typing

# Third Party
from datasets import Dataset
Expand All @@ -17,6 +19,14 @@
# Local
from .taxonomy import MockTaxonomy

TESTS_PATH = pathlib.Path(__file__).parent.absolute()


@pytest.fixture
def testdata_path() -> typing.Generator[pathlib.Path, None, None]:
    """Yield the ``testdata`` directory located directly under ``TESTS_PATH``."""
    data_dir = TESTS_PATH.joinpath("testdata")
    yield data_dir


def get_ctx(**kwargs) -> PipelineContext:
kwargs.setdefault("client", mock.MagicMock())
Expand Down
36 changes: 35 additions & 1 deletion tests/test_generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import yaml

# First Party
from instructlab.sdg.generate_data import _context_init, generate_data
from instructlab.sdg.generate_data import _context_init, _sdg_init, generate_data
from instructlab.sdg.llmblock import LLMBlock
from instructlab.sdg.pipeline import PipelineContext

Expand Down Expand Up @@ -548,3 +548,37 @@ def test_context_init_batch_size_optional():
batch_num_workers=32,
)
assert ctx.batch_size == 20


def test_sdg_init_docling_path_config_found(testdata_path):
    """Check the docling path `_sdg_init` reports for the mock XDG data dir.

    ``XDG_DATA_HOME`` is pointed at ``mock_xdg_data_dir`` under the test data
    directory while `_sdg_init` runs; the fourth element of its result must be
    ``"/mock/docling-models"``.
    """
    mock_data_home = str(testdata_path / "mock_xdg_data_dir")
    context_args = (None, "mixtral", "foo.bar", 1, "/checkpoint/dir", 1)
    # patch.dict restores os.environ on exit, so the override cannot leak
    # into other tests.
    with patch.dict(os.environ, {"XDG_DATA_HOME": mock_data_home}):
        ctx = _context_init(*context_args, batch_size=20, batch_num_workers=32)
        _knowledge, _freeform, _grounded, resolved_path = _sdg_init(ctx, "full")
    assert resolved_path == "/mock/docling-models"


def test_sdg_init_docling_path_config_not_found(testdata_path):
    """Check that `_sdg_init` reports no docling path for ``nonexistent_dir``.

    ``XDG_DATA_HOME`` is pointed at ``nonexistent_dir`` under the test data
    directory while `_sdg_init` runs; the fourth element of its result must be
    ``None``.
    """
    missing_data_home = str(testdata_path / "nonexistent_dir")
    context_args = (None, "mixtral", "foo.bar", 1, "/checkpoint/dir", 1)
    # patch.dict restores os.environ on exit, so the override cannot leak
    # into other tests.
    with patch.dict(os.environ, {"XDG_DATA_HOME": missing_data_home}):
        ctx = _context_init(*context_args, batch_size=20, batch_num_workers=32)
        _knowledge, _freeform, _grounded, resolved_path = _sdg_init(ctx, "full")
    assert resolved_path is None
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# NOTE(review): appears to be the mock docling model config used by the
# docling-path tests, which expect "/mock/docling-models" — confirm the
# file's location under the test data directory.
models:
- path: /mock/docling-models
source: https://huggingface.co/ds4sd/docling-models
revision: main

0 comments on commit 9d4ed74

Please sign in to comment.