instructlab · mergify · Nov 8, 2024 · Nov 7, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -100,7 +100,8 @@ jobs:
           python -m pip install --upgrade pip
-          python -m pip install tox tox-gh>=1.2
+          # Quote the requirement: unquoted, the shell parses ">=1.2" as a redirect.
+          python -m pip install tox "tox-gh>=1.2"
 
-      - name: Run unit tests with tox
+      - name: Run unit and functional tests with tox
         run: |
           tox
 

diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml
@@ -13,3 +13,4 @@ ignores:
   - ".github/**"
   - "venv/**"
   - ".venv/**"
+  - "**/testdata/**"
diff --git a/.spellcheck.yml b/.spellcheck.yml
@@ -8,7 +8,7 @@ matrix:
     camel-case: true
     mode: markdown
   sources:
-  - "**/*.md|!.tox/**|!venv/**"
+  - "**/*.md|!.tox/**|!venv/**|!**/testdata/**"
   dictionary:
     wordlists:
     - .spellcheck-en-custom.txt

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling>=2.3.0,<3.0.0
+docling>=2.4.2,<3.0.0
 GitPython>=3.1.42,<4.0.0
 httpx>=0.25.0,<1.0.0
 instructlab-schema>=0.4.0

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
@@ -95,6 +95,8 @@ def __new__(
         doc_dict = cls._split_docs_by_filetype(documents, filepaths)
         if len(doc_dict.keys()) > 1:
             raise ValueError("Received multiple document types")
+        if len(doc_dict.keys()) < 1:
+            raise ValueError("Received no document types")
 
         if FileTypes.MD in doc_dict:
             doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
@@ -0,0 +1,56 @@
+# Standard
+from pathlib import Path
+import os
+
+# First Party
+from instructlab.sdg.utils.chunkers import DocumentChunker
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
+
+
+def test_chunk_pdf(tmp_path):
+    """Chunk the sample PDF and sanity-check the chunk count, text and sizes."""
+    pdf_file = Path(TEST_DATA_DIR) / "phoenix.pdf"
+    leaf_node = [
+        {
+            "documents": ["Lorem ipsum"],
+            "filepaths": [pdf_file],
+            "taxonomy_path": "knowledge",
+        }
+    ]
+    chunks = DocumentChunker(
+        leaf_node=leaf_node,
+        taxonomy_path=tmp_path,
+        output_dir=tmp_path,
+        server_ctx_size=4096,
+        chunk_word_count=500,
+        tokenizer_model_name="instructlab/merlinite-7b-lab",
+    ).chunk_documents()
+    assert len(chunks) > 9
+    assert "Phoenix is a minor constellation" in chunks[0]
+    # inexact sanity-checking of chunk max length (longest chunk bounds them all)
+    assert max(map(len, chunks)) < 2500
+
+
+def test_chunk_md(tmp_path):
+    """Chunk the sample Markdown file and sanity-check the chunk count and sizes."""
+    md_file = Path(TEST_DATA_DIR) / "phoenix.md"
+    md_text = md_file.read_text(encoding="utf-8")
+    leaf_node = [
+        {
+            "documents": [md_text],
+            "filepaths": [md_file],
+            "taxonomy_path": "knowledge",
+        }
+    ]
+    chunks = DocumentChunker(
+        leaf_node=leaf_node,
+        taxonomy_path=tmp_path,
+        output_dir=tmp_path,
+        server_ctx_size=4096,
+        chunk_word_count=500,
+        tokenizer_model_name="instructlab/merlinite-7b-lab",
+    ).chunk_documents()
+    assert len(chunks) > 7
+    # inexact sanity-checking of chunk max length (longest chunk bounds them all)
+    assert max(map(len, chunks)) < 2500
diff --git a/tests/functional/testdata/phoenix.md b/tests/functional/testdata/phoenix.md
diff --git a/tests/functional/testdata/phoenix.pdf b/tests/functional/testdata/phoenix.pdf
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
@@ -67,3 +67,22 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
+
+
+def test_chunker_factory_empty_filetype(documents_dir):
+    """Test that DocumentChunker raises "no document types" for an empty leaf node"""
+    leaf_node = [
+        {
+            "documents": [],
+            "taxonomy_path": "",
+            "filepaths": [],
+        }
+    ]
+    with pytest.raises(ValueError, match="Received no document types"):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            _ = DocumentChunker(
+                leaf_node=leaf_node,
+                taxonomy_path=documents_dir,
+                output_dir=temp_dir,
+                tokenizer_model_name="instructlab/merlinite-7b-lab",
+            )
diff --git a/tox.ini b/tox.ini
@@ -3,11 +3,11 @@
 [tox]
 # py3-unit runs unit tests with 'python3'
 # py311-unit runs the same tests with 'python3.11'
-envlist = ruff, lint, mypy, spellcheck, py3-unit
+envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional}
 minversion = 4.4
 
 [testenv]
-description = run tests (unit, unitcov)
+description = run tests (unit, unitcov, functional)
 # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
 # are huge. This reduces venv from 5.7 GB to 1.5 GB.
 setenv =
@@ -16,8 +16,16 @@ package = wheel
 wheel_build_env = pkg
 deps = -r requirements-dev.txt
 commands =
-    unit: {envpython} -m pytest {posargs:tests}
-    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"}
+    unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional}
+    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"}
+    functional: {envpython} -m pytest {posargs:tests/functional}
+allowlist_externals =
+    functional: ./scripts/functional-tests.sh
+
+[testenv:py3-functional]
+setenv =
+    OPENAI_API_BASE={env:OPENAI_API_BASE:http://localhost:8000/v1}
+    OPENAI_API_KEY={env:OPENAI_API_KEY:EMPTY}
 
 # format, check, and linting targets don't build and install the project to
 # speed up testing.
@@ -82,5 +90,5 @@ commands =
 
 [gh]
 python =
-    3.11 = py311-unitcov
-    3.10 = py310-unitcov
+    3.11 = py311-{unitcov, functional}
+    3.10 = py310-{unitcov, functional}