Expand chunking testing, including new functional tests

This adds pytest-based functional tests to our repo, starting with a basic PDF chunking test. While writing that test I discovered a minor bug in DocumentChunker: when given no documents, it did not instantiate any chunker. This commit adds a unit test for that case and a minor change so that DocumentChunker raises a ValueError instead.

Signed-off-by: Ben Browning <[email protected]>
instructlab · Nov 8, 2024 · f76d1a1 · f76d1a1
1 parent eacae02
commit f76d1a1
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 7 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -100,7 +100,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install tox tox-gh>=1.2
 
-      - name: Run unit tests with tox
+      - name: Run unit and functional tests with tox
         run: |
           tox
 

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
@@ -97,6 +97,8 @@ def __new__(
         doc_dict = cls._split_docs_by_filetype(documents, filepaths)
         if len(doc_dict.keys()) > 1:
             raise ValueError("Received multiple document types")
+        if len(doc_dict.keys()) < 1:
+            raise ValueError("Received no document types")
 
         if FileTypes.MD in doc_dict:
             doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
@@ -0,0 +1,51 @@
+# Standard
+from pathlib import Path
+import os
+
+# Third Party
+from openai import OpenAI
+
+# First Party
+from instructlab.sdg.utils.chunkers import DocumentChunker
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
+
+openai_api_key = os.environ.get("OPENAI_API_KEY", "EMPTY")
+openai_api_base = os.environ.get("OPENAI_API_BASE", "http://localhost:8000/v1")
+
+# TODO: Confirm that this test really needs no qna.yaml contents beyond version and domain.
+knowledge_pdf_qna = """
+version: 3
+domain: astronomy
+"""
+
+
+def test_chunk_pdf(tmp_path):
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    qna_dir = os.path.join(tmp_path, "knowledge")
+    os.makedirs(qna_dir)
+    with open(os.path.join(qna_dir, "qna.yaml"), "w", encoding="utf-8") as f:
+        f.write(knowledge_pdf_qna)
+
+    leaf_node = [
+        {
+            "documents": ["Lorem ipsum"],
+            "filepaths": [Path(os.path.join(TEST_DATA_DIR, "phoenix.pdf"))],
+            "taxonomy_path": "knowledge",
+        }
+    ]
+    chunker = DocumentChunker(
+        leaf_node=leaf_node,
+        taxonomy_path=tmp_path,
+        output_dir=tmp_path,
+        server_ctx_size=4096,
+        chunk_word_count=500,
+        tokenizer_model_name="instructlab/merlinite-7b-lab",
+    )
+    chunks = chunker.chunk_documents()
+    assert len(chunks) > 1
+    assert "Phoenix is a minor constellation" in chunks[0]
diff --git a/tests/functional/testdata/phoenix.pdf b/tests/functional/testdata/phoenix.pdf
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
@@ -67,3 +67,22 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
+
+
+def test_chunker_factory_empty_filetype(documents_dir):
+    """Test that the DocumentChunker factory class fails when provided no document"""
+    leaf_node = [
+        {
+            "documents": [],
+            "taxonomy_path": "",
+            "filepaths": [],
+        }
+    ]
+    with pytest.raises(ValueError):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            _ = DocumentChunker(
+                leaf_node=leaf_node,
+                taxonomy_path=documents_dir,
+                output_dir=temp_dir,
+                tokenizer_model_name="instructlab/merlinite-7b-lab",
+            )
diff --git a/tox.ini b/tox.ini
@@ -3,11 +3,11 @@
 [tox]
 # py3-unit runs unit tests with 'python3'
 # py311-unit runs the same tests with 'python3.11'
-envlist = ruff, lint, mypy, spellcheck, py3-unit
+envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional}
 minversion = 4.4
 
 [testenv]
-description = run tests (unit, unitcov)
+description = run tests (unit, unitcov, functional)
 # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
 # are huge. This reduces venv from 5.7 GB to 1.5 GB.
 setenv =
@@ -16,8 +16,16 @@ package = wheel
 wheel_build_env = pkg
 deps = -r requirements-dev.txt
 commands =
-    unit: {envpython} -m pytest {posargs:tests}
-    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"}
+    unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional}
+    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"}
+    functional: {envpython} -m pytest {posargs:tests/functional}
+allowlist_externals =
+    functional: ./scripts/functional-tests.sh
+
+[testenv:py3-functional]
+setenv =
+    OPENAI_API_BASE={env:OPENAI_API_BASE:http://localhost:8000/v1}
+    OPENAI_API_KEY={env:OPENAI_API_KEY:EMPTY}
 
 # format, check, and linting targets don't build and install the project to
 # speed up testing.
@@ -82,5 +90,5 @@ commands =
 
 [gh]
 python =
-    3.11 = py311-unitcov
-    3.10 = py310-unitcov
+    3.11 = py311-{unitcov, functional}
+    3.10 = py310-{unitcov, functional}