Skip to content

Commit

Permalink
Expand chunking testing, including new functional tests
Browse files Browse the repository at this point in the history
This adds pytest-based functional tests to our repo with a basic PDF
chunking test.

While writing that test I discovered a minor bug in DocumentChunker:
when given no documents, it previously did not instantiate any chunker.
This commit adds a unit test for that case and a small change that
raises a ValueError instead.

Signed-off-by: Ben Browning <[email protected]>
  • Loading branch information
bbrowning committed Nov 8, 2024
1 parent eacae02 commit be4ef14
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
python -m pip install --upgrade pip
# Quote the requirement: unquoted, the shell parses ">=1.2" as a redirection
# of stdout to a file named "=1.2" and the version constraint is dropped.
python -m pip install tox "tox-gh>=1.2"
- name: Run unit tests with tox
- name: Run unit and functional tests with tox
run: |
tox
Expand Down
2 changes: 2 additions & 0 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ def __new__(
doc_dict = cls._split_docs_by_filetype(documents, filepaths)
if len(doc_dict.keys()) > 1:
raise ValueError("Received multiple document types")
if len(doc_dict.keys()) < 1:
raise ValueError("Received no document types")

if FileTypes.MD in doc_dict:
doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]
Expand Down
48 changes: 48 additions & 0 deletions tests/functional/test_chunkers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Standard
import os
from pathlib import Path

# Third Party
from openai import OpenAI

# First Party
from instructlab.sdg.utils.chunkers import DocumentChunker

# Directory next to this module that holds the fixture documents.
TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")

# Settings for an OpenAI-compatible endpoint; each can be overridden through
# the environment and otherwise falls back to the default given here.
openai_api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
openai_api_base = os.getenv("OPENAI_API_BASE", "http://localhost:8000/v1")

# TODO: Apparently we don't really need any contents in the qna.yaml?
knowledge_pdf_qna = "\nversion: 3\ndomain: astronomy\n"

def test_chunk_pdf(tmp_path):
    """Chunk the ``phoenix.pdf`` fixture and check the resulting chunks.

    Writes a minimal ``qna.yaml`` into ``<tmp_path>/knowledge``, builds a
    ``DocumentChunker`` for a single leaf node pointing at the PDF fixture,
    and asserts that more than one chunk is produced and that the first
    chunk contains the expected text.
    """
    # tmp_path is a pathlib.Path, so build the taxonomy directory with it
    # directly instead of going through os.path.
    qna_dir = tmp_path / "knowledge"
    qna_dir.mkdir()
    (qna_dir / "qna.yaml").write_text(knowledge_pdf_qna, encoding="utf-8")

    leaf_node = [
        {
            # Placeholder text; the assertions below look for content that
            # is not in this string (presumably read from the PDF itself).
            "documents": ["Lorem ipsum"],
            "filepaths": [Path(TEST_DATA_DIR) / "phoenix.pdf"],
            "taxonomy_path": "knowledge",
        }
    ]
    chunker = DocumentChunker(
        leaf_node=leaf_node,
        taxonomy_path=tmp_path,
        output_dir=tmp_path,
        server_ctx_size=4096,
        chunk_word_count=500,
        tokenizer_model_name="instructlab/merlinite-7b-lab",
    )
    chunks = chunker.chunk_documents()
    assert len(chunks) > 1
    assert "Phoenix is a minor constellation" in chunks[0]
Binary file added tests/functional/testdata/phoenix.pdf
Binary file not shown.
18 changes: 18 additions & 0 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,21 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
output_dir=temp_dir,
tokenizer_model_name="instructlab/merlinite-7b-lab",
)

def test_chunker_factory_empty_filetype(documents_dir):
    """DocumentChunker must raise ValueError for a leaf node with no documents."""
    empty_leaf_node = [{"documents": [], "taxonomy_path": "", "filepaths": []}]
    # pytest.raises stays the outer context so the temporary directory is
    # cleaned up before the expected exception is checked.
    with pytest.raises(ValueError), tempfile.TemporaryDirectory() as temp_dir:
        DocumentChunker(
            leaf_node=empty_leaf_node,
            taxonomy_path=documents_dir,
            output_dir=temp_dir,
            tokenizer_model_name="instructlab/merlinite-7b-lab",
        )
16 changes: 12 additions & 4 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
[tox]
# py3-unit runs unit tests with 'python3'
# py311-unit runs the same tests with 'python3.11'
envlist = ruff, lint, mypy, spellcheck, py3-unit
envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional}
minversion = 4.4

[testenv]
description = run tests (unit, unitcov)
description = run tests (unit, unitcov, functional)
# Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
# are huge. This reduces venv from 5.7 GB to 1.5 GB.
setenv =
Expand All @@ -16,8 +16,16 @@ package = wheel
wheel_build_env = pkg
deps = -r requirements-dev.txt
commands =
unit: {envpython} -m pytest {posargs:tests}
unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"}
unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional}
unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"}
functional: {envpython} -m pytest {posargs:tests/functional}
allowlist_externals =
functional: ./scripts/functional-tests.sh

[testenv:py3-functional]
setenv =
OPENAI_API_BASE={env:OPENAI_API_BASE:http://localhost:8000/v1}
OPENAI_API_KEY={env:OPENAI_API_KEY:EMPTY}

# format, check, and linting targets don't build and install the project to
# speed up testing.
Expand Down

0 comments on commit be4ef14

Please sign in to comment.