diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dc3163e4..983a308b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -100,7 +100,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install tox tox-gh>=1.2
 
-      - name: Run unit tests with tox
+      - name: Run unit and functional tests with tox
         run: |
           tox
 
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 881153dc..3bdea8a2 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -97,6 +97,8 @@ def __new__(
         doc_dict = cls._split_docs_by_filetype(documents, filepaths)
         if len(doc_dict.keys()) > 1:
             raise ValueError("Received multiple document types")
+        if not doc_dict:
+            raise ValueError("Received no document types")
 
         if FileTypes.MD in doc_dict:
             doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]
diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
new file mode 100644
index 00000000..08e51987
--- /dev/null
+++ b/tests/functional/test_chunkers.py
@@ -0,0 +1,54 @@
+# Standard
+import os
+from pathlib import Path
+
+# Third Party
+from openai import OpenAI
+
+# First Party
+from instructlab.sdg.utils.chunkers import DocumentChunker
+
+TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")
+
+openai_api_key = os.environ.get("OPENAI_API_KEY", "EMPTY")
+openai_api_base = os.environ.get("OPENAI_API_BASE", "http://localhost:8000/v1")
+
+# TODO: Confirm whether qna.yaml needs any contents beyond these minimal fields.
+knowledge_pdf_qna = """
+version: 3
+domain: astronomy
+"""
+
+
+def test_chunk_pdf(tmp_path):
+    """Chunk testdata/phoenix.pdf and check several chunks with the expected text."""
+    # NOTE(review): client is not passed to DocumentChunker below; confirm
+    # whether this test still needs it.
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    # Write the minimal qna.yaml into <tmp_path>/knowledge, the same directory
+    # name the leaf node uses as its taxonomy_path.
+    qna_dir = os.path.join(tmp_path, "knowledge")
+    os.makedirs(qna_dir)
+    with open(os.path.join(qna_dir, "qna.yaml"), "w", encoding="utf-8") as f:
+        f.write(knowledge_pdf_qna)
+
+    leaf_node = [{
+        "documents": ["Lorem ipsum"],
+        "filepaths": [Path(os.path.join(TEST_DATA_DIR, "phoenix.pdf"))],
+        "taxonomy_path": "knowledge",
+    }]
+    chunker = DocumentChunker(
+        leaf_node=leaf_node,
+        taxonomy_path=tmp_path,
+        output_dir=tmp_path,
+        server_ctx_size=4096,
+        chunk_word_count=500,
+        tokenizer_model_name="instructlab/merlinite-7b-lab",
+    )
+    chunks = chunker.chunk_documents()
+    assert len(chunks) > 1
+    assert "Phoenix is a minor constellation" in chunks[0]
diff --git a/tests/functional/testdata/phoenix.pdf b/tests/functional/testdata/phoenix.pdf
new file mode 100644
index 00000000..5430e78d
Binary files /dev/null and b/tests/functional/testdata/phoenix.pdf differ
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
index 04970d24..84bfb0e1 100644
--- a/tests/test_chunkers.py
+++ b/tests/test_chunkers.py
@@ -67,3 +67,22 @@ def test_chunker_factory_unsupported_filetype(documents_dir):
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
+
+
+def test_chunker_factory_empty_filetype(documents_dir):
+    """Test that the DocumentChunker factory class fails when provided no documents"""
+    leaf_node = [
+        {
+            "documents": [],
+            "taxonomy_path": "",
+            "filepaths": [],
+        }
+    ]
+    with pytest.raises(ValueError):
+        with tempfile.TemporaryDirectory() as temp_dir:
+            _ = DocumentChunker(
+                leaf_node=leaf_node,
+                taxonomy_path=documents_dir,
+                output_dir=temp_dir,
+                tokenizer_model_name="instructlab/merlinite-7b-lab",
+            )
diff --git a/tox.ini b/tox.ini
index 1c6d2812..38f42650 100644
--- a/tox.ini
+++ b/tox.ini
@@ -3,11 +3,11 @@
 [tox]
 # py3-unit runs unit tests with 'python3'
 # py311-unit runs the same tests with 'python3.11'
-envlist = ruff, lint, mypy, spellcheck, py3-unit
+envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional}
 minversion = 4.4
 
 [testenv]
-description = run tests (unit, unitcov)
+description = run tests (unit, unitcov, functional)
 # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies
 # are huge. This reduces venv from 5.7 GB to 1.5 GB.
 setenv =
@@ -16,8 +16,19 @@ package = wheel
 wheel_build_env = pkg
 deps = -r requirements-dev.txt
 commands =
-    unit: {envpython} -m pytest {posargs:tests}
-    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"}
+    unit: {envpython} -m pytest {posargs:tests --ignore=tests/functional}
+    unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.sdg --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests --ignore=tests/functional -m "not (examples or slow)"}
+    functional: {envpython} -m pytest {posargs:tests/functional}
+# NOTE(review): no command above invokes this script; confirm the entry is needed.
+allowlist_externals =
+    functional: ./scripts/functional-tests.sh
+
+[testenv:py3-functional]
+# A per-env setenv replaces the [testenv] one, so pull the base values back in.
+setenv =
+    {[testenv]setenv}
+    OPENAI_API_BASE={env:OPENAI_API_BASE:http://localhost:8000/v1}
+    OPENAI_API_KEY={env:OPENAI_API_KEY:EMPTY}
 
 # format, check, and linting targets don't build and install the project to
 # speed up testing.