Skip to content

Commit

Permalink
Increase Exception specificity for invalid model paths
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Nov 14, 2024
1 parent cca058f commit 192e500
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 11 deletions.
8 changes: 5 additions & 3 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
PdfFormatOption,
)
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from instructlab.model.backends.backends import is_model_gguf, is_model_safetensors
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from tabulate import tabulate
from transformers import AutoTokenizer

# First Party
from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors

logger = logging.getLogger(__name__)
_DEFAULT_CHUNK_OVERLAP = 100

Expand Down Expand Up @@ -320,16 +322,16 @@ def create_tokenizer(model_name: Optional[str]):
tokenizer = AutoTokenizer.from_pretrained(model_path)

elif is_model_gguf(model_path):
model_dir, model_filename = model_path.parent, model_path.name
error_info_message = error_info_message.format(
download_args=f"--repository {model_dir} --filename {model_filename}"
)
model_dir, model_filename = model_path.parent, model_path.name
tokenizer = AutoTokenizer.from_pretrained(
model_dir, gguf_file=model_filename
)

else:
raise Exception(f"Received path to invalid model format {model_path}")
raise ValueError(f"Received path to invalid model format {model_path}")

logger.info(f"Successfully loaded tokenizer from: {model_path}")
return tokenizer
Expand Down
1 change: 0 additions & 1 deletion src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,6 @@ def _knowledge_leaf_node_to_samples(
document_output_dir,
model_name,
):
import ipdb; ipdb.set_trace()
chunker = DocumentChunker(
leaf_node=leaf_node,
taxonomy_path=taxonomy_path,
Expand Down
18 changes: 11 additions & 7 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import os
from pathlib import Path
import os
import tempfile

# Third Party
Expand All @@ -21,6 +21,7 @@

TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")


@pytest.fixture
def documents_dir():
    """Directory containing the sample documents used by the chunker tests."""
    return Path(TEST_DATA_DIR, "sample_documents")
Expand Down Expand Up @@ -93,18 +94,21 @@ def test_chunker_factory_empty_filetype(documents_dir):
output_dir=temp_dir,
tokenizer_model_name=tokenizer_model_name,
)


def test_create_tokenizer(tokenizer_model_name):
    """Smoke test: loading a tokenizer from a valid model path must not raise."""
    _ = ContextAwareChunker.create_tokenizer(tokenizer_model_name)


@pytest.mark.parametrize(
    "model_name",
    [
        # GGUF file that is not actually a valid GGUF model
        "models/invalid_gguf.gguf",
        # Directory that does not contain valid safetensors files
        "models/invalid_safetensors_dir/",
        # Path that does not exist at all
        "bad_path",
    ],
)
def test_invalid_tokenizer(model_name):
    """create_tokenizer must raise ValueError for paths that are not valid models.

    The scraped diff interleaved the removed and added hunk lines here,
    leaving a syntactically invalid jumble (duplicate parametrize list and
    duplicate body); this is the reconstructed post-commit version, which
    joins each relative name onto TEST_DATA_DIR and expects the narrowed
    ValueError rather than a bare Exception.
    """
    model_path = os.path.join(TEST_DATA_DIR, model_name)
    with pytest.raises(ValueError):
        ContextAwareChunker.create_tokenizer(model_path)

0 comments on commit 192e500

Please sign in to comment.