Skip to content

Commit

Permalink
Increase Exception specificity for invalid model paths
Browse files Browse the repository at this point in the history
Signed-off-by: Khaled Sulayman <[email protected]>
  • Loading branch information
khaledsulayman committed Nov 14, 2024
1 parent cca058f commit 192e500
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 11 deletions.
8 changes: 5 additions & 3 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
PdfFormatOption,
)
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from instructlab.model.backends.backends import is_model_gguf, is_model_safetensors
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from tabulate import tabulate
from transformers import AutoTokenizer

# First Party
from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors

logger = logging.getLogger(__name__)
_DEFAULT_CHUNK_OVERLAP = 100

Expand Down Expand Up @@ -320,16 +322,16 @@ def create_tokenizer(model_name: Optional[str]):
tokenizer = AutoTokenizer.from_pretrained(model_path)

elif is_model_gguf(model_path):
model_dir, model_filename = model_path.parent, model_path.name
error_info_message = error_info_message.format(
download_args=f"--repository {model_dir} --filename {model_filename}"
)
model_dir, model_filename = model_path.parent, model_path.name
tokenizer = AutoTokenizer.from_pretrained(
model_dir, gguf_file=model_filename
)

else:
raise Exception(f"Received path to invalid model format {model_path}")
raise ValueError(f"Received path to invalid model format {model_path}")

logger.info(f"Successfully loaded tokenizer from: {model_path}")
return tokenizer
Expand Down
1 change: 0 additions & 1 deletion src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,6 @@ def _knowledge_leaf_node_to_samples(
document_output_dir,
model_name,
):
import ipdb; ipdb.set_trace()
chunker = DocumentChunker(
leaf_node=leaf_node,
taxonomy_path=taxonomy_path,
Expand Down
18 changes: 11 additions & 7 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
import os
from pathlib import Path
import os
import tempfile

# Third Party
Expand All @@ -21,6 +21,7 @@

TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "testdata")


@pytest.fixture
def documents_dir():
    """Directory containing the sample documents used by the chunker tests."""
    return Path(TEST_DATA_DIR, "sample_documents")
Expand Down Expand Up @@ -93,18 +94,21 @@ def test_chunker_factory_empty_filetype(documents_dir):
output_dir=temp_dir,
tokenizer_model_name=tokenizer_model_name,
)


def test_create_tokenizer(tokenizer_model_name):
    """Smoke test: loading a tokenizer from a valid model path must not raise."""
    _ = ContextAwareChunker.create_tokenizer(tokenizer_model_name)


@pytest.mark.parametrize(
    "model_name",
    [
        # GGUF file that is not actually a valid GGUF model
        "models/invalid_gguf.gguf",
        # Directory that does not contain valid safetensors files
        "models/invalid_safetensors_dir/",
        # Path that does not exist at all
        "bad_path",
    ],
)
def test_invalid_tokenizer(model_name):
    """create_tokenizer must raise ValueError for paths that are not valid models.

    The scraped diff interleaved the removed and added hunk lines here,
    leaving a syntactically invalid jumble (duplicate parametrize list and
    duplicate body); this is the reconstructed post-commit version, which
    joins each relative name onto TEST_DATA_DIR and expects the narrowed
    ValueError rather than a bare Exception.
    """
    model_path = os.path.join(TEST_DATA_DIR, model_name)
    with pytest.raises(ValueError):
        ContextAwareChunker.create_tokenizer(model_path)

0 comments on commit 192e500

Please sign in to comment.