Add error logging for failed tokenizer loading
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Nov 12, 2024
1 parent 2aa3aee commit 13537a7
Showing 1 changed file with 13 additions and 10 deletions.
23 changes: 13 additions & 10 deletions src/instructlab/sdg/utils/chunkers.py
@@ -290,7 +290,8 @@ def fuse_texts(

         return fused_texts
 
-    def create_tokenizer(self, model_name: str):
+    @staticmethod
+    def create_tokenizer(cls, model_name: str):
         """
         Create a tokenizer instance from a pre-trained model or a local directory.
@@ -301,24 +302,26 @@ def create_tokenizer(self, model_name: str):
             AutoTokenizer: The tokenizer instance.
         """
-        # Third Party
-        import ipdb
-        ipdb.set_trace()
         model_path = Path(model_name)
+        error_info_message = "Please run ilab model download {download_args} and try again"
         try:
             if is_model_safetensors(model_path):
                 tokenizer = AutoTokenizer.from_pretrained(model_path)
+                error_info_message = error_info_message.format(download_args=f"--repository {model_path}")
             elif is_model_gguf(model_path):
-                tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
+                model_dir, model_filename = model_path.parent, model_path.name
+                tokenizer = AutoTokenizer.from_pretrained(model_dir, gguf_file=model_filename)
+                error_info_message = error_info_message.format(download_args=f"--repository {model_dir} --filename {model_filename}")
             else:
                 raise Exception(f"Received path to invalid model format {model_path}")
             logger.info(f"Successfully loaded tokenizer from: {model_path}")
             return tokenizer
-        except Exception as e:
+        except (OSError, ValueError) as e:
             logger.error(
-                f"Failed to load tokenizer as model was not found at {model_path}."
-                "Please run `ilab model download {model_name} and try again\n"
-                "{str(e)}"
+                f"Failed to load tokenizer as model was not found at {model_path}. {error_info_message}"
             )
-            raise
+            raise e
 
     def get_token_count(self, text, tokenizer):
         """
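For context, a minimal standalone sketch of the pattern this commit introduces: build an ilab model download hint specific to the model format, then log it alongside the error when the tokenizer cannot be loaded. This is illustrative only, not the repository's code; the load_tokenizer name, the Path.is_dir() check standing in for is_model_safetensors/is_model_gguf, and the plain logging setup are assumptions.

# Illustrative sketch -- mirrors the error-hint pattern from the diff above,
# not the repository's implementation.
import logging
from pathlib import Path

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


def load_tokenizer(model_name: str):
    model_path = Path(model_name)
    error_info_message = "Please run ilab model download {download_args} and try again"
    try:
        if model_path.is_dir():
            # Assumption: a directory of safetensors weights
            # (the repository uses is_model_safetensors here)
            error_info_message = error_info_message.format(
                download_args=f"--repository {model_path}"
            )
            return AutoTokenizer.from_pretrained(model_path)
        # Assumption: a single GGUF file (the repository uses is_model_gguf);
        # transformers takes the directory plus the gguf_file name separately
        error_info_message = error_info_message.format(
            download_args=f"--repository {model_path.parent} --filename {model_path.name}"
        )
        return AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
    except (OSError, ValueError):
        logger.error(
            "Failed to load tokenizer as model was not found at %s. %s",
            model_path,
            error_info_message,
        )
        raise

With a missing path, the logged message now ends with a concrete download suggestion (repository, and filename for GGUF models) instead of a bare stack trace, which is the behavior the changed except block above is aiming for.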
