diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 393221bf..1db7ecb2 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -305,18 +305,23 @@ def create_tokenizer(self, model_name: str):
         model_path = Path(model_name)
+        error_info_message = "Please run ilab model download {download_args} and try again"
         try:
             if is_model_safetensors(model_path):
                 tokenizer = AutoTokenizer.from_pretrained(model_path)
+                error_info_message = error_info_message.format(download_args=f"--repository {model_path}")
             elif is_model_gguf(model_path):
-                tokenizer = AutoTokenizer.from_pretrained(model_path.parent, gguf_file=model_path.name)
+                model_dir, model_filename = model_path.parent, model_path.name
+                tokenizer = AutoTokenizer.from_pretrained(model_dir, gguf_file=model_filename)
+                error_info_message = error_info_message.format(download_args=f"--repository {model_dir} --filename {model_filename}")
+            else:
+                raise Exception(f"Received path to invalid model format {model_path}")
             logger.info(f"Successfully loaded tokenizer from: {model_path}")
             return tokenizer
-        except Exception as e:
+        except (OSError, ValueError) as e:
             logger.error(
-                f"Failed to load tokenizer as model was not found at {model_path}."
-                "Please run `ilab model download {model_name} and try again\n"
-                "{str(e)}" + str(e),
+                f"Failed to load tokenizer as model was not found at {model_path}. {error_info_message}"
             )
             raise
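
For context, here is a rough, self-contained sketch of the control flow the patch ends up with. It is not the repository's actual code: the `is_model_safetensors` and `is_model_gguf` checks are replaced with simplified stand-ins, `load_tokenizer` is a hypothetical name, and the `gguf_file` keyword argument is assumed to be available in the installed transformers version.

```python
import logging
from pathlib import Path

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)


def load_tokenizer(model_name: str):
    """Roughly mirrors the patched create_tokenizer() control flow (illustrative only)."""
    model_path = Path(model_name)
    # Hint shown to the user when loading fails; filled in per model format.
    error_info_message = "Please run `ilab model download {download_args}` and try again"
    try:
        if model_path.is_dir() and any(model_path.glob("*.safetensors")):
            # Simplified stand-in for is_model_safetensors(): a Hugging Face style directory.
            error_info_message = error_info_message.format(
                download_args=f"--repository {model_path}"
            )
            tokenizer = AutoTokenizer.from_pretrained(model_path)
        elif model_path.is_file() and model_path.suffix == ".gguf":
            # Simplified stand-in for is_model_gguf(): a single GGUF file. Recent
            # transformers releases accept the gguf_file keyword for this case.
            model_dir, model_filename = model_path.parent, model_path.name
            error_info_message = error_info_message.format(
                download_args=f"--repository {model_dir} --filename {model_filename}"
            )
            tokenizer = AutoTokenizer.from_pretrained(model_dir, gguf_file=model_filename)
        else:
            error_info_message = "Only safetensors directories and GGUF files are handled here"
            raise ValueError(f"Received path to invalid model format {model_path}")
        logger.info("Successfully loaded tokenizer from: %s", model_path)
        return tokenizer
    except (OSError, ValueError) as err:
        # OSError covers a missing or unreadable model; ValueError covers the
        # unsupported-format branch above and malformed tokenizer configs.
        logger.error(
            "Failed to load tokenizer from %s. %s (%s)",
            model_path,
            error_info_message,
            err,
        )
        raise
```

Narrowing the handler from `Exception` to `(OSError, ValueError)` keeps the download hint for genuine lookup and format failures while letting unrelated programming errors propagate unchanged, and formatting `error_info_message` per branch lets the log name the exact `ilab model download` arguments for the model that failed to load.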