Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RuntimeError: Internal: src/sentencepiece_processor.cc(1101) [model_proto->ParseFromArray(serialized.data(), serialized.size())] #42

Open
liuyang77886 opened this issue Jun 24, 2023 · 0 comments

Comments

@liuyang77886
Copy link

环境:
Python 3.10.8
cat /proc/version
Linux version 5.4.0-99-generic (buildd@lgw01-amd64-007) (gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04)) #112-Ubuntu SMP Thu Feb 3 13:50:55 UTC 2022
root@autodl-container-28d9119efa-9edf73a5:
/autodl-tmp/Chinese-LangChain# cat /etc/issue
Ubuntu 22.04.1 LTS \n \l

我自己下载了模型
/root/autodl-tmp/THUDM/chatglm-6b-int4
/root/autodl-tmp/GanymedeNil/text2vec-large-chinese

做了如下修改:

llm_model_name = '/root/autodl-tmp/THUDM/chatglm-6b-int4'  # 本地模型文件 or huggingface远程仓库
embedding_model_name = '/root/autodl-tmp/GanymedeNil/text2vec-large-chinese'  # 检索模型文件 or huggingface远程仓库

运行:
python main.py
报错:

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /root/autodl-tmp/Chinese-LangChain/main.py:26 in │
│ │
│ 23 │
│ 24 │
│ 25 config = LangChainCFG() │
│ ❱ 26 application = LangChainApplication(config) │
│ 27 │
│ 28 application.source_service.init_source_vector() │
│ 29 │
│ │
│ /root/autodl-tmp/Chinese-LangChain/clc/langchain_application.py:24 in __init__
│ │
│ 21 │ def __init__(self, config): │
│ 22 │ │ self.config = config │
│ 23 │ │ self.llm_service = ChatGLMService() │
│ ❱ 24 │ │ self.llm_service.load_model(model_name_or_path=self.config.llm_model_name) │
│ 25 │ │ # self.llm_service.load_model_on_gpus(model_name_or_path=self.config.llm_model_n │
│ 26 │ │ self.source_service = SourceService(config) │
│ 27 │
│ │
│ /root/autodl-tmp/Chinese-LangChain/clc/gpt_service.py:55 in load_model │
│ │
│ 52 │ │
│ 53 │ def load_model(self, │
│ 54 │ │ │ │ model_name_or_path: str = "THUDM/chatglm-6b"): │
│ ❱ 55 │ │ self.tokenizer = AutoTokenizer.from_pretrained( │
│ 56 │ │ │ model_name_or_path, │
│ 57 │ │ │ trust_remote_code=True │
│ 58 │ │ ) │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:693 │
│ in from_pretrained │
│ │
│ 690 │ │ │ │ raise ValueError( │
│ 691 │ │ │ │ │ f"Tokenizer class {tokenizer_class_candidate} does not exist or is n │
│ 692 │ │ │ │ ) │
│ ❱ 693 │ │ │ return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *input │
│ 694 │ │ │
│ 695 │ │ # Otherwise we have to be creative. │
│ 696 │ │ # if model is an encoder decoder, the encoder tokenizer class is used by default │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1812 in │
│ from_pretrained │
│ │
│ 1809 │ │ │ else: │
│ 1810 │ │ │ │ logger.info(f"loading file {file_path} from cache at {resolved_vocab_fil │
│ 1811 │ │ │
│ ❱ 1812 │ │ return cls._from_pretrained( │
│ 1813 │ │ │ resolved_vocab_files, │
│ 1814 │ │ │ pretrained_model_name_or_path, │
│ 1815 │ │ │ init_configuration, │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1975 in │
│ _from_pretrained │
│ │
│ 1972 │ │ │
│ 1973 │ │ # Instantiate tokenizer. │
│ 1974 │ │ try: │
│ ❱ 1975 │ │ │ tokenizer = cls(*init_inputs, **init_kwargs) │
│ 1976 │ │ except OSError: │
│ 1977 │ │ │ raise OSError( │
│ 1978 │ │ │ │ "Unable to load vocabulary from file. " │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4/tokenization_chatglm.py:221 │
│ in __init__
│ │
│ 218 │ │ self.mask_token = mask_token │
│ 219 │ │ self.gmask_token = gmask_token │
│ 220 │ │ │
│ ❱ 221 │ │ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens) │
│ 222 │ │ │
│ 223 │ │ """ Initialisation """ │
│ 224 │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4/tokenization_chatglm.py:64 │
│ in __init__
│ │
│ 61 │ │ self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "", "< │
│ 62 │ │ self.max_blank_length = max_blank_length │
│ 63 │ │ self.byte_fallback = byte_fallback │
│ ❱ 64 │ │ self.text_tokenizer = TextTokenizer(vocab_file) │
│ 65 │ │
│ 66 │ def _get_text_tokenizer(self): │
│ 67 │ │ return self.text_tokenizer │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4/tokenization_chatglm.py:22 │
│ in __init__
│ │
│ 19 class TextTokenizer: │
│ 20 │ def __init__(self, model_path): │
│ 21 │ │ self.sp = spm.SentencePieceProcessor() │
│ ❱ 22 │ │ self.sp.Load(model_path) │
│ 23 │ │ self.num_tokens = self.sp.vocab_size() │
│ 24 │ │
│ 25 │ def encode(self, text): │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/sentencepiece/__init__.py:905 in Load │
│ │
│ 902 │ │ raise RuntimeError('model_file and model_proto must be exclusive.') │
│ 903 │ if model_proto: │
│ 904 │ │ return self.LoadFromSerializedProto(model_proto) │
│ ❱ 905 │ return self.LoadFromFile(model_file) │
│ 906 │
│ 907 │
│ 908 # Register SentencePieceProcessor in _sentencepiece: │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/sentencepiece/__init__.py:310 in LoadFromFile │
│ │
│ 307 │ │ return _sentencepiece.SentencePieceProcessor_serialized_model_proto(self) │
│ 308 │ │
│ 309 │ def LoadFromFile(self, arg): │
│ ❱ 310 │ │ return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) │
│ 311 │ │
│ 312 │ def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, r │
│ 313 │ │ return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sam │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Internal: src/sentencepiece_processor.cc(1101) [model_proto->ParseFromArray(serialized.data(), serialized.size())]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant