Environment:
Python 3.10.8
cat /proc/version
Linux version 5.4.0-99-generic (buildd@lgw01-amd64-007) (gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04)) #112-Ubuntu SMP Thu Feb 3 13:50:55 UTC 2022
root@autodl-container-28d9119efa-9edf73a5:/autodl-tmp/Chinese-LangChain# cat /etc/issue
Ubuntu 22.04.1 LTS \n \l
I downloaded the models myself:
/root/autodl-tmp/THUDM/chatglm-6b-int4
/root/autodl-tmp/GanymedeNil/text2vec-large-chinese
I made the following changes (pointing both model names at the local copies):
llm_model_name = '/root/autodl-tmp/THUDM/chatglm-6b-int4'  # local model files or huggingface remote repo
embedding_model_name = '/root/autodl-tmp/GanymedeNil/text2vec-large-chinese'  # retrieval model files or huggingface remote repo
Run:
python main.py
Error:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /root/autodl-tmp/Chinese-LangChain/main.py:26 in <module> │
│ │
│ 23 │
│ 24 │
│ 25 config = LangChainCFG() │
│ ❱ 26 application = LangChainApplication(config) │
│ 27 │
│ 28 application.source_service.init_source_vector() │
│ 29 │
│ │
│ /root/autodl-tmp/Chinese-LangChain/clc/langchain_application.py:24 in __init__ │
│ │
│ 21 │ def __init__(self, config): │
│ 22 │ │ self.config = config │
│ 23 │ │ self.llm_service = ChatGLMService() │
│ ❱ 24 │ │ self.llm_service.load_model(model_name_or_path=self.config.llm_model_name) │
│ 25 │ │ # self.llm_service.load_model_on_gpus(model_name_or_path=self.config.llm_model_n │
│ 26 │ │ self.source_service = SourceService(config) │
│ 27 │
│ │
│ /root/autodl-tmp/Chinese-LangChain/clc/gpt_service.py:55 in load_model │
│ │
│ 52 │ │
│ 53 │ def load_model(self, │
│ 54 │ │ │ │ model_name_or_path: str = "THUDM/chatglm-6b"): │
│ ❱ 55 │ │ self.tokenizer = AutoTokenizer.from_pretrained( │
│ 56 │ │ │ model_name_or_path, │
│ 57 │ │ │ trust_remote_code=True │
│ 58 │ │ ) │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:693 │
│ in from_pretrained │
│ │
│ 690 │ │ │ │ raise ValueError( │
│ 691 │ │ │ │ │ f"Tokenizer class {tokenizer_class_candidate} does not exist or is n │
│ 692 │ │ │ │ ) │
│ ❱ 693 │ │ │ return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *input │
│ 694 │ │ │
│ 695 │ │ # Otherwise we have to be creative. │
│ 696 │ │ # if model is an encoder decoder, the encoder tokenizer class is used by default │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1812 in │
│ from_pretrained │
│ │
│ 1809 │ │ │ else: │
│ 1810 │ │ │ │ logger.info(f"loading file {file_path} from cache at {resolved_vocab_fil │
│ 1811 │ │ │
│ ❱ 1812 │ │ return cls._from_pretrained( │
│ 1813 │ │ │ resolved_vocab_files, │
│ 1814 │ │ │ pretrained_model_name_or_path, │
│ 1815 │ │ │ init_configuration, │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1975 in │
│ _from_pretrained │
│ │
│ 1972 │ │ │
│ 1973 │ │ # Instantiate tokenizer. │
│ 1974 │ │ try: │
│ ❱ 1975 │ │ │ tokenizer = cls(*init_inputs, **init_kwargs) │
│ 1976 │ │ except OSError: │
│ 1977 │ │ │ raise OSError( │
│ 1978 │ │ │ │ "Unable to load vocabulary from file. " │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4/tokenization_chatglm.py:221 │
│ in __init__ │
│ │
│ 218 │ │ self.mask_token = mask_token │
│ 219 │ │ self.gmask_token = gmask_token │
│ 220 │ │ │
│ ❱ 221 │ │ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens) │
│ 222 │ │ │
│ 223 │ │ """ Initialisation """ │
│ 224 │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4/tokenization_chatglm.py:64 │
│ in __init__ │
│ │
│ 61 │ │ self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "< │
│ 62 │ │ self.max_blank_length = max_blank_length │
│ 63 │ │ self.byte_fallback = byte_fallback │
│ ❱ 64 │ │ self.text_tokenizer = TextTokenizer(vocab_file) │
│ 65 │ │
│ 66 │ def _get_text_tokenizer(self): │
│ 67 │ │ return self.text_tokenizer │
│ │
│ /root/.cache/huggingface/modules/transformers_modules/chatglm-6b-int4/tokenization_chatglm.py:22 │
│ in __init__ │
│ │
│ 19 class TextTokenizer: │
│ 20 │ def __init__(self, model_path): │
│ 21 │ │ self.sp = spm.SentencePieceProcessor() │
│ ❱ 22 │ │ self.sp.Load(model_path) │
│ 23 │ │ self.num_tokens = self.sp.vocab_size() │
│ 24 │ │
│ 25 │ def encode(self, text): │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/sentencepiece/__init__.py:905 in Load │
│ │
│ 902 │ │ raise RuntimeError('model_file and model_proto must be exclusive.') │
│ 903 │ if model_proto: │
│ 904 │ │ return self.LoadFromSerializedProto(model_proto) │
│ ❱ 905 │ return self.LoadFromFile(model_file) │
│ 906 │
│ 907 │
│ 908 # Register SentencePieceProcessor in _sentencepiece: │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/sentencepiece/__init__.py:310 in LoadFromFile │
│ │
│ 307 │ │ return _sentencepiece.SentencePieceProcessor_serialized_model_proto(self) │
│ 308 │ │
│ 309 │ def LoadFromFile(self, arg): │
│ ❱ 310 │ │ return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg) │
│ 311 │ │
│ 312 │ def _EncodeAsIds(self, text, enable_sampling, nbest_size, alpha, add_bos, add_eos, r │
│ 313 │ │ return _sentencepiece.SentencePieceProcessor__EncodeAsIds(self, text, enable_sam │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Internal: src/sentencepiece_processor.cc(1101) [model_proto->ParseFromArray(serialized.data(), serialized.size())]
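A likely cause, for anyone hitting the same trace: this SentencePiece failure (ParseFromArray in sentencepiece_processor.cc) usually means the tokenizer vocab file on disk is corrupt or incomplete. A common culprit is cloning the model repo without Git LFS, which leaves ice_text.model as a small pointer stub instead of the real multi-megabyte binary. A minimal check, assuming the local path from the config above and the standard ChatGLM-6B tokenizer filename ice_text.model (adjust if your snapshot names it differently):

import os

# Hedged diagnostic sketch: verify the SentencePiece vocab is a real model
# and not a Git LFS pointer stub. Path and filename are assumptions taken
# from the config above and the usual ChatGLM-6B layout.
vocab_file = "/root/autodl-tmp/THUDM/chatglm-6b-int4/ice_text.model"

print(os.path.getsize(vocab_file))  # a pointer stub is ~130 bytes; the real file is a few MB

with open(vocab_file, "rb") as f:
    print(f.read(40))  # a stub starts with b"version https://git-lfs.github.com/spec"

If the file turns out to be a stub, re-fetch it with git lfs pull inside the model directory (or re-download ice_text.model from the Hugging Face repo) and rerun python main.py; the same check is worth running on the text2vec-large-chinese files.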