43 changes: 43 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -18670,5 +18670,48 @@
"#system_numpy#"
]
}
},
{
"version": 2,
"context_length": 131072,
"model_name": "mistral-small-3.2-instruct",
"model_lang": [
"en",
"fr",
"de",
"es",
"it",
"pt",
"zh",
"ru",
"ja",
"ko"
    ],
    "model_ability": [
      "chat",
      "vision"
    ],
    "model_description": "Mistral-Small-3.2 is a 24B-parameter instruction-following model with vision support, optimized for performance and efficiency. It covers ten languages, including English, French, German, Spanish, Italian, Portuguese, Chinese, Russian, Japanese, and Korean.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": 24,
"model_src": {
"huggingface": {
"quantizations": [
"none"
],
"model_id": "unsloth/Mistral-Small-3.2-24B-Instruct-2506-bnb-4bit"
}
}
}
],
"chat_template": "{%- set today = strftime_now('%Y-%m-%d') %}\n{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYour knowledge base was last updated on 2023-10-01. The current date is \" + today + \".\\n\\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\")\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n {{- block['text'] }}\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] + eos_token }}\n {%- else %}\n {{- message['content'][0]['text'] + eos_token }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Only user, system and assistant roles are supported!') }}\n {%- endif %}\n{%- endfor %}",
"stop_token_ids": [
2
],
"stop": [
"</s>"
]
}
]
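With this family registered, the model can be launched and queried through the client API. A minimal sketch, assuming a recent Xinference client and a locally running server; the endpoint and image URL are placeholders, not part of this PR:

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder endpoint
model_uid = client.launch_model(
    model_name="mistral-small-3.2-instruct",
    model_format="pytorch",
    model_size_in_billions=24,
)
model = client.get_model(model_uid)
response = model.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
    generate_config={"max_tokens": 256},
)
print(response["choices"][0]["message"]["content"])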
205 changes: 205 additions & 0 deletions xinference/model/llm/transformers/multimodal/mistral3.py
@@ -0,0 +1,205 @@
# Copyright 2022-2025 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, Iterator, List, Optional, Tuple

import torch

from .....model.utils import select_device
from .....types import PytorchModelConfig
from ...llm_family import LLMFamilyV2, LLMSpecV1, register_transformer
from ...utils import _decode_image
from ..core import register_non_default_model
from .core import PytorchMultiModalModel

logger = logging.getLogger(__name__)


@register_transformer
@register_non_default_model("mistral-small-3.2-instruct")
class MistralMultimodalModel(PytorchMultiModalModel):
def _sanitize_model_config(
self, pytorch_model_config: Optional[PytorchModelConfig]
) -> PytorchModelConfig:
pytorch_model_config = super()._sanitize_model_config(pytorch_model_config)
assert pytorch_model_config is not None
return pytorch_model_config

@classmethod
def match_json(
cls, model_family: "LLMFamilyV2", model_spec: "LLMSpecV1", quantization: str
) -> bool:
if model_spec.model_format not in ["pytorch", "gptq", "awq"]:
return False
llm_family = model_family.model_family or model_family.model_name
if "mistral-small-3.2-instruct" in llm_family.lower():
return True
return False

def decide_device(self):
device = self._pytorch_model_config.get("device", "cuda")
self._device = select_device(device)

def load_processor(self):
from transformers import AutoProcessor, AutoTokenizer

min_pixels = self._pytorch_model_config.get("min_pixels")
max_pixels = self._pytorch_model_config.get("max_pixels")
self._processor = AutoProcessor.from_pretrained(
self.model_path,
trust_remote_code=True,
min_pixels=min_pixels,
max_pixels=max_pixels,
)
self._tokenizer = AutoTokenizer.from_pretrained(
self.model_path, trust_remote_code=True, use_fast=False
)

    def load_multimodal_model(self):
        from transformers import BitsAndBytesConfig, Mistral3ForConditionalGeneration

        kwargs = {"device_map": self._device}
        kwargs = self.apply_bnb_quantization(kwargs)

        # Infer bitsandbytes quantization from the checkpoint name
        # (e.g. "...-bnb-4bit") when not already set by apply_bnb_quantization.
        if "4bit" in self.model_path:
            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
        elif "8bit" in self.model_path:
            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)

        self._model = Mistral3ForConditionalGeneration.from_pretrained(
            self.model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            **kwargs,
        ).eval()

    @staticmethod
    def _get_processed_msgs(
        messages: List[Dict],
    ) -> Tuple[List[Dict], List[str], List[Any]]:
        res = []
        texts = []
        images = []
        for message in messages:
            role = message["role"]
            content = message["content"]
            if isinstance(content, str):
                res.append(
                    {"role": role, "content": [{"type": "text", "text": content}]}
                )
                texts.append(content)
            else:
                # OpenAI-style content: a list of text and image_url blocks.
                text_parts = []
                image_urls = []
                for c in content:
                    c_type = c.get("type")
                    if c_type == "text":
                        text_parts.append(c["text"])
                    else:
                        assert (
                            c_type == "image_url"
                        ), "Please follow the image input of the OpenAI API."
                        image_urls.append(c["image_url"]["url"])
                if len(image_urls) > 1:
                    raise RuntimeError("Only one image per message is supported")
                # Decode images concurrently; URLs may require network fetches.
                image_futures = []
                with ThreadPoolExecutor() as executor:
                    for image_url in image_urls:
                        fut = executor.submit(_decode_image, image_url)
                        image_futures.append(fut)
                decoded_images = [fut.result() for fut in image_futures]
                assert len(decoded_images) <= 1
                text = " ".join(text_parts)
                if decoded_images:
                    res.append(
                        {
                            "role": role,
                            "content": [
                                {"type": "image", "image": decoded_images[0]},
                                {"type": "text", "text": text},
                            ],
                        }
                    )
                    images.append(decoded_images[0])
                else:
                    res.append(
                        {"role": role, "content": [{"type": "text", "text": text}]}
                    )
                texts.append(text)
        return res, texts, images

    @staticmethod
    def flatten_content(msg):
        # Collapse a structured content list into a single string so the
        # tokenizer's chat template can render it.
        if isinstance(msg["content"], list):
            parts = []
            for part in msg["content"]:
                if part["type"] == "image":
                    parts.append("<image>")  # or another placeholder token
                elif part["type"] == "text":
                    parts.append(part["text"])
            msg["content"] = "".join(parts)
        return msg

    def build_inputs_from_messages(self, messages: List[Dict], generate_config: Dict):
        rst, texts, images = self._get_processed_msgs(messages)
        flattened_messages = [self.flatten_content(m.copy()) for m in rst]
        inputs = self._tokenizer.apply_chat_template(
            conversation=flattened_messages,
            images=images,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
            return_dict=True,
        )
        inputs = inputs.to(self._device)
        return inputs

def build_generate_kwargs(self, generate_config: Dict) -> Dict[str, Any]:
return dict(
max_new_tokens=generate_config.get("max_tokens", 1000),
temperature=generate_config.get("temperature", 1),
eos_token_id=generate_config.get("eos_token_id", 2),
do_sample=generate_config.get("do_sample", True),
bos_token_id=generate_config.get("bos_token_id", 1),
)

def build_streaming_iter(
self,
messages: List[Dict],
generate_config: Dict,
) -> Tuple[Iterator, int]:
from threading import Thread

from transformers import TextIteratorStreamer

inputs = self.build_inputs_from_messages(messages, generate_config)
configs = self.build_generate_kwargs(generate_config)

tokenizer = self._tokenizer
streamer = TextIteratorStreamer(
tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
)

gen_kwargs = {"streamer": streamer, **inputs, **configs}
t = Thread(target=self._model.generate, kwargs=gen_kwargs)
t.start()
return streamer, len(inputs["input_ids"][0])
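For orientation, a hedged sketch of how _get_processed_msgs normalizes an OpenAI-style vision message and how the streamer returned by build_streaming_iter would be consumed; the message content and URL are illustrative, not part of this PR:

# Hypothetical input in the OpenAI vision format:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this picture."},
            {"type": "image_url", "image_url": {"url": "https://example.com/dog.png"}},
        ],
    }
]

# _get_processed_msgs(messages) returns roughly:
#   res    -> [{"role": "user", "content": [{"type": "image", "image": <PIL.Image>},
#                                           {"type": "text", "text": "Describe this picture."}]}]
#   texts  -> ["Describe this picture."]
#   images -> [<PIL.Image>]

# Consuming the streaming iterator (model is a loaded MistralMultimodalModel):
streamer, prompt_len = model.build_streaming_iter(messages, generate_config={})
for chunk in streamer:  # yields decoded text fragments as generation proceeds
    print(chunk, end="", flush=True)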
4 changes: 4 additions & 0 deletions xinference/model/llm/vllm/core.py
@@ -253,6 +253,10 @@ class VLLMGenerateConfig(TypedDict, total=False):
VLLM_SUPPORTED_CHAT_MODELS.append("gemma-3-1b-it")
VLLM_SUPPORTED_VISION_MODEL_LIST.append("gemma-3-it")

if VLLM_INSTALLED and vllm.__version__ >= "0.8.1":
VLLM_SUPPORTED_CHAT_MODELS.append("mistral-small-3.2-instruct")
VLLM_SUPPORTED_VISION_MODEL_LIST.append("mistral-small-3.2-instruct")

if VLLM_INSTALLED and vllm.__version__ >= "0.8.4":
VLLM_SUPPORTED_CHAT_MODELS.append("glm4-0414")

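With this registration, the same model name should also be servable on the vLLM backend when vllm >= 0.8.1 is installed. A hedged sketch, assuming the client's model_engine parameter selects the backend; the endpoint is a placeholder:

from xinference.client import Client

client = Client("http://localhost:9997")  # placeholder endpoint
# Assumes vllm >= 0.8.1 is installed in the server environment.
model_uid = client.launch_model(
    model_name="mistral-small-3.2-instruct",
    model_engine="vllm",  # route to the vLLM backend instead of transformers
    model_format="pytorch",
    model_size_in_billions=24,
)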