80 changes: 75 additions & 5 deletions sgr_deep_research/api/endpoints.py
@@ -1,5 +1,6 @@
import asyncio
import logging
from typing import Any, Dict, List, Union

from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
@@ -9,6 +10,7 @@
AgentListResponse,
AgentStateResponse,
ChatCompletionRequest,
ChatMessage,
ClarificationRequest,
HealthResponse,
)
@@ -75,10 +77,53 @@ async def get_available_models():
return {"data": models_data, "object": "list"}


def extract_user_content_from_messages(messages):
for message in reversed(messages):
def _prepare_message_content(message: ChatMessage) -> Union[str, List[Dict[str, Any]]]:
"""Prepare message content: merge content with images field if present.

Images are passed as-is (URLs/base64) without any processing.
Returns content in OpenAI-compatible format (str or list of parts).
"""
content = message.content

# If no images field, return content as-is
if not message.images:
return content

# Wrap images in OpenAI image_url format (pass URLs/base64 as-is)
image_parts = [{"type": "image_url", "image_url": {"url": img, "detail": "auto"}} for img in message.images]

# Merge with existing content
if isinstance(content, str):
# Convert string to parts format
parts = [{"type": "text", "text": content}] if content else []
return parts + image_parts
elif isinstance(content, list):
# Already in parts format, append images
return content + image_parts
else:
# Fallback: empty content, just images
return image_parts


def extract_user_content_from_messages(
messages: list[ChatMessage], first: bool = False
) -> Union[str, List[Dict[str, Any]]]:
"""Extract content from user message(s) with support for images.

Args:
messages: List of chat messages
first: If True, extract from first user message (for task initialization).
If False, extract from last user message (for clarifications).

Returns:
Processed content (text + images) in OpenAI-compatible format.
"""
message_iter = messages if first else reversed(messages)

for message in message_iter:
if message.role == "user":
return message.content
return _prepare_message_content(message)

raise ValueError("User message not found in messages")


@@ -89,7 +134,12 @@ async def provide_clarification(agent_id: str, request: ClarificationRequest):
if not agent:
raise HTTPException(status_code=404, detail="Agent not found")

logger.info(f"Providing clarification to agent {agent.id}: {request.clarifications[:100]}...")
clarifications_preview = (
request.clarifications[:100]
if isinstance(request.clarifications, str)
else f"{len(request.clarifications)} parts (multimodal)"
)
logger.info(f"Providing clarification to agent {agent.id}: {clarifications_preview}...")

await agent.provide_clarification(request.clarifications)
return StreamingResponse(
@@ -132,7 +182,22 @@ async def create_chat_completion(request: ChatCompletionRequest):
)

try:
task = extract_user_content_from_messages(request.messages)
# Extract task text from first user message (for logging/prompts only)
# Full conversation context is passed via agent.conversation
task_content = extract_user_content_from_messages(request.messages, first=True)

# Extract text from content for agent.task field (string only)
if isinstance(task_content, str):
task = task_content
else:
text_parts = [p.get("text") for p in task_content if isinstance(p, dict) and p.get("type") == "text"]
task = " ".join(filter(None, text_parts)) or "Image-only request"

# Process all messages: merge images field into content parts
processed_messages = []
for msg in request.messages:
processed_content = _prepare_message_content(msg)
processed_messages.append({"role": msg.role, "content": processed_content})

agent_def = next(filter(lambda ad: ad.name == request.model, AgentFactory.get_definitions_list()), None)
if not agent_def:
@@ -144,6 +209,11 @@
agent = await AgentFactory.create(agent_def, task)
logger.info(f"Created agent '{request.model}' for task: {task[:100]}...")

# Add all processed messages to agent conversation (excluding system, which is added by _prepare_context)
for msg in processed_messages:
if msg["role"] != "system":
agent.conversation.append(msg)

agents_storage[agent.id] = agent
_ = asyncio.create_task(agent.execute())
return StreamingResponse(
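
A minimal usage sketch for the endpoint changes above, assuming the server exposes an OpenAI-compatible route at /v1/chat/completions on localhost:8000 (neither the route decorator nor the port appears in this diff, so both are assumptions). The new `images` convenience field is converted to image_url parts by _prepare_message_content before the conversation reaches the agent:

import requests  # any HTTP client works; requests is used here for brevity

payload = {
    "model": "sgr-agent",  # placeholder: must match an AgentFactory definition name
    "messages": [
        {
            "role": "user",
            "content": "What architecture does this diagram describe?",
            # Convenience field added in this PR: URLs or base64 data URIs, passed as-is.
            "images": ["https://example.com/architecture.png"],
        }
    ],
}

# The endpoint returns a StreamingResponse, so read it line by line.
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, stream=True)
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))
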
21 changes: 17 additions & 4 deletions sgr_deep_research/api/models.py
@@ -1,16 +1,27 @@
"""OpenAI-compatible models for API endpoints."""

from datetime import datetime
from typing import Any, Dict, List, Literal
from typing import Any, Dict, List, Literal, Union

from pydantic import BaseModel, Field


class ChatMessage(BaseModel):
"""Chat message."""
"""Chat message compatible with OpenAI ChatCompletionMessageParam.

Supports multimodal content via content parts (text + image_url).
The 'images' field is a convenience shortcut that gets converted to
image_url parts.
"""

role: Literal["system", "user", "assistant", "tool"] = Field(default="user", description="Sender role")
content: str = Field(description="Message content")
content: Union[str, List[Dict[str, Any]]] = Field(
description="Message content: text string or OpenAI content parts (text/image_url)"
)
images: List[str] | None = Field(
default=None,
description="Optional convenience field: image paths/URLs/base64 (converted to image_url parts)",
)


class ChatCompletionRequest(BaseModel):
@@ -80,4 +91,6 @@ class AgentListResponse(BaseModel):
class ClarificationRequest(BaseModel):
"""Simple request for providing clarifications to an agent."""

clarifications: str = Field(description="Clarification text to provide to the agent")
clarifications: Union[str, List[Dict[str, Any]]] = Field(
description="Clarification content: text string or OpenAI content parts (text/image_url)"
)
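
For reference, a short illustrative sketch of the content forms the updated models now accept (field names come from the diff above; the values are made up):

from sgr_deep_research.api.models import ChatMessage, ClarificationRequest

# Plain text plus the convenience `images` field (URLs or base64 data URIs,
# converted server-side to image_url parts).
msg_short = ChatMessage(
    role="user",
    content="Describe this screenshot",
    images=["https://example.com/shot.png"],
)

# Explicit OpenAI-style content parts; no `images` field needed.
msg_parts = ChatMessage(
    role="user",
    content=[
        {"type": "text", "text": "Describe this screenshot"},
        {"type": "image_url", "image_url": {"url": "https://example.com/shot.png", "detail": "auto"}},
    ],
)

# Clarifications may likewise be a plain string or a list of parts.
clarification = ClarificationRequest(
    clarifications=[
        {"type": "text", "text": "The chart from the second slide"},
        {"type": "image_url", "image_url": {"url": "https://example.com/slide2.png", "detail": "auto"}},
    ],
)
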
56 changes: 48 additions & 8 deletions sgr_deep_research/core/base_agent.py
@@ -4,7 +4,7 @@
import traceback
import uuid
from datetime import datetime
from typing import Type
from typing import Any, Dict, List, Type, Union

from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionFunctionToolParam
Expand Down Expand Up @@ -56,15 +56,38 @@ def __init__(
self.logger = logging.getLogger(f"sgr_deep_research.agents.{self.id}")
self.log = []

async def provide_clarification(self, clarifications: str):
"""Receive clarification from an external source (e.g. user input)"""
self.conversation.append(
{"role": "user", "content": PromptLoader.get_clarification_template(clarifications, self.config.prompts)}
)
async def provide_clarification(self, clarifications: Union[str, List[Dict[str, Any]]]):
"""Receive clarification from an external source (e.g. user input).

Supports both text-only clarifications (str) and multimodal
content (list of parts with images).
"""
if isinstance(clarifications, str):
# Text-only clarification: use template as before
content = PromptLoader.get_clarification_template(clarifications, self.config.prompts)
log_content = clarifications[:2000]
else:
# Multimodal content (list of parts): use directly, but wrap text parts in template if present
text_parts = [p.get("text") for p in clarifications if isinstance(p, dict) and p.get("type") == "text"]
image_parts = [p for p in clarifications if isinstance(p, dict) and p.get("type") == "image_url"]

if text_parts:
# Combine text parts and wrap in template
combined_text = " ".join(filter(None, text_parts))
template_text = PromptLoader.get_clarification_template(combined_text, self.config.prompts)
# Create parts: template text + images
content = [{"type": "text", "text": template_text}] + image_parts
log_content = combined_text[:2000]
else:
# Images only: use as-is (no template wrapping for image-only)
content = clarifications
log_content = f"{len(image_parts)} image(s)"

self.conversation.append({"role": "user", "content": content})
self._context.clarifications_used += 1
self._context.clarification_received.set()
self._context.state = AgentStatesEnum.RESEARCHING
self.logger.info(f"✅ Clarification received: {clarifications[:2000]}...")
self.logger.info(f"✅ Clarification received: {log_content}...")

def _log_reasoning(self, result: ReasoningTool) -> None:
next_step = result.remaining_steps[0] if result.remaining_steps else "Completing"
@@ -131,16 +154,33 @@ def _save_agent_log(self):

json.dump(agent_log, open(filepath, "w", encoding="utf-8"), indent=2, ensure_ascii=False)

@staticmethod
def _normalize_messages(messages: list[dict]) -> list[dict]:
"""Ensure messages use OpenAI content parts when needed (supports
images)."""
normalized = []
for msg in messages:
content = msg.get("content")
if isinstance(content, str):
msg = {**msg, "content": [{"type": "text", "text": content}]}
elif isinstance(content, list):
msg = {**msg, "content": content}
elif content is None:
msg = {**msg, "content": [{"type": "text", "text": ""}]}
normalized.append(msg)
return normalized

async def _prepare_context(self) -> list[dict]:
"""Prepare conversation context with system prompt."""
return [
messages = [
{"role": "system", "content": PromptLoader.get_system_prompt(self.toolkit, self.config.prompts)},
{
"role": "user",
"content": PromptLoader.get_initial_user_request(self.task, self.config.prompts),
},
*self.conversation,
]
return self._normalize_messages(messages)

async def _prepare_tools(self) -> list[ChatCompletionFunctionToolParam]:
"""Prepare available tools for the current agent state and progress."""
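
Finally, a rough sketch of what the new _normalize_messages helper yields for a mixed conversation, based on the logic above (the class name BaseAgent is an assumption — this diff only shows the method bodies, not the class definition):

from sgr_deep_research.core.base_agent import BaseAgent  # assumed class name; module path from the diff header

mixed = [
    {"role": "user", "content": "Summarize the attached chart"},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Here it is"},
            {"type": "image_url", "image_url": {"url": "https://example.com/chart.png", "detail": "auto"}},
        ],
    },
    {"role": "assistant", "content": None},
]

normalized = BaseAgent._normalize_messages(mixed)
# Per the implementation above:
# - the plain string becomes [{"type": "text", "text": "Summarize the attached chart"}]
# - the list of parts (text + image_url) passes through unchanged
# - the None content becomes [{"type": "text", "text": ""}]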