80 changes: 75 additions & 5 deletions sgr_deep_research/api/endpoints.py
@@ -1,5 +1,6 @@
import asyncio
import logging
from typing import Any, Dict, List, Union

from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
@@ -9,6 +10,7 @@
AgentListResponse,
AgentStateResponse,
ChatCompletionRequest,
ChatMessage,
ClarificationRequest,
HealthResponse,
)
@@ -75,10 +77,53 @@ async def get_available_models():
return {"data": models_data, "object": "list"}


def extract_user_content_from_messages(messages):
for message in reversed(messages):
def _prepare_message_content(message: ChatMessage) -> Union[str, List[Dict[str, Any]]]:
"""Prepare message content: merge content with images field if present.

Images are passed as-is (URLs/base64) without any processing.
Returns content in OpenAI-compatible format (str or list of parts).
"""
content = message.content

# If no images field, return content as-is
if not message.images:
return content

# Wrap images in OpenAI image_url format (pass URLs/base64 as-is)
image_parts = [{"type": "image_url", "image_url": {"url": img, "detail": "auto"}} for img in message.images]

# Merge with existing content
if isinstance(content, str):
# Convert string to parts format
parts = [{"type": "text", "text": content}] if content else []
return parts + image_parts
elif isinstance(content, list):
# Already in parts format, append images
return content + image_parts
else:
# Fallback: empty content, just images
return image_parts


def extract_user_content_from_messages(
messages: list[ChatMessage], first: bool = False
) -> Union[str, List[Dict[str, Any]]]:
"""Extract content from user message(s) with support for images.

Args:
messages: List of chat messages
first: If True, extract from first user message (for task initialization).
If False, extract from last user message (for clarifications).

Returns:
Processed content (text + images) in OpenAI-compatible format.
"""
message_iter = messages if first else reversed(messages)

for message in message_iter:
if message.role == "user":
return message.content
return _prepare_message_content(message)

raise ValueError("User message not found in messages")


@@ -89,7 +134,12 @@ async def provide_clarification(agent_id: str, request: ClarificationRequest):
if not agent:
raise HTTPException(status_code=404, detail="Agent not found")

logger.info(f"Providing clarification to agent {agent.id}: {request.clarifications[:100]}...")
clarifications_preview = (
request.clarifications[:100]
if isinstance(request.clarifications, str)
else f"{len(request.clarifications)} parts (multimodal)"
)
logger.info(f"Providing clarification to agent {agent.id}: {clarifications_preview}...")

await agent.provide_clarification(request.clarifications)
return StreamingResponse(
@@ -132,7 +182,22 @@ async def create_chat_completion(request: ChatCompletionRequest):
)

try:
task = extract_user_content_from_messages(request.messages)
# Extract task text from first user message (for logging/prompts only)
# Full conversation context is passed via agent.conversation
task_content = extract_user_content_from_messages(request.messages, first=True)

# Extract text from content for agent.task field (string only)
if isinstance(task_content, str):
task = task_content
else:
text_parts = [p.get("text") for p in task_content if isinstance(p, dict) and p.get("type") == "text"]
task = " ".join(filter(None, text_parts)) or "Image-only request"

# Process all messages: merge images field into content parts
processed_messages = []
for msg in request.messages:
processed_content = _prepare_message_content(msg)
processed_messages.append({"role": msg.role, "content": processed_content})

agent_def = next(filter(lambda ad: ad.name == request.model, AgentFactory.get_definitions_list()), None)
if not agent_def:
@@ -144,6 +209,11 @@
agent = await AgentFactory.create(agent_def, task)
logger.info(f"Created agent '{request.model}' for task: {task[:100]}...")

# Add all processed messages to agent conversation (excluding system, which is added by _prepare_context)
for msg in processed_messages:
if msg["role"] != "system":
agent.conversation.append(msg)

agents_storage[agent.id] = agent
_ = asyncio.create_task(agent.execute())
return StreamingResponse(
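
A minimal usage sketch for the endpoint changes above, assuming the server exposes an OpenAI-compatible route at /v1/chat/completions on localhost:8000 (neither the route decorator nor the port appears in this diff, so both are assumptions). The new `images` convenience field is converted to image_url parts by _prepare_message_content before the conversation reaches the agent:

import requests  # any HTTP client works; requests is used here for brevity

payload = {
    "model": "sgr-agent",  # placeholder: must match an AgentFactory definition name
    "messages": [
        {
            "role": "user",
            "content": "What architecture does this diagram describe?",
            # Convenience field added in this PR: URLs or base64 data URIs, passed as-is.
            "images": ["https://example.com/architecture.png"],
        }
    ],
}

# The endpoint returns a StreamingResponse, so read it line by line.
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, stream=True)
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))
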
21 changes: 17 additions & 4 deletions sgr_deep_research/api/models.py
@@ -1,16 +1,27 @@
"""OpenAI-compatible models for API endpoints."""

from datetime import datetime
from typing import Any, Dict, List, Literal
from typing import Any, Dict, List, Literal, Union

from pydantic import BaseModel, Field


class ChatMessage(BaseModel):
"""Chat message."""
"""Chat message compatible with OpenAI ChatCompletionMessageParam.

Supports multimodal content via content parts (text + image_url).
The 'images' field is a convenience shortcut that gets converted to
image_url parts.
"""

role: Literal["system", "user", "assistant", "tool"] = Field(default="user", description="Sender role")
content: str = Field(description="Message content")
content: Union[str, List[Dict[str, Any]]] = Field(
description="Message content: text string or OpenAI content parts (text/image_url)"
)
images: List[str] | None = Field(
default=None,
description="Optional convenience field: image paths/URLs/base64 (converted to image_url parts)",
)


class ChatCompletionRequest(BaseModel):
@@ -80,4 +91,6 @@ class AgentListResponse(BaseModel):
class ClarificationRequest(BaseModel):
"""Simple request for providing clarifications to an agent."""

clarifications: str = Field(description="Clarification text to provide to the agent")
clarifications: Union[str, List[Dict[str, Any]]] = Field(
description="Clarification content: text string or OpenAI content parts (text/image_url)"
)
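
For reference, a short illustrative sketch of the content forms the updated models now accept (field names come from the diff above; the values are made up):

from sgr_deep_research.api.models import ChatMessage, ClarificationRequest

# Plain text plus the convenience `images` field (URLs or base64 data URIs,
# converted server-side to image_url parts).
msg_short = ChatMessage(
    role="user",
    content="Describe this screenshot",
    images=["https://example.com/shot.png"],
)

# Explicit OpenAI-style content parts; no `images` field needed.
msg_parts = ChatMessage(
    role="user",
    content=[
        {"type": "text", "text": "Describe this screenshot"},
        {"type": "image_url", "image_url": {"url": "https://example.com/shot.png", "detail": "auto"}},
    ],
)

# Clarifications may likewise be a plain string or a list of parts.
clarification = ClarificationRequest(
    clarifications=[
        {"type": "text", "text": "The chart from the second slide"},
        {"type": "image_url", "image_url": {"url": "https://example.com/slide2.png", "detail": "auto"}},
    ],
)
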
56 changes: 48 additions & 8 deletions sgr_deep_research/core/base_agent.py
@@ -4,7 +4,7 @@
import traceback
import uuid
from datetime import datetime
from typing import Type
from typing import Any, Dict, List, Type, Union

from openai import AsyncOpenAI
from openai.types.chat import ChatCompletionFunctionToolParam
Expand Down Expand Up @@ -56,15 +56,38 @@ def __init__(
self.logger = logging.getLogger(f"sgr_deep_research.agents.{self.id}")
self.log = []

async def provide_clarification(self, clarifications: str):
"""Receive clarification from an external source (e.g. user input)"""
self.conversation.append(
{"role": "user", "content": PromptLoader.get_clarification_template(clarifications, self.config.prompts)}
)
async def provide_clarification(self, clarifications: Union[str, List[Dict[str, Any]]]):
"""Receive clarification from an external source (e.g. user input).

Supports both text-only clarifications (str) and multimodal
content (list of parts with images).
"""
if isinstance(clarifications, str):
# Text-only clarification: use template as before
content = PromptLoader.get_clarification_template(clarifications, self.config.prompts)
log_content = clarifications[:2000]
else:
# Multimodal content (list of parts): use directly, but wrap text parts in template if present
text_parts = [p.get("text") for p in clarifications if isinstance(p, dict) and p.get("type") == "text"]
image_parts = [p for p in clarifications if isinstance(p, dict) and p.get("type") == "image_url"]

if text_parts:
# Combine text parts and wrap in template
combined_text = " ".join(filter(None, text_parts))
template_text = PromptLoader.get_clarification_template(combined_text, self.config.prompts)
# Create parts: template text + images
content = [{"type": "text", "text": template_text}] + image_parts
log_content = combined_text[:2000]
else:
# Images only: use as-is (no template wrapping for image-only)
content = clarifications
log_content = f"{len(image_parts)} image(s)"

self.conversation.append({"role": "user", "content": content})
self._context.clarifications_used += 1
self._context.clarification_received.set()
self._context.state = AgentStatesEnum.RESEARCHING
self.logger.info(f"✅ Clarification received: {clarifications[:2000]}...")
self.logger.info(f"✅ Clarification received: {log_content}...")

def _log_reasoning(self, result: ReasoningTool) -> None:
next_step = result.remaining_steps[0] if result.remaining_steps else "Completing"
@@ -131,16 +154,33 @@ def _save_agent_log(self):

json.dump(agent_log, open(filepath, "w", encoding="utf-8"), indent=2, ensure_ascii=False)

@staticmethod
def _normalize_messages(messages: list[dict]) -> list[dict]:
"""Ensure messages use OpenAI content parts when needed (supports
images)."""
normalized = []
for msg in messages:
content = msg.get("content")
if isinstance(content, str):
msg = {**msg, "content": [{"type": "text", "text": content}]}
elif isinstance(content, list):
msg = {**msg, "content": content}
elif content is None:
msg = {**msg, "content": [{"type": "text", "text": ""}]}
normalized.append(msg)
return normalized

async def _prepare_context(self) -> list[dict]:
"""Prepare conversation context with system prompt."""
return [
messages = [
{"role": "system", "content": PromptLoader.get_system_prompt(self.toolkit, self.config.prompts)},
{
"role": "user",
"content": PromptLoader.get_initial_user_request(self.task, self.config.prompts),
},
*self.conversation,
]
return self._normalize_messages(messages)

async def _prepare_tools(self) -> list[ChatCompletionFunctionToolParam]:
"""Prepare available tools for the current agent state and progress."""
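
Finally, a rough sketch of what the new _normalize_messages helper yields for a mixed conversation, based on the logic above (the class name BaseAgent is an assumption — this diff only shows the method bodies, not the class definition):

from sgr_deep_research.core.base_agent import BaseAgent  # assumed class name; module path from the diff header

mixed = [
    {"role": "user", "content": "Summarize the attached chart"},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Here it is"},
            {"type": "image_url", "image_url": {"url": "https://example.com/chart.png", "detail": "auto"}},
        ],
    },
    {"role": "assistant", "content": None},
]

normalized = BaseAgent._normalize_messages(mixed)
# Per the implementation above:
# - the plain string becomes [{"type": "text", "text": "Summarize the attached chart"}]
# - the list of parts (text + image_url) passes through unchanged
# - the None content becomes [{"type": "text", "text": ""}]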