Add prefill to completions
KaQuMiQ committed Aug 22, 2024
1 parent 5717680 commit b55ff10
Showing 17 changed files with 149 additions and 33 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "draive"
description = "Framework designed to simplify and accelerate the development of LLM-based applications."
version = "0.26.0"
version = "0.27.0"
readme = "README.md"
maintainers = [
{ name = "Kacper Kaliński", email = "[email protected]" },
22 changes: 20 additions & 2 deletions src/draive/anthropic/lmm.py
@@ -332,7 +332,21 @@ async def _completion( # noqa: PLR0913, PLR0912, C901
),
)

message_parts: list[TextBlock] = []
message_parts: list[TextBlock]
match messages[-1]:
case {"role": "assistant", "content": str() as content_text}:
message_parts = [TextBlock(type="text", text=content_text)]

case {"role": "assistant", "content": content_parts}:
message_parts = [ # currently supporting only text prefills
TextBlock(type="text", text=part.text)
for part in content_parts
if isinstance(part, TextBlock)
]

case _:
message_parts = []

tool_calls: list[ToolUseBlock] = []
for part in completion.content:
match part:
Expand Down Expand Up @@ -376,8 +390,12 @@ async def _completion( # noqa: PLR0913, PLR0912, C901

else:
ctx.record(ResultTrace.of(message_parts))

return LMMCompletion.of(
MultimodalContent.of(*[TextContent(text=part.text) for part in message_parts])
MultimodalContent.of(
*[TextContent(text=part.text) for part in message_parts],
merge_text=True,
)
)

case other:
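The Anthropic change seeds `message_parts` from a trailing assistant message because Claude treats a final assistant turn as a prefill: the model continues from that text but does not repeat it in its response, so the adapter has to stitch it back onto the result. A minimal standalone sketch of the same pattern, assuming the official `anthropic` Python SDK, an `ANTHROPIC_API_KEY` in the environment, and an illustrative model id:

```python
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment

prefill = "["  # the trailing assistant turn acts as a prefill
response = client.messages.create(
    model="claude-3-5-sonnet-20240620",  # illustrative model id
    max_tokens=128,
    messages=[
        {"role": "user", "content": "List three prime numbers as a JSON array."},
        {"role": "assistant", "content": prefill},
    ],
)

# The completion continues from the prefill without echoing it, so prepend it,
# just as the commit prepends the prefill TextBlocks to message_parts.
completion_text = prefill + "".join(
    block.text for block in response.content if block.type == "text"
)
print(completion_text)
```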
4 changes: 3 additions & 1 deletion src/draive/choice/call.py
@@ -13,11 +13,12 @@
]


async def choice_completion(
async def choice_completion( # noqa: PLR0913
*,
instruction: Instruction | str,
options: Iterable[ChoiceOption | Multimodal],
input: Multimodal, # noqa: A002
prefill: str | None = None,
tools: Toolbox | Sequence[AnyTool] | None = None,
examples: Iterable[tuple[Multimodal, ChoiceOption]] | None = None,
**extra: Any,
@@ -40,6 +41,7 @@ async def choice_completion(
for option in options
],
input=input,
prefill=prefill,
toolbox=toolbox,
examples=examples,
**extra,
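For callers, the new `prefill` argument simply threads through `choice_completion` into the LMM-backed implementation. A hypothetical call based only on the signature above (import path and plain-string options are assumptions; the draive scope and model-provider setup that must already be in place are omitted):

```python
from draive import choice_completion  # import path assumed


async def pick_resolution() -> None:
    # Assumes an LMM provider is already configured in the current draive scope.
    selected = await choice_completion(
        instruction="Select the option that best matches the customer's complaint.",
        options=["refund", "replacement", "store_credit"],
        input="My package arrived damaged and I want my money back.",
        prefill="<CHOICE>",  # steers the model straight into the expected answer tag
    )
    print(selected)
```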
3 changes: 2 additions & 1 deletion src/draive/choice/completion.py
@@ -13,12 +13,13 @@

@runtime_checkable
class ChoiceCompletion(Protocol):
async def __call__(
async def __call__( # noqa: PLR0913
self,
*,
instruction: Instruction | str,
options: Sequence[ChoiceOption],
input: Multimodal, # noqa: A002
prefill: str | None,
toolbox: Toolbox,
examples: Iterable[tuple[Multimodal, ChoiceOption]] | None,
**extra: Any,
19 changes: 12 additions & 7 deletions src/draive/choice/lmm.py
@@ -23,19 +23,19 @@
]


async def lmm_choice_completion( # noqa: C901
async def lmm_choice_completion( # noqa: C901, PLR0912, PLR0913
*,
instruction: Instruction | str,
options: Sequence[ChoiceOption],
input: Multimodal, # noqa: A002
prefill: str | None,
toolbox: Toolbox,
examples: Iterable[tuple[Multimodal, ChoiceOption]] | None = None,
examples: Iterable[tuple[Multimodal, ChoiceOption]] | None,
**extra: Any,
) -> ChoiceOption:
with ctx.nested(
"lmm_choice_completion",
):
assert "select" in str(instruction).lower(), "Instruction have to contain a word 'select'" # nosec: B101
assert options, "Choice options cannot be empty" # nosec: B101
assert all( # nosec: B101
example[1] in options for example in examples or []
Expand Down Expand Up @@ -78,6 +78,9 @@ async def lmm_choice_completion( # noqa: C901
),
]

if prefill := prefill:
context.append(LMMCompletion.of(prefill))

recursion_level: int = 0
while recursion_level <= toolbox.recursion_limit:
match await lmm_invocation(
@@ -91,7 +94,7 @@ async def lmm_choice_completion( # noqa: C901
):
case LMMCompletion() as completion:
ctx.log_debug("Received choice results")
if selection := xml_tag("SELECTION", source=completion.content.as_string()):
if selection := xml_tag("CHOICE", source=completion.content.as_string()):
if option := options_map.get(selection):
return option

@@ -105,7 +108,7 @@ async def lmm_choice_completion( # noqa: C901
response.content for response in responses if response.direct
]:
if selection := xml_tag(
"SELECTION",
"CHOICE",
source=MultimodalContent.of(*direct_content).as_string(),
):
if option := options_map.get(selection):
Expand Down Expand Up @@ -161,6 +164,8 @@ def _format_example(


INSTRUCTION_EXTENSION: str = """\
Selection HAVE to contain an identifier of a chosen option inside a `SELECTION` \
xml tag within the result i.e. `<SELECTION>identifier</SELECTION>`.
<FORMAT>
Place identifier of the final choice inside a <CHOICE> XML tag within the result, \
like this: `<CHOICE>identifier</CHOICE>`.
</FORMAT>
"""
29 changes: 27 additions & 2 deletions src/draive/gemini/lmm.py
@@ -39,6 +39,7 @@
LMMToolRequest,
LMMToolRequests,
LMMToolResponse,
MultimodalContent,
MultimodalContentElement,
TextContent,
VideoBase64Content,
@@ -359,6 +360,24 @@ async def _generate( # noqa: PLR0913, C901, PLR0912, PLR0915

converted_tools.append(tool_function)

prefill: str = ""
match messages[-1]:
case {"role": "model", "parts": content_parts}:
if config.response_format == "application/json":
del messages[-1] # for json mode ignore prefill

else:
for part in content_parts:
match part: # currently supporting only text prefills
case {"text": str() as text}:
prefill += text

case _:
continue

case _:
pass

match tool_selection:
case "auto":
result = await client.generate(
Expand Down Expand Up @@ -445,7 +464,8 @@ async def _generate( # noqa: PLR0913, C901, PLR0912, PLR0915

message_parts: list[
GeminiTextMessageContent | GeminiDataReferenceMessageContent | GeminiDataMessageContent
] = []
] = [GeminiTextMessageContent(text=prefill)] if prefill else []

tool_calls: list[GeminiFunctionCallMessageContent] = []
for part in result_message.content:
match part:
Expand Down Expand Up @@ -483,7 +503,12 @@ async def _generate( # noqa: PLR0913, C901, PLR0912, PLR0915

elif message_parts:
ctx.record(ResultTrace.of(message_parts))
return LMMCompletion.of(*[_convert_content_part(part) for part in message_parts])
return LMMCompletion.of(
MultimodalContent.of(
*[_convert_content_part(part) for part in message_parts],
merge_text=True,
)
)

else:
raise GeminiException("Invalid Gemini completion", result)
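The Gemini adapter is not handed a dedicated prefill flag here; instead it recovers prefill text from a trailing `model`-role message, drops that message entirely in JSON mode, and later re-injects the text as the first part of the completion (merged with the generated text). A simplified, dependency-free sketch of that selection logic, using dicts shaped like the messages in the diff (names are illustrative):

```python
def extract_prefill(messages: list[dict], response_format: str | None) -> str:
    """Return prefill text from a trailing model-role message; drop it in JSON mode."""
    if not messages or messages[-1].get("role") != "model":
        return ""

    if response_format == "application/json":
        del messages[-1]  # JSON mode ignores the prefill entirely
        return ""

    return "".join(
        part["text"]
        for part in messages[-1].get("parts", [])
        if isinstance(part.get("text"), str)  # only text prefills are supported
    )


history = [
    {"role": "user", "parts": [{"text": "Summarize the quarterly report."}]},
    {"role": "model", "parts": [{"text": "Summary: "}]},
]
print(extract_prefill(history, response_format=None))  # -> "Summary: "
```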
9 changes: 5 additions & 4 deletions src/draive/generation/model/lmm.py
@@ -89,6 +89,7 @@ async def lmm_generate_model[Generated: DataModel]( # noqa: PLR0913, C901, PLR0
]
],
LMMInput.of(input),
LMMCompletion.of("{"), # prefill with json opening
]

recursion_level: int = 0
@@ -146,10 +147,10 @@ async def lmm_generate_model[Generated: DataModel]( # noqa: PLR0913, C901, PLR0

DEFAULT_INSTRUCTION_EXTENSION: str = """\
<FORMAT>
The result have to be a JSON object conforming to the following schema:
```
Provide the result using a single raw valid JSON object that adheres strictly to the given \
SCHEMA without any comments, formatting, or additional elements.
<SCHEMA>
{schema}
```
Provide ONLY a single, raw, valid JSON without any comments, formatting or additional elements.
</SCHEMA>
</FORMAT>
"""
2 changes: 2 additions & 0 deletions src/draive/generation/text/call.py
@@ -16,13 +16,15 @@ async def generate_text(
*,
instruction: Instruction | str,
input: MultimodalContent | MultimodalContentConvertible, # noqa: A002
prefill: str | None = None,
tools: Toolbox | Sequence[AnyTool] | None = None,
examples: Iterable[tuple[MultimodalContent | MultimodalContentConvertible, str]] | None = None,
**extra: Any,
) -> str:
return await ctx.state(TextGeneration).generate(
instruction=instruction,
input=input,
prefill=prefill,
tools=tools,
examples=examples,
**extra,
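On the text-generation side the new argument flows unchanged from `generate_text` down to the LMM context. A hypothetical call, again assuming the import path and that a provider is already configured in the surrounding draive scope:

```python
from draive import generate_text  # import path assumed


async def continue_haiku() -> str:
    # Assumes an LMM provider is already configured in the current draive scope.
    return await generate_text(
        instruction="Write a haiku about autumn.",
        input="Topic: falling leaves",
        prefill="Crimson leaves drift down",  # the model continues from this first line
    )
```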
1 change: 1 addition & 0 deletions src/draive/generation/text/generator.py
@@ -17,6 +17,7 @@ async def __call__(
*,
instruction: Instruction | str,
input: MultimodalContent | MultimodalContentConvertible, # noqa: A002
prefill: str | None = None,
tools: Toolbox | Sequence[AnyTool] | None = None,
examples: Iterable[tuple[MultimodalContent | MultimodalContentConvertible, str]]
| None = None,
6 changes: 6 additions & 0 deletions src/draive/generation/text/lmm.py
@@ -23,6 +23,7 @@ async def lmm_generate_text(
*,
instruction: Instruction | str,
input: MultimodalContent | MultimodalContentConvertible, # noqa: A002
prefill: str | None = None,
tools: Toolbox | Sequence[AnyTool] | None = None,
examples: Iterable[tuple[MultimodalContent | MultimodalContentConvertible, str]] | None = None,
**extra: Any,
@@ -51,6 +52,11 @@ async def lmm_generate_text(
LMMInput.of(input),
]

if prefill := prefill:
context.append(
LMMCompletion.of(prefill),
)

recursion_level: int = 0
while recursion_level <= toolbox.recursion_limit:
match await lmm_invocation(
7 changes: 7 additions & 0 deletions src/draive/mistral/client.py
@@ -82,6 +82,13 @@ async def chat_completion(
raise NotImplementedError("Mistral streaming is not supported yet")

else:
if messages[-1]["role"] == "assistant":
if config.response_format == {"type": "json_object"}:
del messages[-1] # for json mode ignore prefill

else:
messages[-1]["prefix"] = True # add prefill parameter indicator

return await self._create_chat_completion(
messages=messages,
model=config.model,
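Mistral's prefill goes through a `prefix` flag on the trailing assistant message, which is exactly what the client change above toggles (and what it drops when JSON mode is active). A sketch of the resulting request payload, built from plain dicts shaped like the `ChatMessage` TypedDict added later in this commit (prompt text is illustrative):

```python
# Plain-dict messages mirroring the ChatMessage TypedDict from this commit.
messages = [
    {"role": "user", "content": "Give me a one-line tagline for a coffee shop."},
    # Trailing assistant message: "prefix": True marks it as a prefill the model
    # should continue from, not as a turn it already produced.
    {"role": "assistant", "content": "Tagline: ", "prefix": True},
]

# As in the client change, a JSON-mode request would drop the prefill instead:
# if response_format == {"type": "json_object"}: del messages[-1]
```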
1 change: 1 addition & 0 deletions src/draive/mistral/lmm.py
@@ -200,6 +200,7 @@ async def _chat_completion(
),
tool_choice="auto",
)

case "none":
completion = await client.chat_completion(
config=config,
1 change: 1 addition & 0 deletions src/draive/mistral/models.py
@@ -55,6 +55,7 @@ class ChatMessage(TypedDict, total=False):
content: Required[str | list[str]]
name: NotRequired[str]
tool_calls: NotRequired[list[ChatToolCallRequest]]
prefix: NotRequired[bool]


class ChatFunctionCallResponse(DataModel):
18 changes: 11 additions & 7 deletions src/draive/ollama/lmm.py
@@ -7,7 +7,6 @@
from draive.metrics.tokens import TokenUsage
from draive.ollama.client import OllamaClient
from draive.ollama.config import OllamaChatConfig
from draive.ollama.errors import OllamaException
from draive.ollama.models import ChatCompletionResponse, ChatMessage
from draive.scope import ctx
from draive.types import (
@@ -162,6 +161,14 @@ async def _chat_completion(
config: OllamaChatConfig,
messages: list[ChatMessage],
) -> LMMOutput:
prefill: str = ""
if messages[-1].role == "assistant":
if config.response_format == "json":
del messages[-1] # for json mode ignore prefill

else:
prefill = messages[-1].content

completion: ChatCompletionResponse = await client.chat_completion(
config=config,
messages=messages,
@@ -175,12 +182,9 @@ async def _chat_completion(
),
)

if message := completion.message.content:
ctx.record(ResultTrace.of(message))
return LMMCompletion.of(message)

else:
raise OllamaException("Invalid Ollama completion", completion)
completion_message: str = prefill + completion.message.content
ctx.record(ResultTrace.of(completion_message))
return LMMCompletion.of(completion_message)


async def _chat_completion_stream(