From 571768050dd1fa4b9f3ec3ac09b80c0d09fe2dd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20Kali=C5=84ski?=
<47140412+KaQuMiQ@users.noreply.github.com>
Date: Mon, 19 Aug 2024 13:31:14 +0200
Subject: [PATCH] Refine evaluation interfaces
---
Makefile | 2 +-
constraints | 18 ++-
pyproject.toml | 2 +-
src/draive/__init__.py | 6 +-
src/draive/evaluation/__init__.py | 13 +-
src/draive/evaluation/evaluator.py | 126 ++++++++++++----
src/draive/evaluation/scenario.py | 84 +++++++++--
src/draive/evaluation/score.py | 17 ---
src/draive/evaluation/suite.py | 90 +++++++----
src/draive/evaluators/__init__.py | 40 ++---
src/draive/evaluators/coherence.py | 90 +++++++++++
src/draive/evaluators/conciseness.py | 88 +++++++++++
src/draive/evaluators/consistency.py | 90 +++++++++++
src/draive/evaluators/coverage.py | 88 +++++++++++
src/draive/evaluators/fluency.py | 74 +++++++++
.../{text_keywords.py => keywords.py} | 15 +-
src/draive/evaluators/readability.py | 80 ++++++++++
src/draive/evaluators/relevance.py | 91 ++++++++++++
src/draive/evaluators/score.py | 8 +-
src/draive/evaluators/similarity.py | 133 +++++++++++++++++
src/draive/evaluators/text_coherence.py | 134 -----------------
src/draive/evaluators/text_conciseness.py | 140 ------------------
src/draive/evaluators/text_consistency.py | 133 -----------------
src/draive/evaluators/text_coverage.py | 137 -----------------
src/draive/evaluators/text_fluency.py | 89 -----------
src/draive/evaluators/text_readability.py | 100 -------------
src/draive/evaluators/text_relevance.py | 130 ----------------
src/draive/evaluators/text_similarity.py | 128 ----------------
src/draive/generation/model/lmm.py | 4 +-
src/draive/types/__init__.py | 6 +-
src/draive/types/multimodal.py | 85 ++++++++++-
31 files changed, 1111 insertions(+), 1130 deletions(-)
create mode 100644 src/draive/evaluators/coherence.py
create mode 100644 src/draive/evaluators/conciseness.py
create mode 100644 src/draive/evaluators/consistency.py
create mode 100644 src/draive/evaluators/coverage.py
create mode 100644 src/draive/evaluators/fluency.py
rename src/draive/evaluators/{text_keywords.py => keywords.py} (73%)
create mode 100644 src/draive/evaluators/readability.py
create mode 100644 src/draive/evaluators/relevance.py
create mode 100644 src/draive/evaluators/similarity.py
delete mode 100644 src/draive/evaluators/text_coherence.py
delete mode 100644 src/draive/evaluators/text_conciseness.py
delete mode 100644 src/draive/evaluators/text_consistency.py
delete mode 100644 src/draive/evaluators/text_coverage.py
delete mode 100644 src/draive/evaluators/text_fluency.py
delete mode 100644 src/draive/evaluators/text_readability.py
delete mode 100644 src/draive/evaluators/text_relevance.py
delete mode 100644 src/draive/evaluators/text_similarity.py
diff --git a/Makefile b/Makefile
index c9310d3..9890709 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ ifndef INSTALL_OPTIONS
endif
ifndef UV_VERSION
- UV_VERSION := 0.2.25
+ UV_VERSION := 0.2.37
endif
.PHONY: install venv sync lock update format lint test release
diff --git a/constraints b/constraints
index c0b444c..658ae02 100644
--- a/constraints
+++ b/constraints
@@ -57,7 +57,9 @@ idna==3.7
iniconfig==2.0.0
# via pytest
jiter==0.5.0
- # via anthropic
+ # via
+ # anthropic
+ # openai
loguru==0.7.2
# via fastembed
markdown-it-py==3.0.0
@@ -80,9 +82,9 @@ numpy==1.26.4
# onnxruntime
onnx==1.16.2
# via fastembed
-onnxruntime==1.18.1
+onnxruntime==1.19.0
# via fastembed
-openai==1.38.0
+openai==1.41.0
# via draive (pyproject.toml)
packaging==24.1
# via
@@ -107,7 +109,7 @@ pydantic-core==2.20.1
# via pydantic
pygments==2.18.0
# via rich
-pyright==1.1.374
+pyright==1.1.375
# via draive (pyproject.toml)
pystemmer==2.2.0.1
# via fastembed
@@ -120,7 +122,7 @@ pytest-asyncio==0.23.8
# via draive (pyproject.toml)
pytest-cov==4.1.0
# via draive (pyproject.toml)
-pyyaml==6.0.1
+pyyaml==6.0.2
# via
# bandit
# huggingface-hub
@@ -133,7 +135,7 @@ requests==2.32.3
# tiktoken
rich==13.7.1
# via bandit
-ruff==0.5.6
+ruff==0.5.7
# via draive (pyproject.toml)
sentencepiece==0.2.0
# via draive (pyproject.toml)
@@ -147,11 +149,11 @@ snowballstemmer==2.2.0
# via fastembed
stevedore==5.2.0
# via bandit
-sympy==1.13.1
+sympy==1.13.2
# via onnxruntime
tiktoken==0.7.0
# via draive (pyproject.toml)
-tokenizers==0.19.1
+tokenizers==0.20.0
# via
# anthropic
# fastembed
diff --git a/pyproject.toml b/pyproject.toml
index 3f50e73..99d3772 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "draive"
description = "Framework designed to simplify and accelerate the development of LLM-based applications."
-version = "0.25.0"
+version = "0.26.0"
readme = "README.md"
maintainers = [
{ name = "Kacper Kaliński", email = "kacper.kalinski@miquido.com" },
diff --git a/src/draive/__init__.py b/src/draive/__init__.py
index 585060a..5140164 100644
--- a/src/draive/__init__.py
+++ b/src/draive/__init__.py
@@ -148,6 +148,8 @@
MultimodalContent,
MultimodalContentConvertible,
MultimodalContentElement,
+ MultimodalContentPlaceholder,
+ MultimodalTemplate,
RateLimitError,
TextContent,
VideoBase64Content,
@@ -284,10 +286,12 @@
"ModelGeneration",
"ModelGenerator",
"ModelGeneratorDecoder",
+ "Multimodal",
"MultimodalContent",
"MultimodalContentConvertible",
"MultimodalContentElement",
- "Multimodal",
+ "MultimodalContentPlaceholder",
+ "MultimodalTemplate",
"noop",
"not_missing",
"ParameterDefaultFactory",
diff --git a/src/draive/evaluation/__init__.py b/src/draive/evaluation/__init__.py
index 87f2a4b..b8e25e9 100644
--- a/src/draive/evaluation/__init__.py
+++ b/src/draive/evaluation/__init__.py
@@ -1,36 +1,39 @@
from draive.evaluation.evaluator import (
Evaluator,
+ EvaluatorDefinition,
EvaluatorResult,
PreparedEvaluator,
evaluator,
)
from draive.evaluation.scenario import (
+ EvaluationScenarioResult,
PreparedScenarioEvaluator,
ScenarioEvaluator,
ScenarioEvaluatorDefinition,
ScenarioEvaluatorResult,
evaluation_scenario,
)
-from draive.evaluation.score import Evaluation, EvaluationScore
+from draive.evaluation.score import EvaluationScore
from draive.evaluation.suite import (
EvaluationCaseResult,
EvaluationSuite,
EvaluationSuiteCase,
- EvaluationSuiteCaseResult,
EvaluationSuiteDefinition,
EvaluationSuiteStorage,
+ SuiteEvaluatorCaseResult,
+ SuiteEvaluatorResult,
evaluation_suite,
)
__all__ = [
"evaluation_scenario",
"evaluation_suite",
- "Evaluation",
+ "EvaluatorDefinition",
"EvaluationCaseResult",
+ "EvaluationScenarioResult",
"EvaluationScore",
"EvaluationSuite",
"EvaluationSuiteCase",
- "EvaluationSuiteCaseResult",
"EvaluationSuiteDefinition",
"EvaluationSuiteStorage",
"evaluator",
@@ -41,4 +44,6 @@
"ScenarioEvaluator",
"ScenarioEvaluatorDefinition",
"ScenarioEvaluatorResult",
+ "SuiteEvaluatorCaseResult",
+ "SuiteEvaluatorResult",
]
diff --git a/src/draive/evaluation/evaluator.py b/src/draive/evaluation/evaluator.py
index 11ab5e2..c17be7b 100644
--- a/src/draive/evaluation/evaluator.py
+++ b/src/draive/evaluation/evaluator.py
@@ -1,8 +1,9 @@
from collections.abc import Callable
from typing import Protocol, Self, cast, final, overload, runtime_checkable
-from draive.evaluation.score import Evaluation, EvaluationScore
+from draive.evaluation.score import EvaluationScore
from draive.parameters import DataModel, Field, ParameterPath
+from draive.scope import ctx
from draive.utils import freeze
__all__ = [
@@ -10,6 +11,7 @@
"Evaluator",
"EvaluatorResult",
"PreparedEvaluator",
+ "EvaluatorDefinition",
]
@@ -23,12 +25,63 @@ class EvaluatorResult(DataModel):
threshold: float = Field(
description="Score threshold required to pass evaluation",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@property
def passed(self) -> bool:
return self.score.value >= self.threshold
+class EvaluationResult(DataModel):
+ @classmethod
+ async def of(
+ cls,
+ score: EvaluationScore | float | bool,
+ /,
+ meta: dict[str, str | float | int | bool | None] | None = None,
+ ) -> Self:
+ evaluation_score: EvaluationScore
+ match score:
+ case EvaluationScore() as score:
+ evaluation_score = score
+
+ case float() as value:
+ evaluation_score = EvaluationScore(value=value)
+
+ case passed:
+ evaluation_score = EvaluationScore(value=1.0 if passed else 0.0)
+
+ return cls(
+ score=evaluation_score,
+ meta=meta,
+ )
+
+ score: EvaluationScore = Field(
+ description="Evaluation score",
+ )
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
+
+
+@runtime_checkable
+class EvaluatorDefinition[Value, **Args](Protocol):
+ @property
+ def __name__(self) -> str: ...
+
+ async def __call__(
+ self,
+ value: Value,
+ /,
+ *args: Args.args,
+ **kwargs: Args.kwargs,
+ ) -> EvaluationResult | EvaluationScore | float | bool: ...
+
+
@runtime_checkable
class PreparedEvaluator[Value](Protocol):
async def __call__(
@@ -43,14 +96,14 @@ class Evaluator[Value, **Args]:
def __init__(
self,
name: str,
- evaluation: Evaluation[Value, Args],
+ definition: EvaluatorDefinition[Value, Args],
threshold: float | None,
) -> None:
assert ( # nosec: B101
threshold is None or 0 <= threshold <= 1
), "Evaluation threshold has to be between 0 and 1"
- self._evaluation: Evaluation[Value, Args] = evaluation
+ self._definition: EvaluatorDefinition[Value, Args] = definition
self.name: str = name
self.threshold: float = threshold or 1
@@ -62,7 +115,7 @@ def with_threshold(
) -> Self:
return self.__class__(
name=self.name,
- evaluation=self._evaluation,
+ definition=self._definition,
threshold=threshold,
)
@@ -102,8 +155,8 @@ async def evaluation(
value: Mapped,
*args: Args.args,
**kwargs: Args.kwargs,
- ) -> EvaluationScore | float | bool:
- return await self._evaluation(
+ ) -> EvaluationResult | EvaluationScore | float | bool:
+ return await self._definition(
mapper(value),
*args,
**kwargs,
@@ -111,7 +164,7 @@ async def evaluation(
return Evaluator[Mapped, Args](
name=self.name,
- evaluation=evaluation,
+ definition=evaluation,
threshold=self.threshold,
)
@@ -123,34 +176,51 @@ async def __call__(
**kwargs: Args.kwargs,
) -> EvaluatorResult:
evaluation_score: EvaluationScore
- match await self._evaluation(
- value,
- *args,
- **kwargs,
- ):
- case float() as score_value:
- evaluation_score = EvaluationScore(value=score_value)
+ evaluation_meta: dict[str, str | float | int | bool | None] | None
+ try:
+ match await self._definition(
+ value,
+ *args,
+ **kwargs,
+ ):
+ case EvaluationResult() as result:
+ evaluation_score = result.score
+ evaluation_meta = result.meta
- case bool() as score_bool:
- evaluation_score = EvaluationScore(value=1 if score_bool else 0)
+ case EvaluationScore() as score:
+ evaluation_score = score
+ evaluation_meta = None
- case EvaluationScore() as score:
- evaluation_score = score
+ case float() as score_value:
+ evaluation_score = EvaluationScore(value=score_value)
+ evaluation_meta = None
+
+ case passed:
+ evaluation_score = EvaluationScore(value=1 if passed else 0)
+ evaluation_meta = None
- # for whatever reason pyright wants int to be handled...
- case int() as score_int:
- evaluation_score = EvaluationScore(value=float(score_int))
+ except Exception as exc:
+ ctx.log_error(
+ f"Evaluator `{self.name}` failed, using `0` score fallback result",
+ exception=exc,
+ )
+ evaluation_score = EvaluationScore(
+ value=0,
+ comment="Evaluation failed",
+ )
+ evaluation_meta = {"exception": str(exc)}
return EvaluatorResult(
evaluator=self.name,
score=evaluation_score,
threshold=self.threshold,
+ meta=evaluation_meta,
)
@overload
def evaluator[Value, **Args](
- evaluation: Evaluation[Value, Args] | None = None,
+ definition: EvaluatorDefinition[Value, Args] | None = None,
/,
) -> Evaluator[Value, Args]: ...
@@ -161,29 +231,29 @@ def evaluator[Value, **Args](
name: str | None = None,
threshold: float | None = None,
) -> Callable[
- [Evaluation[Value, Args]],
+ [EvaluatorDefinition[Value, Args]],
Evaluator[Value, Args],
]: ...
def evaluator[Value, **Args](
- evaluation: Evaluation[Value, Args] | None = None,
+ evaluation: EvaluatorDefinition[Value, Args] | None = None,
*,
name: str | None = None,
threshold: float | None = None,
) -> (
Callable[
- [Evaluation[Value, Args]],
+ [EvaluatorDefinition[Value, Args]],
Evaluator[Value, Args],
]
| Evaluator[Value, Args]
):
def wrap(
- evaluation: Evaluation[Value, Args],
+ definition: EvaluatorDefinition[Value, Args],
) -> Evaluator[Value, Args]:
return Evaluator(
- name=name or evaluation.__name__,
- evaluation=evaluation,
+ name=name or definition.__name__,
+ definition=definition,
threshold=threshold,
)
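
The renamed `definition` parameter does not change how evaluators are declared. A minimal sketch of the decorator-based flow under the new interface (the evaluator name, threshold, and length heuristic below are illustrative only, not part of this patch):

from draive.evaluation import EvaluationScore, evaluator


@evaluator(name="length_limit", threshold=0.8)
async def length_limit_evaluator(value: str, /, limit: int = 200) -> EvaluationScore:
    # degrade the score linearly once the evaluated text exceeds the limit
    return EvaluationScore(
        value=min(1.0, limit / max(len(value), 1)),
        comment=f"{len(value)} characters against a limit of {limit}",
    )

Calling `await length_limit_evaluator(text, limit=120)` returns an `EvaluatorResult`; with the change above, an exception raised inside the definition is logged and converted into a zero score, with the exception text recorded in `meta`.
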
diff --git a/src/draive/evaluation/scenario.py b/src/draive/evaluation/scenario.py
index 84ae8b7..2952d78 100644
--- a/src/draive/evaluation/scenario.py
+++ b/src/draive/evaluation/scenario.py
@@ -1,8 +1,10 @@
+from asyncio import gather
from collections.abc import Callable, Sequence
-from typing import Protocol, overload, runtime_checkable
+from typing import Protocol, Self, overload, runtime_checkable
-from draive.evaluation.evaluator import EvaluatorResult
+from draive.evaluation.evaluator import EvaluatorResult, PreparedEvaluator
from draive.parameters import DataModel, Field
+from draive.scope import ctx
from draive.types import frozenlist
from draive.utils import freeze
@@ -11,6 +13,7 @@
"ScenarioEvaluator",
"ScenarioEvaluatorDefinition",
"ScenarioEvaluatorResult",
+ "EvaluationScenarioResult",
]
@@ -21,10 +24,44 @@ class ScenarioEvaluatorResult(DataModel):
evaluations: frozenlist[EvaluatorResult] = Field(
description="Scenario evaluation results",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@property
def passed(self) -> bool:
- return all(case.passed for case in self.evaluations)
+ # empty evaluations is equivalent of failure
+ return len(self.evaluations) > 0 and all(case.passed for case in self.evaluations)
+
+
+class EvaluationScenarioResult(DataModel):
+ @classmethod
+ async def evaluating[Value](
+ cls,
+ value: Value,
+ /,
+ evaluators: PreparedEvaluator[Value],
+ *_evaluators: PreparedEvaluator[Value],
+ meta: dict[str, str | float | int | bool | None] | None = None,
+ ) -> Self:
+ return cls(
+ evaluations=tuple(
+ await gather(
+ *[evaluator(value) for evaluator in [evaluators, *_evaluators]],
+ return_exceptions=False,
+ ),
+ ),
+ meta=meta,
+ )
+
+ evaluations: frozenlist[EvaluatorResult] = Field(
+ description="Scenario evaluation results",
+ )
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@runtime_checkable
@@ -47,7 +84,7 @@ async def __call__(
/,
*args: Args.args,
**kwargs: Args.kwargs,
- ) -> Sequence[EvaluatorResult]: ...
+ ) -> Sequence[EvaluatorResult] | EvaluationScenarioResult: ...
class ScenarioEvaluator[Value, **Args]:
@@ -84,16 +121,35 @@ async def __call__(
*args: Args.args,
**kwargs: Args.kwargs,
) -> ScenarioEvaluatorResult:
- return ScenarioEvaluatorResult(
- name=self.name,
- evaluations=tuple(
- await self._definition(
- value,
- *args,
- **kwargs,
- )
- ),
- )
+ try:
+ match await self._definition(
+ value,
+ *args,
+ **kwargs,
+ ):
+ case EvaluationScenarioResult() as result:
+ return ScenarioEvaluatorResult(
+ name=self.name,
+ evaluations=result.evaluations,
+ meta=result.meta,
+ )
+
+ case [*results]:
+ return ScenarioEvaluatorResult(
+ name=self.name,
+ evaluations=tuple(results),
+ )
+ except Exception as exc:
+ ctx.log_error(
+ f"Scenario evaluator `{self.name}` failed, using empty fallback result",
+ exception=exc,
+ )
+
+ return ScenarioEvaluatorResult(
+ name=self.name,
+ evaluations=(),
+ meta={"exception": str(exc)},
+ )
@overload
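
A sketch of how the new `EvaluationScenarioResult.evaluating` helper can be combined with the `evaluation_scenario` decorator, assuming the decorator accepts a `name` argument analogous to `evaluator`; the reference-binding closures below are illustrative:

from draive.evaluation import (
    EvaluationScenarioResult,
    EvaluatorResult,
    evaluation_scenario,
)
from draive.evaluators import coherence_evaluator, conciseness_evaluator


@evaluation_scenario(name="summary_quality")
async def summary_quality_scenario(value: str, /, reference: str) -> EvaluationScenarioResult:
    # bind the reference so each evaluator satisfies the PreparedEvaluator protocol
    async def coherence(evaluated: str) -> EvaluatorResult:
        return await coherence_evaluator(evaluated, reference=reference)

    async def conciseness(evaluated: str) -> EvaluatorResult:
        return await conciseness_evaluator(evaluated, reference=reference)

    return await EvaluationScenarioResult.evaluating(
        value,
        coherence,
        conciseness,
    )
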
diff --git a/src/draive/evaluation/score.py b/src/draive/evaluation/score.py
index 3082438..3acf94c 100644
--- a/src/draive/evaluation/score.py
+++ b/src/draive/evaluation/score.py
@@ -1,9 +1,6 @@
-from typing import Protocol, runtime_checkable
-
from draive.parameters import DataModel, Field
__all__ = [
- "Evaluation",
"EvaluationScore",
]
@@ -24,17 +21,3 @@ class EvaluationScore(DataModel):
description="Explanation of the score",
default=None,
)
-
-
-@runtime_checkable
-class Evaluation[Value, **Args](Protocol):
- @property
- def __name__(self) -> str: ...
-
- async def __call__(
- self,
- value: Value,
- /,
- *args: Args.args,
- **kwargs: Args.kwargs,
- ) -> EvaluationScore | float | bool: ...
diff --git a/src/draive/evaluation/suite.py b/src/draive/evaluation/suite.py
index bbe592d..9a45f2d 100644
--- a/src/draive/evaluation/suite.py
+++ b/src/draive/evaluation/suite.py
@@ -15,9 +15,10 @@
"evaluation_suite",
"EvaluationCaseResult",
"EvaluationSuite",
- "EvaluationSuiteCaseResult",
"EvaluationSuiteDefinition",
"EvaluationSuiteStorage",
+ "SuiteEvaluatorCaseResult",
+ "SuiteEvaluatorResult",
]
@@ -27,20 +28,33 @@ class EvaluationSuiteCase[CaseParameters: DataModel](DataModel):
comment: str | None = None
-class EvaluationSuiteCaseResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
+class SuiteEvaluatorCaseResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
case: EvaluationSuiteCase[CaseParameters] = Field(
description="Evaluated case",
)
value: Value = Field(
description="Evaluated value",
)
- results: frozenlist[ScenarioEvaluatorResult | EvaluatorResult] = Field(
+ results: frozenlist[ScenarioEvaluatorResult] = Field(
description="Evaluation results",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@property
def passed(self) -> bool:
- return all(result.passed for result in self.results)
+ # empty results is equivalent of failure
+ return len(self.results) > 0 and all(result.passed for result in self.results)
+
+
+class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
+ cases: list[SuiteEvaluatorCaseResult[CaseParameters, Value]]
+
+ @property
+ def passed(self) -> bool:
+ return all(case.passed for case in self.cases)
class EvaluationCaseResult[Value: DataModel | str](DataModel):
@@ -50,10 +64,30 @@ def of(
results: ScenarioEvaluatorResult | EvaluatorResult,
*_results: ScenarioEvaluatorResult | EvaluatorResult,
value: Value,
+ meta: dict[str, str | float | int | bool | None] | None = None,
) -> Self:
+ free_results: list[EvaluatorResult] = []
+ scenario_results: list[ScenarioEvaluatorResult] = []
+ for result in (results, *_results):
+ match result:
+ case ScenarioEvaluatorResult() as scenario_result:
+ scenario_results.append(scenario_result)
+
+ case EvaluatorResult() as evaluator_result:
+ free_results.append(evaluator_result)
+
+ if free_results:
+ scenario_results.append(
+ ScenarioEvaluatorResult(
+ name="EvaluationSuite",
+ evaluations=tuple(free_results),
+ )
+ )
+
return cls(
value=value,
- results=(results, *_results),
+ results=tuple(scenario_results),
+ meta=meta,
)
@classmethod
@@ -63,23 +97,27 @@ async def evaluating(
/,
evaluators: PreparedScenarioEvaluator[Value] | PreparedEvaluator[Value],
*_evaluators: PreparedScenarioEvaluator[Value] | PreparedEvaluator[Value],
+ meta: dict[str, str | float | int | bool | None] | None = None,
) -> Self:
- return cls(
- value=value,
- results=tuple(
- await gather(
- *[evaluator(value) for evaluator in [evaluators, *_evaluators]],
- return_exceptions=False,
- ),
+ return cls.of(
+ *await gather(
+ *[evaluator(value) for evaluator in [evaluators, *_evaluators]],
+ return_exceptions=False,
),
+ value=value,
+ meta=meta,
)
value: Value = Field(
description="Evaluated value",
)
- results: frozenlist[ScenarioEvaluatorResult | EvaluatorResult] = Field(
+ results: frozenlist[ScenarioEvaluatorResult] = Field(
description="Evaluation results",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@runtime_checkable
@@ -124,7 +162,7 @@ async def __call__(
/,
*,
reload: bool = False,
- ) -> EvaluationSuiteCaseResult[CaseParameters, Value]: ...
+ ) -> SuiteEvaluatorCaseResult[CaseParameters, Value]: ...
@overload
async def __call__(
@@ -132,7 +170,7 @@ async def __call__(
/,
*,
reload: bool = False,
- ) -> list[EvaluationSuiteCaseResult[CaseParameters, Value]]: ...
+ ) -> SuiteEvaluatorResult[CaseParameters, Value]: ...
async def __call__(
self,
@@ -141,18 +179,20 @@ async def __call__(
*,
reload: bool = False,
) -> (
- list[EvaluationSuiteCaseResult[CaseParameters, Value]]
- | EvaluationSuiteCaseResult[CaseParameters, Value]
+ SuiteEvaluatorResult[CaseParameters, Value]
+ | SuiteEvaluatorCaseResult[CaseParameters, Value]
):
async with self._lock:
match parameters:
case None:
- return await gather(
- *[
- self._evaluate(case=case)
- for case in (await self._data(reload=reload)).cases
- ],
- return_exceptions=False,
+ return SuiteEvaluatorResult(
+ cases=await gather(
+ *[
+ self._evaluate(case=case)
+ for case in (await self._data(reload=reload)).cases
+ ],
+ return_exceptions=False,
+ )
)
case UUID() as identifier:
@@ -180,10 +220,10 @@ async def _evaluate(
self,
*,
case: EvaluationSuiteCase[CaseParameters],
- ) -> EvaluationSuiteCaseResult[CaseParameters, Value]:
+ ) -> SuiteEvaluatorCaseResult[CaseParameters, Value]:
result: EvaluationCaseResult[Value] = await self._definition(parameters=case.parameters)
- return EvaluationSuiteCaseResult[CaseParameters, Value](
+ return SuiteEvaluatorCaseResult[CaseParameters, Value](
case=case,
value=result.value,
results=result.results,
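
Since a suite invoked without a case identifier now returns a `SuiteEvaluatorResult` instead of a bare list, downstream code reads the per-case results from its `cases` attribute; a small reporting helper as a sketch:

from draive.evaluation import SuiteEvaluatorResult


def summarize_suite(result: SuiteEvaluatorResult) -> str:
    # SuiteEvaluatorResult.passed is True only when every case passed
    failed: int = sum(1 for case in result.cases if not case.passed)
    status: str = "PASSED" if result.passed else "FAILED"
    return f"{status}: {len(result.cases) - failed}/{len(result.cases)} cases passed"
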
diff --git a/src/draive/evaluators/__init__.py b/src/draive/evaluators/__init__.py
index 68d8735..923af8e 100644
--- a/src/draive/evaluators/__init__.py
+++ b/src/draive/evaluators/__init__.py
@@ -1,25 +1,27 @@
-from draive.evaluators.text_coherence import text_coherence_evaluator
-from draive.evaluators.text_conciseness import text_conciseness_evaluator
-from draive.evaluators.text_consistency import text_consistency_evaluator
-from draive.evaluators.text_coverage import text_coverage_evaluator
-from draive.evaluators.text_fluency import text_fluency_evaluator
-from draive.evaluators.text_keywords import text_keywords_evaluator
-from draive.evaluators.text_readability import text_readability_evaluator
-from draive.evaluators.text_relevance import text_relevance_evaluator
-from draive.evaluators.text_similarity import (
- text_similarity_evaluator,
+from draive.evaluators.coherence import coherence_evaluator
+from draive.evaluators.conciseness import conciseness_evaluator
+from draive.evaluators.consistency import consistency_evaluator
+from draive.evaluators.coverage import coverage_evaluator
+from draive.evaluators.fluency import fluency_evaluator
+from draive.evaluators.keywords import keywords_evaluator
+from draive.evaluators.readability import readability_evaluator
+from draive.evaluators.relevance import relevance_evaluator
+from draive.evaluators.similarity import (
+ image_vector_similarity_evaluator,
+ similarity_evaluator,
text_vector_similarity_evaluator,
)
__all__ = [
- "text_coherence_evaluator",
- "text_conciseness_evaluator",
- "text_consistency_evaluator",
- "text_coverage_evaluator",
- "text_fluency_evaluator",
- "text_readability_evaluator",
- "text_relevance_evaluator",
- "text_keywords_evaluator",
- "text_similarity_evaluator",
+ "coherence_evaluator",
+ "conciseness_evaluator",
+ "consistency_evaluator",
+ "coverage_evaluator",
+ "fluency_evaluator",
+ "image_vector_similarity_evaluator",
+ "keywords_evaluator",
+ "readability_evaluator",
+ "relevance_evaluator",
+ "similarity_evaluator",
"text_vector_similarity_evaluator",
]
diff --git a/src/draive/evaluators/coherence.py b/src/draive/evaluators/coherence.py
new file mode 100644
index 0000000..d80a58f
--- /dev/null
+++ b/src/draive/evaluators/coherence.py
@@ -0,0 +1,90 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "coherence_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is coherence - a collective quality of the content.
+We align this dimension with the DUC (Document Understanding Conference) quality question of \
+structure and coherence, whereby the content should be well-structured and well-organized.
+EVALUATED content should not just be a heap of related information, but should build from part
+to part into a coherent body of information about the topic.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a coherence score using value between 0.0 and 4.0 where:
+0.0 is very low coherence - the content is chaotic, lacking logical connections between parts.
+1.0 is low coherence - some connections are visible, but the overall structure is weak.
+2.0 is moderate coherence - the content has a noticeable structure, but with some shortcomings.
+3.0 is good coherence - the content is well-organized with minor imperfections.
+4.0 is excellent coherence - the content is exemplarily structured, with smooth transitions \
+between ideas.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="coherence")
+async def coherence_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
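
Each of the new evaluators is invoked the same way; a usage sketch for the coherence one, assuming text generation is already backed by an LMM provider in the active context (provider setup is omitted here):

import asyncio

from draive.evaluators import coherence_evaluator


async def main() -> None:
    # scores the evaluated content against the reference on the normalized 0..1 scale
    result = await coherence_evaluator(
        "Bees make honey, a natural sweetener with antibacterial properties.",
        reference="Honey is a natural sweetener produced by bees. It has antibacterial properties.",
    )
    print(result.score.value, result.passed)


asyncio.run(main())
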
diff --git a/src/draive/evaluators/conciseness.py b/src/draive/evaluators/conciseness.py
new file mode 100644
index 0000000..7fe01a2
--- /dev/null
+++ b/src/draive/evaluators/conciseness.py
@@ -0,0 +1,88 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "conciseness_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a conciseness metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is conciseness - the extent to which the EVALUATED content is brief and to the \
+point while still covering all key information.
+A concise content avoids unnecessary details and repetition, also avoiding being overly verbose \
+or including irrelevant information.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a conciseness score using value between 0.0 and 4.0 where:
+0.0 is very low conciseness - the content is excessively verbose with much irrelevant information.
+1.0 is low conciseness - the content contains unnecessary details and some irrelevant information.
+2.0 is moderate conciseness - the content is somewhat concise but could be more focused.
+3.0 is good conciseness - the content is mostly concise with minimal unnecessary information.
+4.0 is excellent conciseness - the content is highly concise, containing only essential information.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="conciseness")
+async def conciseness_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/consistency.py b/src/draive/evaluators/consistency.py
new file mode 100644
index 0000000..93fd4b5
--- /dev/null
+++ b/src/draive/evaluators/consistency.py
@@ -0,0 +1,90 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "consistency_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a consistency metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is consistency - a factual alignment between the REFERENCE and the EVALUATED content.
+A factually consistent content contains only elements that are entailed by the REFERENCE content.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a consistency score using value between 0.0 and 4.0 where:
+0.0 is very low consistency - the content contains multiple hallucinated facts \
+or significant misalignments with the reference content.
+1.0 is low consistency - the content has several instances of information not supported by \
+the reference content.
+2.0 is moderate consistency - the content is mostly consistent but contains a few unsupported \
+statements.
+3.0 is good consistency - the content is largely consistent with minor discrepancies.
+4.0 is excellent consistency - the content is fully consistent with the reference content, \
+containing only supported information.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+""" # noqa: E501
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="consistency")
+async def consistency_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/coverage.py b/src/draive/evaluators/coverage.py
new file mode 100644
index 0000000..c4713b4
--- /dev/null
+++ b/src/draive/evaluators/coverage.py
@@ -0,0 +1,88 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "coverage_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a coverage metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is coverage - the extent to which the EVALUATED content includes all \
+the key points from the REFERENCE content.
+EVALUATED content with good coverage includes all the important information from \
+the REFERENCE content without omitting critical points.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a coverage score using value between 0.0 and 4.0 where:
+0.0 is very low coverage - the content misses most key points from the reference content.
+1.0 is low coverage - the content includes some key points but omits several important ones.
+2.0 is moderate coverage - the content covers most key points but misses a few important details.
+3.0 is good coverage - the content includes nearly all key points with minor omissions.
+4.0 is excellent coverage - the content comprehensively covers all key points from the reference content.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+""" # noqa: E501
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="coverage")
+async def coverage_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/fluency.py b/src/draive/evaluators/fluency.py
new file mode 100644
index 0000000..74dac21
--- /dev/null
+++ b/src/draive/evaluators/fluency.py
@@ -0,0 +1,74 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "fluency_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Carefully examine provided CONTENT, then rate it using solely a \
+fluency metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is fluency - the quality of the content in terms of grammar, spelling, \
+punctuation, content choice, and overall structure.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a fluency score using value between 0.0 and 2.0 where:
+0.0 is poor fluency - the content has many errors that make it hard to understand or look unnatural.
+1.0 is fair fluency - the content has some errors that affect the clarity or smoothness, \
+but the main points are still comprehensible.
+2.0 is good fluency - the content has few or no errors and is easy to read and follow.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<CONTENT>",
+    ("content",),
+    "</CONTENT>",
+)
+
+
+@evaluator(name="fluency")
+async def fluency_evaluator(
+ content: Multimodal,
+ /,
+) -> EvaluationScore:
+ if not content:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ content=content,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 2,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/text_keywords.py b/src/draive/evaluators/keywords.py
similarity index 73%
rename from src/draive/evaluators/text_keywords.py
rename to src/draive/evaluators/keywords.py
index 737fa8e..1fa8163 100644
--- a/src/draive/evaluators/text_keywords.py
+++ b/src/draive/evaluators/keywords.py
@@ -1,23 +1,24 @@
from collections.abc import Callable, Sequence
from draive.evaluation import EvaluationScore, evaluator
+from draive.types import Multimodal, MultimodalContent
__all__ = [
- "text_keywords_evaluator",
+ "keywords_evaluator",
]
-@evaluator(name="text_keywords")
-async def text_keywords_evaluator(
- text: str,
+@evaluator(name="keywords")
+async def keywords_evaluator(
+ content: Multimodal,
/,
keywords: Sequence[str],
normalization: Callable[[str], str] | None = None,
) -> EvaluationScore:
- if not text:
+ if not content:
return EvaluationScore(
value=0,
- comment="Input text was empty!",
+ comment="Input was empty!",
)
if not keywords:
@@ -33,7 +34,7 @@ async def text_keywords_evaluator(
else:
text_normalization = _lowercased
- normalized_text: str = text_normalization(text)
+ normalized_text: str = text_normalization(MultimodalContent.of(content).as_string())
return EvaluationScore(
value=len(
[keyword for keyword in keywords if text_normalization(keyword) in normalized_text]
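
The keywords evaluator keeps its purely lexical behaviour but now accepts multimodal input, which it flattens to text before matching; a short usage sketch (the keyword list below is illustrative):

from draive.evaluators import keywords_evaluator


async def contains_required_terms(summary: str) -> bool:
    # no model call involved; matching is case-insensitive unless a custom
    # normalization callable is supplied
    result = await keywords_evaluator(
        summary,
        keywords=["solar", "electricity", "panels"],
    )
    return result.passed
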
diff --git a/src/draive/evaluators/readability.py b/src/draive/evaluators/readability.py
new file mode 100644
index 0000000..3a2b0b7
--- /dev/null
+++ b/src/draive/evaluators/readability.py
@@ -0,0 +1,80 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "readability_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Carefully examine provided CONTENT, then rate it using solely a \
+readability metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is readability - the ease with which a reader can understand the content.
+A readable content uses clear and concise language, is well-structured,
+and avoids complex or convoluted elements.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a readability score using value between 0.0 and 4.0 where:
+0.0 is very low readability - the content is extremely difficult to understand, \
+with complex language and convoluted structure.
+1.0 is low readability - the content is challenging to read, with frequent use of \
+complex sentences, unclear language or irrelevant parts.
+2.0 is moderate readability - the content is somewhat clear but has some areas \
+that are difficult to understand.
+3.0 is good readability - the content is mostly clear and easy to read, with minor instances \
+of complexity.
+4.0 is excellent readability - the content is highly clear, concise, and easy to understand throughout.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+""" # noqa: E501
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<CONTENT>",
+    ("content",),
+    "</CONTENT>",
+)
+
+
+@evaluator(name="readability")
+async def readability_evaluator(
+ content: Multimodal,
+ /,
+) -> EvaluationScore:
+ if not content:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ content=content,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/relevance.py b/src/draive/evaluators/relevance.py
new file mode 100644
index 0000000..ae62475
--- /dev/null
+++ b/src/draive/evaluators/relevance.py
@@ -0,0 +1,91 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "relevance_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a relevance metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is relevance - selection of important parts from the REFERENCE content.
+The EVALUATED content should include only important information from the REFERENCE avoiding \
+redundancies and excess information.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a relevance score using value between 0.0 and 4.0 where:
+0.0 is very low relevance - the content contains mostly irrelevant or redundant information.
+1.0 is low relevance - the content includes some important points but has \
+significant irrelevant parts.
+2.0 is moderate relevance - the content covers most important points but includes \
+some unnecessary information.
+3.0 is good relevance - the content focuses on important information with minor inclusions \
+of less relevant content.
+4.0 is excellent relevance - the content precisely captures only the most important information \
+from the reference.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="relevance")
+async def relevance_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/score.py b/src/draive/evaluators/score.py
index 127e498..3cb5d94 100644
--- a/src/draive/evaluators/score.py
+++ b/src/draive/evaluators/score.py
@@ -39,14 +39,14 @@ def _score_validator(
class CommonScoreModel(DataModel):
- score: float = Field(
- description="Decimal score value",
- validator=_score_validator,
- )
comment: str | None = Field(
description="Explanation of the score",
default=None,
)
+ score: float = Field(
+ description="Decimal score value",
+ validator=_score_validator,
+ )
def normalized(
self,
diff --git a/src/draive/evaluators/similarity.py b/src/draive/evaluators/similarity.py
new file mode 100644
index 0000000..6480956
--- /dev/null
+++ b/src/draive/evaluators/similarity.py
@@ -0,0 +1,133 @@
+from base64 import b64decode
+
+from draive.embedding import Embedded, embed_images, embed_texts
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.similarity.score import vector_similarity_score
+from draive.types import (
+ ImageBase64Content,
+ Multimodal,
+ MultimodalTemplate,
+)
+from draive.utils import xml_tag
+
+__all__ = [
+ "similarity_evaluator",
+ "text_vector_similarity_evaluator",
+ "image_vector_similarity_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a similarity metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is similarity - the degree of semantic similarity between the REFERENCE \
+and the EVALUATED content.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a similarity score using value between 0.0 and 2.0 where:
+0.0 is no similarity - the content is completely unrelated in meaning.
+1.0 is moderate similarity - the content shares some common themes or ideas.
+2.0 is high similarity - the content is very close in meaning \
+or conveys the same information.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="similarity")
+async def similarity_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 2,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
+
+
+@evaluator(name="text_vector_similarity")
+async def text_vector_similarity_evaluator(
+ evaluated: str,
+ /,
+ reference: str,
+) -> float:
+ embedding: list[Embedded[str]] = await embed_texts([reference, evaluated])
+
+ return vector_similarity_score(embedding[0].vector, embedding[1].vector)
+
+
+@evaluator(name="image_vector_similarity")
+async def image_vector_similarity_evaluator(
+ evaluated: ImageBase64Content | bytes,
+ /,
+ reference: ImageBase64Content | bytes,
+) -> float:
+ evaluated_data: bytes
+ match evaluated:
+ case ImageBase64Content() as base64_data:
+ evaluated_data = b64decode(base64_data.image_base64)
+
+ case raw_data:
+ evaluated_data = raw_data
+
+ reference_data: bytes
+ match reference:
+ case ImageBase64Content() as base64_data:
+ reference_data = b64decode(base64_data.image_base64)
+
+ case raw_data:
+ reference_data = raw_data
+
+ embedding: list[Embedded[bytes]] = await embed_images([reference_data, evaluated_data])
+
+ return vector_similarity_score(embedding[0].vector, embedding[1].vector)
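
The similarity module now exposes both an LLM-scored evaluator and embedding-based variants; a sketch combining them, assuming an embedding provider is configured in the active context:

from draive.evaluators import similarity_evaluator, text_vector_similarity_evaluator

# a stricter variant of the LLM-scored evaluator; thresholds must stay within 0..1
strict_similarity = similarity_evaluator.with_threshold(0.5)


async def vector_similarity(generated: str, source: str) -> float:
    # embedding-based comparison; returns the normalized similarity score
    result = await text_vector_similarity_evaluator(generated, reference=source)
    return result.score.value
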
diff --git a/src/draive/evaluators/text_coherence.py b/src/draive/evaluators/text_coherence.py
deleted file mode 100644
index f0d5464..0000000
--- a/src/draive/evaluators/text_coherence.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_coherence_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Coherence metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Coherence (0.0-4.0) - the collective quality of all sentences.
-We align this dimension with the DUC (Document Understanding Conference) quality question of \
-structure and coherence, whereby the text should be well-structured and well-organized.
-The compared text should not just be a heap of related information, but should build from sentence
-to sentence into a coherent body of information about a topic.
-
-Rating Scale:
-0.0: Very low coherence - the text is chaotic, lacking logical connections between sentences.
-1.0: Low coherence - some connections are visible, but the overall structure is weak.
-2.0: Moderate coherence - the text has a noticeable structure, but with some shortcomings.
-3.0: Good coherence - the text is well-organized with minor imperfections.
-4.0: Excellent coherence - the text is exemplarily structured, with smooth transitions \
-between ideas.
-
-Evaluation Steps:
-1. Read the reference text carefully and identify the main topic and key points.
-2. Read the compared text and compare it to the reference text.
-Check if the compared text covers the main topic and key points of the reference text, \
-and if it presents them in a clear and logical order.
-3. Assign a coherence score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-INPUT_TEMPLATE: str = """
-<REFERENCE_TEXT>
-{reference}
-</REFERENCE_TEXT>
-
-<COMPARED_TEXT>
-{compared}
-</COMPARED_TEXT>
-"""
-
-
-@evaluator(name="text_coherence")
-async def text_coherence_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Solar energy is a renewable energy source that is gaining popularity. "
- "Solar panels convert sunlight into electricity. "
- "This technology is environmentally friendly and can reduce electricity "
- "bills. However, installing solar panels requires an initial investment "
- "and is dependent on weather conditions."
- ),
- compared=(
- "Solar panels are on roofs. Energy is important. "
- "The sun shines brightly. Electricity bills can be high. "
- "Technology is developing fast. People like to save money."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular beverage worldwide. "
- "It's made from roasted coffee beans. Caffeine in coffee "
- "can boost energy and alertness. However, excessive consumption may "
- "lead to sleep issues."
- ),
- compared=(
- "Coffee is drunk by many people. It comes from beans that are roasted. "
- "Caffeine makes you feel more awake. "
- "Drinking too much coffee might make it hard to sleep. "
- "Some people add milk or sugar to their coffee."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Honey is a natural sweetener produced by bees. "
- "It has antibacterial properties and is rich in antioxidants. "
- "People use honey in cooking, as a spread, and for medicinal "
- "purposes. However, it's high in calories and should be consumed "
- "in moderation."
- ),
- compared=(
- "Bees create honey, a natural sweetener with multiple benefits. "
- "Its antibacterial and antioxidant-rich composition makes it valuable "
- "for culinary, nutritional, and medicinal uses. While versatile, "
- "honey's high caloric content necessitates mindful consumption."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_conciseness.py b/src/draive/evaluators/text_conciseness.py
deleted file mode 100644
index 0fc911e..0000000
--- a/src/draive/evaluators/text_conciseness.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_conciseness_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Conciseness metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Conciseness (0.0-4.0) - the extent to which the compared text is brief and to the point \
-while still covering all key information.
-A concise compared text avoids unnecessary details and repetition.
-Annotators should penalize compared texts that are overly verbose or include irrelevant information.
-
-Rating Scale:
-0.0: Very low conciseness - the text is excessively verbose with much irrelevant information.
-1.0: Low conciseness - the text contains unnecessary details and some irrelevant information.
-2.0: Moderate conciseness - the text is somewhat concise but could be more focused.
-3.0: Good conciseness - the text is mostly concise with minimal unnecessary information.
-4.0: Excellent conciseness - the text is highly concise, containing only essential information.
-
-Evaluation Steps:
-1. Read the derived text and the reference text carefully.
-2. Compare the compared text to the reference text and identify the main \
-points of the reference text.
-3. Assess how well the compared text covers the main points of the reference text, \
-and how much irrelevant or redundant information it contains.
-4. Assign a conciseness score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-<REFERENCE_TEXT>
-{reference}
-</REFERENCE_TEXT>
-
-<COMPARED_TEXT>
-{compared}
-</COMPARED_TEXT>
-"""
-
-
-@evaluator(name="text_conciseness")
-async def text_conciseness_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Solar energy is a renewable energy source that is gaining popularity. "
- "Solar panels convert sunlight into electricity. "
- "This technology is environmentally friendly and can reduce electricity "
- "bills. However,installing solar panels requires an initial investment and "
- "is dependent on weather conditions."
- ),
- compared=(
- "Did you know that solar energy is becoming super popular these days? "
- "It's this amazing, eco-friendly way to make electricity using "
- "the sun's rays. People are getting really excited about it! Basically, "
- "you put these special panels on your roof, and they soak up the sunlight "
- "like a sponge. Then, through some pretty cool science stuff, "
- "they turn that sunlight into electricity you can use in your house. "
- "It's pretty neat, right? And get this - it can actually help you save "
- "money on your electricity bills in the long run. But here's the thing: "
- "you've got to shell out some cash upfront to get those panels installed. "
- "It's kind of like buying a fancy coffee machine - costs a bit at first, "
- "but then you save on all those coffee shop visits."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular beverage worldwide. "
- "It's made from roasted coffee beans. Caffeine in coffee "
- "can boost energy and alertness. However, excessive consumption may "
- "lead to sleep issues."
- ),
- compared=(
- "Coffee is a widely consumed beverage made from roasted coffee beans. "
- "It contains caffeine, which can enhance energy and alertness. However, "
- "drinking too much coffee may cause sleep problems. "
- "People enjoy coffee for its taste and stimulating effects, but it's "
- "important to consume it in moderation."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "The water cycle, also known as the hydrologic cycle, "
- "describes the continuous movement of water within the Earth and "
- "atmosphere. It involves processes such as evaporation, condensation, "
- "precipitation, and runoff."
- ),
- compared=(
- "The water cycle is the continuous movement of water on Earth. "
- "It includes evaporation, condensation, precipitation, and runoff."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_consistency.py b/src/draive/evaluators/text_consistency.py
deleted file mode 100644
index 52582c7..0000000
--- a/src/draive/evaluators/text_consistency.py
+++ /dev/null
@@ -1,133 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_consistency_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Consistency metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Consistency(0.0-4.0) - the factual alignment between the reference text and the compared text.
-A factually consistent compared text contains only statements that are entailed \
-by the reference text.
-Annotators should penalize compared texts that contain hallucinated facts.
-
-Rating Scale:
-0.0: Very low consistency - the text contains multiple hallucinated facts \
-or significant misalignments with the reference text.
-1.0: Low consistency - the text has several instances of information not supported by \
-the reference text.
-2.0: Moderate consistency - the text is mostly consistent but contains a few unsupported statements.
-3.0: Good consistency - the text is largely consistent with minor discrepancies.
-4.0: Excellent consistency - the text is fully consistent with the reference text, \
-containing only supported information.
-
-Evaluation Steps:
-1. Read the compared text and the reference text carefully.
-2. Compare the compared text to the reference text and identify the main points \
-of the reference text.
-3. Assess how well the compared text covers the main points of the reference text \
-and how much irrelevant or redundant information it contains.
-4. Assign a consistency score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_consistency")
-async def text_consistency_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Dolphins are intelligent marine mammals. They use echolocation "
- "to navigate and hunt. Dolphins live in social groups called pods."
- ),
- compared=(
- "Dolphins are smart fish that can fly short distances. They use sonar "
- "to talk to whales. Dolphins live in families and go to school "
- "to learn hunting techniques."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular beverage worldwide. "
- "It's made from roasted coffee beans. Caffeine in coffee "
- "can boost energy and alertness. However, excessive consumption may "
- "lead to sleep issues."
- ),
- compared=(
- "Coffee is a widely consumed drink around the world. It's produced "
- "by roasting coffee beans. The caffeine in coffee can increase energy "
- "levels and improve alertness. However, drinking too much coffee might "
- "cause sleep problems. Coffee is also known to improve memory and reduce "
- "the risk of certain diseases."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Photosynthesis is the process by which plants use sunlight to "
- "produce energy. It requires water, carbon dioxide, and chlorophyll. "
- "Oxygen is released as a byproduct of photosynthesis."
- ),
- compared=(
- "Plants carry out photosynthesis to create energy from sunlight. "
- "This process needs water, carbon dioxide, and the green pigment "
- "chlorophyll. As plants photosynthesize, "
- "they release oxygen into the environment."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_coverage.py b/src/draive/evaluators/text_coverage.py
deleted file mode 100644
index 7de871d..0000000
--- a/src/draive/evaluators/text_coverage.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_coverage_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Coverage metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Coverage (0.0-4.0) - the extent to which the compared text includes all \
-the key points from the reference text.
-A compared text with good coverage includes all the important information from \
-the reference text without omitting critical points.
-Annotators should penalize compared texts that miss significant content.
-
-Rating Scale:
-0.0: Very low coverage - the text misses most key points from the reference text.
-1.0: Low coverage - the text includes some key points but omits several important ones.
-2.0: Moderate coverage - the text covers most key points but misses a few important details.
-3.0: Good coverage - the text includes nearly all key points with minor omissions.
-4.0: Excellent coverage - the text comprehensively covers all key points from the reference text.
-
-Evaluation Steps:
-1. Read the reference text carefully and identify all key points and important information.
-2. Read the compared text and compare it to the reference text. \
-Check if the compared text includes all the key points and important information \
-from the reference text.
-3. Assess how well the compared text covers the reference text, \
-and if any critical points are missing.
-4. Assign a coverage score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_coverage")
-async def text_coverage_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Smartphones are versatile devices. They can make calls, send messages, "
- "access the internet, take photos, and run various apps. "
- "Many people use smartphones for work and entertainment. "
- "However, excessive use can lead to addiction and sleep problems."
- ),
- compared=(
- "Smartphones can make calls and send messages. They are popular devices."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Recycling helps protect the environment. It reduces waste in landfills, "
- "conserves natural resources, and saves energy. Common recyclable items "
- "include paper, plastic, glass, and metal. Many cities have recycling "
- "programs, but individual participation is crucial for success."
- ),
- compared=(
- "Recycling is good for the environment. "
- "It reduces waste and saves resources. "
- "People can recycle things like paper and plastic. "
- "Many cities have recycling programs."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Regular exercise is important for health. It strengthens the heart, "
- "builds muscle, and improves flexibility. Exercise can also reduce stress "
- "and boost mood. Experts recommend at least 30 minutes of moderate "
- "activity most days of the week. Walking, swimming, and cycling are "
- "good options for many people."
- ),
- compared=(
- "Regular exercise is crucial for maintaining good health. "
- "It has many benefits, including strengthening the heart, "
- "building muscle, and enhancing flexibility. Exercise also has "
- "mental health benefits, such as reducing stress and improving mood. "
- "Health experts advise doing at least 30 minutes of moderate exercise "
- "on most days. Some popular and accessible forms of exercise "
- "include walking, swimming, and cycling."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
-
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_fluency.py b/src/draive/evaluators/text_fluency.py
deleted file mode 100644
index f184e3f..0000000
--- a/src/draive/evaluators/text_fluency.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_fluency_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a text. Your task is to rate this text using only the Fluency metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Fluency (0.0-2.0) - the quality of the text in terms of grammar, spelling, punctuation, \
-word choice, and sentence structure.
-
-Rating Scale:
-0.0: Poor - the text has many errors that make it hard to understand or sound unnatural.
-1.0: Fair - the text has some errors that affect the clarity or smoothness of the text, \
-but the main points are still comprehensible.
-2.0: Good - the text has few or no errors and is easy to read and follow.
-
-Evaluation Steps:
-1. Read the text and evaluate its fluency based on the given criteria.
-2. Assign a fluency score from 0.0 to 2.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{text}
-
-"""
-
-
-@evaluator(name="text_fluency")
-async def text_fluency_evaluator(
- text: str,
- /,
-) -> EvaluationScore:
- if not text:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(text=text),
- examples=[
- (
- INPUT_TEMPLATE.format(
- text=(
- "The cat sitted on mat. It were very comfrotable. "
- "The sun shine bright in sky."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "The movie was good, but I didn't liked the ending. "
- "It left me feeling confuse and unsatisfied."
- ),
- ),
- CommonScoreModel(score=1.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "The concert last night was amazing. "
- "The band played all their hit songs, and the crowd was energetic "
- "throughout the performance."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- ],
- )
-
- return score.normalized(divider=2)
diff --git a/src/draive/evaluators/text_readability.py b/src/draive/evaluators/text_readability.py
deleted file mode 100644
index 4edccc0..0000000
--- a/src/draive/evaluators/text_readability.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_readability_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a text. Your task is to rate this text using only the Readability metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Readability (0.0-4.0) - the ease with which a reader can understand the text.
-A readable text uses clear and concise language, is well-structured,
-and avoids complex or convoluted sentences. Annotators should penalize texts that \
-are difficult to read or understand.
-
-Rating Scale:
-0.0: Very low readability - the text is extremely difficult to understand, \
-with complex language and convoluted structure.
-1.0: Low readability - the text is challenging to read, with frequent use of \
-complex sentences or unclear language.
-2.0: Moderate readability - the text is somewhat clear but has some areas \
-that are difficult to understand.
-3.0: Good readability - the text is mostly clear and easy to read, with minor instances \
-of complexity.
-4.0: Excellent readability - the text is highly clear, concise, and easy to understand throughout.
-
-Evaluation Steps:
-1. Read the text carefully and evaluate how easy it is to read and understand.
-2. Consider the language used in the text, including clarity, simplicity, and sentence structure.
-3. Assess whether the text is well-structured and free from complex or convoluted sentences.
-4. Assign a readability score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{text}
-
-"""
-
-
-@evaluator(name="text_readability")
-async def text_readability_evaluator(
- text: str,
- /,
-) -> EvaluationScore:
- if not text:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(text=text),
- examples=[
- (
- INPUT_TEMPLATE.format(
- text=(
- "The canine species, frequently domesticated for companionship purposes, "
- "exhibit characteristics of fidelity and ludic propensities that engender "
- "their widespread appeal among human populations as domestic "
- "animal companions."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "Pizza, a widely consumed dish, consists of a circular bread foundation "
- "adorned with various ingredients. Typically, it includes a layer of "
- "tomato-based sauce and cheese, though additional toppings may be "
- "incorporated to suit individual preferences."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "Exercise is good for health. It helps maintain fitness and reduces stress."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
-
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_relevance.py b/src/draive/evaluators/text_relevance.py
deleted file mode 100644
index 26fa6fd..0000000
--- a/src/draive/evaluators/text_relevance.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_relevance_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Relevance metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Relevance (0.0-4.0) - selection of important content from the reference text.
-The compared text should include only important information from the reference text.
-Annotators should penalize compared texts that contain redundancies and excess information.
-
-Rating Scale:
-0.0: Very low relevance - the text contains mostly irrelevant or redundant information.
-1.0: Low relevance - the text includes some important points but has \
-significant irrelevant content.
-2.0: Moderate relevance - the text covers most important points but includes \
-some unnecessary information.
-3.0: Good relevance - the text focuses on important information with minor inclusions \
-of less relevant content.
-4.0: Excellent relevance - the text precisely captures only the most important information \
-from the reference text.
-
-Evaluation Steps:
-1. Read the compared text and the reference text carefully.
-2. Compare the compared text to the reference text and identify \
-the main points of the reference text.
-3. Assess how well the compared text covers the main points of the reference text, \
-and note any irrelevant or redundant information it contains.
-4. Assign a relevance score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_relevance")
-async def text_relevance_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "The sun is the star at the center of our solar system. "
- "It provides light and heat to Earth."
- ),
- compared=(
- "Stars twinkle in the night sky. Some people believe in astrology. "
- "The moon orbits the Earth. Astronauts have been to space. "
- "Solar panels use energy from the sun."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Elephants are the largest land animals. They have long trunks and tusks. "
- "Elephants live in herds and are known for their intelligence."
- ),
- compared=(
- "Elephants are very big animals. They use their trunks to grab food "
- "and water. Elephants live together in groups. They're smart and have "
- "good memories. Some people ride elephants in zoos, "
- "but this can be harmful to the animals."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Bicycles are a popular mode of transportation. They are eco-friendly "
- "and provide exercise. However, cyclists need to follow "
- "traffic rules for safety."
- ),
- compared=(
- "Bicycles are widely used for travel. "
- "They don't pollute and help people stay fit. "
- "Cyclists must obey traffic laws to stay safe."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
-
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_similarity.py b/src/draive/evaluators/text_similarity.py
deleted file mode 100644
index b329ff8..0000000
--- a/src/draive/evaluators/text_similarity.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from draive.embedding import Embedded, embed_texts
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-from draive.similarity.score import vector_similarity_score
-
-__all__ = [
- "text_similarity_evaluator",
- "text_vector_similarity_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given two texts: a reference text and a compared text. \
-Your task is to rate the compared text using only the Similarity metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Similarity (0.0-2.0) - the degree of semantic similarity between the reference text \
-and the compared text.
-
-Rating Scale:
-0.0: No similarity - the reference text and compared text are completely unrelated in meaning.
-1.0: Moderate similarity - the reference text and compared text share some common themes or ideas.
-2.0: High similarity - the reference text and compared text are very close in meaning \
-or convey the same information.
-
-Evaluation Steps:
-1. Read both the reference text and the compared text carefully.
-2. Compare the semantic meaning of the reference text and the compared text.
-3. Assign a similarity score from 0.0 to 2.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_similarity")
-async def text_similarity_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Cats are popular pets. They are independent and like to groom themselves."
- ),
- compared=(
- "Bananas are a healthy fruit. They are rich in potassium and easy to peel."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "The beach is a great place for relaxation. "
- "People enjoy swimming and sunbathing."
- ),
- compared=(
- "Many people like to spend time outdoors. "
- "Parks are popular for picnics and walking."
- ),
- ),
- CommonScoreModel(score=1.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular morning drink. It contains caffeine which helps "
- "people feel more alert."
- ),
- compared=(
- "Many people start their day with coffee. "
- "The caffeine in coffee can increase alertness and energy."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- ],
- )
-
- return score.normalized(divider=2)
-
-
-@evaluator(name="text_vector_similarity")
-async def text_vector_similarity_evaluator(
- compared: str,
- /,
- reference: str,
-) -> float:
- embedding: list[Embedded[str]] = await embed_texts([reference, compared])
-
- return vector_similarity_score(embedding[0].vector, embedding[1].vector)
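
The removed `text_vector_similarity_evaluator` compared embedding vectors directly. As a rough sketch, `vector_similarity_score` presumably computes something like cosine similarity; the snippet below is illustrative only and may differ from draive's actual implementation:

```python
import math


def cosine_similarity(a: list[float], b: list[float]) -> float:
    # Cosine of the angle between two embedding vectors, in [-1.0, 1.0].
    dot = sum(x * y for x, y in zip(a, b, strict=True))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)
```
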
diff --git a/src/draive/generation/model/lmm.py b/src/draive/generation/model/lmm.py
index c529c7d..9bf8bbd 100644
--- a/src/draive/generation/model/lmm.py
+++ b/src/draive/generation/model/lmm.py
@@ -145,9 +145,11 @@ async def lmm_generate_model[Generated: DataModel]( # noqa: PLR0913, C901, PLR0
DEFAULT_INSTRUCTION_EXTENSION: str = """\
-The result have to be a JSON conforming to the following schema:
+
+The result has to be a JSON object conforming to the following schema:
```
{schema}
```
Provide ONLY a single, raw, valid JSON without any comments, formatting or additional elements.
+
"""
diff --git a/src/draive/types/__init__.py b/src/draive/types/__init__.py
index 87b6588..a13f7b9 100644
--- a/src/draive/types/__init__.py
+++ b/src/draive/types/__init__.py
@@ -21,6 +21,8 @@
MultimodalContent,
MultimodalContentConvertible,
MultimodalContentElement,
+ MultimodalContentPlaceholder,
+ MultimodalTemplate,
)
from draive.types.text import TextContent
from draive.types.video import VideoBase64Content, VideoContent, VideoURLContent
@@ -46,10 +48,12 @@
"LMMToolRequests",
"LMMToolResponse",
"Memory",
+ "Multimodal",
"MultimodalContent",
"MultimodalContentConvertible",
"MultimodalContentElement",
- "Multimodal",
+ "MultimodalContentPlaceholder",
+ "MultimodalTemplate",
"RateLimitError",
"TextContent",
"VideoBase64Content",
diff --git a/src/draive/types/multimodal.py b/src/draive/types/multimodal.py
index 91a607c..e4da41c 100644
--- a/src/draive/types/multimodal.py
+++ b/src/draive/types/multimodal.py
@@ -10,12 +10,15 @@
from draive.types.video import VideoBase64Content, VideoContent, VideoURLContent
__all__ = [
+ "Multimodal",
"MultimodalContent",
- "MultimodalContentElement",
"MultimodalContentConvertible",
- "Multimodal",
+ "MultimodalContentElement",
+ "MultimodalContentPlaceholder",
+ "MultimodalTemplate",
]
+
MultimodalContentElement = TextContent | ImageContent | AudioContent | VideoContent | DataModel
MultimodalContentConvertible = str | MultimodalContentElement
@@ -165,10 +168,70 @@ def extending(
def __bool__(self) -> bool:
return bool(self.parts) and any(self.parts)
+ def __str__(self) -> str:
+ return self.as_string()
+
Multimodal = MultimodalContent | MultimodalContentConvertible
+class MultimodalContentPlaceholder(DataModel):
+ identifier: str
+
+
+class MultimodalTemplate(DataModel):
+ @classmethod
+ def of(
+ cls,
+ *elements: Multimodal | MultimodalContentPlaceholder | tuple[str],
+ merge_text: bool = True,
+ skip_empty: bool = True,
+ meta: dict[str, str | float | int | bool | None] | None = None,
+ ) -> Self:
+ return cls(
+ parts=tuple(
+ [
+ MultimodalContentPlaceholder(identifier=element[0])
+ if isinstance(element, tuple)
+ else element
+ for element in elements
+ ]
+ ),
+ merge_text=merge_text,
+ skip_empty=skip_empty,
+ meta=meta,
+ )
+
+ parts: frozenlist[Multimodal | MultimodalContentPlaceholder]
+ merge_text: bool
+ skip_empty: bool
+ meta: dict[str, str | float | int | bool | None] | None
+
+ def format(
+ self,
+ **variables: Multimodal,
+ ) -> MultimodalContent:
+ parts: list[Multimodal] = []
+ for part in self.parts:
+ match part:
+ case MultimodalContentPlaceholder() as placeholder:
+                    if (value := variables.get(placeholder.identifier)) is not None:
+ parts.append(value)
+
+ else:
+                        raise ValueError(f"Missing format variable '{placeholder.identifier}'")
+
+ case part:
+ parts.append(part)
+
+ return MultimodalContent.of(
+ *parts,
+ merge_text=self.merge_text,
+ skip_empty=self.skip_empty,
+ meta=self.meta,
+ )
+
+
def _extract_parts( # noqa: PLR0911
element: Multimodal,
/,
@@ -315,12 +378,18 @@ def _merge_texts(
last_text_element: TextContent | None = None
while element := next(iterator, None):
match element:
- case TextContent() as text: # do not merge texts with different metadata
- if (last_text := last_text_element) and last_text.meta == text.meta:
- last_text_element = TextContent(
- text=last_text.text + text.text,
- meta=text.meta,
- )
+ case TextContent() as text:
+ # do not merge texts with different metadata
+ if last_text := last_text_element:
+ if last_text.meta == text.meta:
+ last_text_element = TextContent(
+ text=last_text.text + text.text,
+ meta=text.meta,
+ )
+
+ else:
+ result.append(last_text)
+ last_text_element = text
else:
last_text_element = text
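
Finally, a minimal usage sketch of the new `MultimodalTemplate`, based only on the code added above; the placeholder names and the top-level `draive` imports are assumptions:

```python
from draive import MultimodalContent, MultimodalTemplate  # assumed re-exports

# 1-tuples become MultimodalContentPlaceholder entries; plain strings stay as-is.
template = MultimodalTemplate.of(
    "Reference:\n",
    ("reference",),
    "\n\nCompared:\n",
    ("compared",),
)

content: MultimodalContent = template.format(
    reference="Coffee is a popular beverage worldwide.",
    compared="Coffee is widely consumed around the world.",
)
print(content)  # __str__ now delegates to as_string()
```
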