From 571768050dd1fa4b9f3ec3ac09b80c0d09fe2dd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kacper=20Kali=C5=84ski?=
<47140412+KaQuMiQ@users.noreply.github.com>
Date: Mon, 19 Aug 2024 13:31:14 +0200
Subject: [PATCH] Refine evaluation interfaces
---
Makefile | 2 +-
constraints | 18 ++-
pyproject.toml | 2 +-
src/draive/__init__.py | 6 +-
src/draive/evaluation/__init__.py | 13 +-
src/draive/evaluation/evaluator.py | 126 ++++++++++++----
src/draive/evaluation/scenario.py | 84 +++++++++--
src/draive/evaluation/score.py | 17 ---
src/draive/evaluation/suite.py | 90 +++++++----
src/draive/evaluators/__init__.py | 40 ++---
src/draive/evaluators/coherence.py | 90 +++++++++++
src/draive/evaluators/conciseness.py | 88 +++++++++++
src/draive/evaluators/consistency.py | 90 +++++++++++
src/draive/evaluators/coverage.py | 88 +++++++++++
src/draive/evaluators/fluency.py | 74 +++++++++
.../{text_keywords.py => keywords.py} | 15 +-
src/draive/evaluators/readability.py | 80 ++++++++++
src/draive/evaluators/relevance.py | 91 ++++++++++++
src/draive/evaluators/score.py | 8 +-
src/draive/evaluators/similarity.py | 133 +++++++++++++++++
src/draive/evaluators/text_coherence.py | 134 -----------------
src/draive/evaluators/text_conciseness.py | 140 ------------------
src/draive/evaluators/text_consistency.py | 133 -----------------
src/draive/evaluators/text_coverage.py | 137 -----------------
src/draive/evaluators/text_fluency.py | 89 -----------
src/draive/evaluators/text_readability.py | 100 -------------
src/draive/evaluators/text_relevance.py | 130 ----------------
src/draive/evaluators/text_similarity.py | 128 ----------------
src/draive/generation/model/lmm.py | 4 +-
src/draive/types/__init__.py | 6 +-
src/draive/types/multimodal.py | 85 ++++++++++-
31 files changed, 1111 insertions(+), 1130 deletions(-)
create mode 100644 src/draive/evaluators/coherence.py
create mode 100644 src/draive/evaluators/conciseness.py
create mode 100644 src/draive/evaluators/consistency.py
create mode 100644 src/draive/evaluators/coverage.py
create mode 100644 src/draive/evaluators/fluency.py
rename src/draive/evaluators/{text_keywords.py => keywords.py} (73%)
create mode 100644 src/draive/evaluators/readability.py
create mode 100644 src/draive/evaluators/relevance.py
create mode 100644 src/draive/evaluators/similarity.py
delete mode 100644 src/draive/evaluators/text_coherence.py
delete mode 100644 src/draive/evaluators/text_conciseness.py
delete mode 100644 src/draive/evaluators/text_consistency.py
delete mode 100644 src/draive/evaluators/text_coverage.py
delete mode 100644 src/draive/evaluators/text_fluency.py
delete mode 100644 src/draive/evaluators/text_readability.py
delete mode 100644 src/draive/evaluators/text_relevance.py
delete mode 100644 src/draive/evaluators/text_similarity.py
diff --git a/Makefile b/Makefile
index c9310d3..9890709 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ ifndef INSTALL_OPTIONS
endif
ifndef UV_VERSION
- UV_VERSION := 0.2.25
+ UV_VERSION := 0.2.37
endif
.PHONY: install venv sync lock update format lint test release
diff --git a/constraints b/constraints
index c0b444c..658ae02 100644
--- a/constraints
+++ b/constraints
@@ -57,7 +57,9 @@ idna==3.7
iniconfig==2.0.0
# via pytest
jiter==0.5.0
- # via anthropic
+ # via
+ # anthropic
+ # openai
loguru==0.7.2
# via fastembed
markdown-it-py==3.0.0
@@ -80,9 +82,9 @@ numpy==1.26.4
# onnxruntime
onnx==1.16.2
# via fastembed
-onnxruntime==1.18.1
+onnxruntime==1.19.0
# via fastembed
-openai==1.38.0
+openai==1.41.0
# via draive (pyproject.toml)
packaging==24.1
# via
@@ -107,7 +109,7 @@ pydantic-core==2.20.1
# via pydantic
pygments==2.18.0
# via rich
-pyright==1.1.374
+pyright==1.1.375
# via draive (pyproject.toml)
pystemmer==2.2.0.1
# via fastembed
@@ -120,7 +122,7 @@ pytest-asyncio==0.23.8
# via draive (pyproject.toml)
pytest-cov==4.1.0
# via draive (pyproject.toml)
-pyyaml==6.0.1
+pyyaml==6.0.2
# via
# bandit
# huggingface-hub
@@ -133,7 +135,7 @@ requests==2.32.3
# tiktoken
rich==13.7.1
# via bandit
-ruff==0.5.6
+ruff==0.5.7
# via draive (pyproject.toml)
sentencepiece==0.2.0
# via draive (pyproject.toml)
@@ -147,11 +149,11 @@ snowballstemmer==2.2.0
# via fastembed
stevedore==5.2.0
# via bandit
-sympy==1.13.1
+sympy==1.13.2
# via onnxruntime
tiktoken==0.7.0
# via draive (pyproject.toml)
-tokenizers==0.19.1
+tokenizers==0.20.0
# via
# anthropic
# fastembed
diff --git a/pyproject.toml b/pyproject.toml
index 3f50e73..99d3772 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "draive"
description = "Framework designed to simplify and accelerate the development of LLM-based applications."
-version = "0.25.0"
+version = "0.26.0"
readme = "README.md"
maintainers = [
{ name = "Kacper Kaliński", email = "kacper.kalinski@miquido.com" },
diff --git a/src/draive/__init__.py b/src/draive/__init__.py
index 585060a..5140164 100644
--- a/src/draive/__init__.py
+++ b/src/draive/__init__.py
@@ -148,6 +148,8 @@
MultimodalContent,
MultimodalContentConvertible,
MultimodalContentElement,
+ MultimodalContentPlaceholder,
+ MultimodalTemplate,
RateLimitError,
TextContent,
VideoBase64Content,
@@ -284,10 +286,12 @@
"ModelGeneration",
"ModelGenerator",
"ModelGeneratorDecoder",
+ "Multimodal",
"MultimodalContent",
"MultimodalContentConvertible",
"MultimodalContentElement",
- "Multimodal",
+ "MultimodalContentPlaceholder",
+ "MultimodalTemplate",
"noop",
"not_missing",
"ParameterDefaultFactory",
diff --git a/src/draive/evaluation/__init__.py b/src/draive/evaluation/__init__.py
index 87f2a4b..b8e25e9 100644
--- a/src/draive/evaluation/__init__.py
+++ b/src/draive/evaluation/__init__.py
@@ -1,36 +1,39 @@
from draive.evaluation.evaluator import (
Evaluator,
+ EvaluatorDefinition,
EvaluatorResult,
PreparedEvaluator,
evaluator,
)
from draive.evaluation.scenario import (
+ EvaluationScenarioResult,
PreparedScenarioEvaluator,
ScenarioEvaluator,
ScenarioEvaluatorDefinition,
ScenarioEvaluatorResult,
evaluation_scenario,
)
-from draive.evaluation.score import Evaluation, EvaluationScore
+from draive.evaluation.score import EvaluationScore
from draive.evaluation.suite import (
EvaluationCaseResult,
EvaluationSuite,
EvaluationSuiteCase,
- EvaluationSuiteCaseResult,
EvaluationSuiteDefinition,
EvaluationSuiteStorage,
+ SuiteEvaluatorCaseResult,
+ SuiteEvaluatorResult,
evaluation_suite,
)
__all__ = [
"evaluation_scenario",
"evaluation_suite",
- "Evaluation",
+ "EvaluatorDefinition",
"EvaluationCaseResult",
+ "EvaluationScenarioResult",
"EvaluationScore",
"EvaluationSuite",
"EvaluationSuiteCase",
- "EvaluationSuiteCaseResult",
"EvaluationSuiteDefinition",
"EvaluationSuiteStorage",
"evaluator",
@@ -41,4 +44,6 @@
"ScenarioEvaluator",
"ScenarioEvaluatorDefinition",
"ScenarioEvaluatorResult",
+ "SuiteEvaluatorCaseResult",
+ "SuiteEvaluatorResult",
]
diff --git a/src/draive/evaluation/evaluator.py b/src/draive/evaluation/evaluator.py
index 11ab5e2..c17be7b 100644
--- a/src/draive/evaluation/evaluator.py
+++ b/src/draive/evaluation/evaluator.py
@@ -1,8 +1,9 @@
from collections.abc import Callable
from typing import Protocol, Self, cast, final, overload, runtime_checkable
-from draive.evaluation.score import Evaluation, EvaluationScore
+from draive.evaluation.score import EvaluationScore
from draive.parameters import DataModel, Field, ParameterPath
+from draive.scope import ctx
from draive.utils import freeze
__all__ = [
@@ -10,6 +11,7 @@
"Evaluator",
"EvaluatorResult",
"PreparedEvaluator",
+ "EvaluatorDefinition",
]
@@ -23,12 +25,63 @@ class EvaluatorResult(DataModel):
threshold: float = Field(
description="Score threshold required to pass evaluation",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@property
def passed(self) -> bool:
return self.score.value >= self.threshold
+class EvaluationResult(DataModel):
+ @classmethod
+ async def of(
+ cls,
+ score: EvaluationScore | float | bool,
+ /,
+ meta: dict[str, str | float | int | bool | None] | None = None,
+ ) -> Self:
+ evaluation_score: EvaluationScore
+ match score:
+ case EvaluationScore() as score:
+ evaluation_score = score
+
+ case float() as value:
+ evaluation_score = EvaluationScore(value=value)
+
+ case passed:
+ evaluation_score = EvaluationScore(value=1.0 if passed else 0.0)
+
+ return cls(
+ score=evaluation_score,
+ meta=meta,
+ )
+
+ score: EvaluationScore = Field(
+ description="Evaluation score",
+ )
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
+
+
+@runtime_checkable
+class EvaluatorDefinition[Value, **Args](Protocol):
+ @property
+ def __name__(self) -> str: ...
+
+ async def __call__(
+ self,
+ value: Value,
+ /,
+ *args: Args.args,
+ **kwargs: Args.kwargs,
+ ) -> EvaluationResult | EvaluationScore | float | bool: ...
+
+
@runtime_checkable
class PreparedEvaluator[Value](Protocol):
async def __call__(
@@ -43,14 +96,14 @@ class Evaluator[Value, **Args]:
def __init__(
self,
name: str,
- evaluation: Evaluation[Value, Args],
+ definition: EvaluatorDefinition[Value, Args],
threshold: float | None,
) -> None:
assert ( # nosec: B101
threshold is None or 0 <= threshold <= 1
), "Evaluation threshold has to be between 0 and 1"
- self._evaluation: Evaluation[Value, Args] = evaluation
+ self._definition: EvaluatorDefinition[Value, Args] = definition
self.name: str = name
self.threshold: float = threshold or 1
@@ -62,7 +115,7 @@ def with_threshold(
) -> Self:
return self.__class__(
name=self.name,
- evaluation=self._evaluation,
+ definition=self._definition,
threshold=threshold,
)
@@ -102,8 +155,8 @@ async def evaluation(
value: Mapped,
*args: Args.args,
**kwargs: Args.kwargs,
- ) -> EvaluationScore | float | bool:
- return await self._evaluation(
+ ) -> EvaluationResult | EvaluationScore | float | bool:
+ return await self._definition(
mapper(value),
*args,
**kwargs,
@@ -111,7 +164,7 @@ async def evaluation(
return Evaluator[Mapped, Args](
name=self.name,
- evaluation=evaluation,
+ definition=evaluation,
threshold=self.threshold,
)
@@ -123,34 +176,51 @@ async def __call__(
**kwargs: Args.kwargs,
) -> EvaluatorResult:
evaluation_score: EvaluationScore
- match await self._evaluation(
- value,
- *args,
- **kwargs,
- ):
- case float() as score_value:
- evaluation_score = EvaluationScore(value=score_value)
+ evaluation_meta: dict[str, str | float | int | bool | None] | None
+ try:
+ match await self._definition(
+ value,
+ *args,
+ **kwargs,
+ ):
+ case EvaluationResult() as result:
+ evaluation_score = result.score
+ evaluation_meta = result.meta
- case bool() as score_bool:
- evaluation_score = EvaluationScore(value=1 if score_bool else 0)
+ case EvaluationScore() as score:
+ evaluation_score = score
+ evaluation_meta = None
- case EvaluationScore() as score:
- evaluation_score = score
+ case float() as score_value:
+ evaluation_score = EvaluationScore(value=score_value)
+ evaluation_meta = None
+
+ case passed:
+ evaluation_score = EvaluationScore(value=1 if passed else 0)
+ evaluation_meta = None
- # for whatever reason pyright wants int to be handled...
- case int() as score_int:
- evaluation_score = EvaluationScore(value=float(score_int))
+ except Exception as exc:
+ ctx.log_error(
+ f"Evaluator `{self.name}` failed, using `0` score fallback result",
+ exception=exc,
+ )
+ evaluation_score = EvaluationScore(
+ value=0,
+ comment="Evaluation failed",
+ )
+ evaluation_meta = {"exception": str(exc)}
return EvaluatorResult(
evaluator=self.name,
score=evaluation_score,
threshold=self.threshold,
+ meta=evaluation_meta,
)
@overload
def evaluator[Value, **Args](
- evaluation: Evaluation[Value, Args] | None = None,
+ definition: EvaluatorDefinition[Value, Args] | None = None,
/,
) -> Evaluator[Value, Args]: ...
@@ -161,29 +231,29 @@ def evaluator[Value, **Args](
name: str | None = None,
threshold: float | None = None,
) -> Callable[
- [Evaluation[Value, Args]],
+ [EvaluatorDefinition[Value, Args]],
Evaluator[Value, Args],
]: ...
def evaluator[Value, **Args](
- evaluation: Evaluation[Value, Args] | None = None,
+ evaluation: EvaluatorDefinition[Value, Args] | None = None,
*,
name: str | None = None,
threshold: float | None = None,
) -> (
Callable[
- [Evaluation[Value, Args]],
+ [EvaluatorDefinition[Value, Args]],
Evaluator[Value, Args],
]
| Evaluator[Value, Args]
):
def wrap(
- evaluation: Evaluation[Value, Args],
+ definition: EvaluatorDefinition[Value, Args],
) -> Evaluator[Value, Args]:
return Evaluator(
- name=name or evaluation.__name__,
- evaluation=evaluation,
+ name=name or definition.__name__,
+ definition=definition,
threshold=threshold,
)
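
The renamed `definition` parameter does not change how evaluators are declared. A minimal sketch of the decorator-based flow under the new interface (the evaluator name, threshold, and length heuristic below are illustrative only, not part of this patch):

from draive.evaluation import EvaluationScore, evaluator


@evaluator(name="length_limit", threshold=0.8)
async def length_limit_evaluator(value: str, /, limit: int = 200) -> EvaluationScore:
    # degrade the score linearly once the evaluated text exceeds the limit
    return EvaluationScore(
        value=min(1.0, limit / max(len(value), 1)),
        comment=f"{len(value)} characters against a limit of {limit}",
    )

Calling `await length_limit_evaluator(text, limit=120)` returns an `EvaluatorResult`; with the change above, an exception raised inside the definition is logged and converted into a zero score, with the exception text recorded in `meta`.
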
diff --git a/src/draive/evaluation/scenario.py b/src/draive/evaluation/scenario.py
index 84ae8b7..2952d78 100644
--- a/src/draive/evaluation/scenario.py
+++ b/src/draive/evaluation/scenario.py
@@ -1,8 +1,10 @@
+from asyncio import gather
from collections.abc import Callable, Sequence
-from typing import Protocol, overload, runtime_checkable
+from typing import Protocol, Self, overload, runtime_checkable
-from draive.evaluation.evaluator import EvaluatorResult
+from draive.evaluation.evaluator import EvaluatorResult, PreparedEvaluator
from draive.parameters import DataModel, Field
+from draive.scope import ctx
from draive.types import frozenlist
from draive.utils import freeze
@@ -11,6 +13,7 @@
"ScenarioEvaluator",
"ScenarioEvaluatorDefinition",
"ScenarioEvaluatorResult",
+ "EvaluationScenarioResult",
]
@@ -21,10 +24,44 @@ class ScenarioEvaluatorResult(DataModel):
evaluations: frozenlist[EvaluatorResult] = Field(
description="Scenario evaluation results",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@property
def passed(self) -> bool:
- return all(case.passed for case in self.evaluations)
+ # empty evaluations is equivalent of failure
+ return len(self.evaluations) > 0 and all(case.passed for case in self.evaluations)
+
+
+class EvaluationScenarioResult(DataModel):
+ @classmethod
+ async def evaluating[Value](
+ cls,
+ value: Value,
+ /,
+ evaluators: PreparedEvaluator[Value],
+ *_evaluators: PreparedEvaluator[Value],
+ meta: dict[str, str | float | int | bool | None] | None = None,
+ ) -> Self:
+ return cls(
+ evaluations=tuple(
+ await gather(
+ *[evaluator(value) for evaluator in [evaluators, *_evaluators]],
+ return_exceptions=False,
+ ),
+ ),
+ meta=meta,
+ )
+
+ evaluations: frozenlist[EvaluatorResult] = Field(
+ description="Scenario evaluation results",
+ )
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@runtime_checkable
@@ -47,7 +84,7 @@ async def __call__(
/,
*args: Args.args,
**kwargs: Args.kwargs,
- ) -> Sequence[EvaluatorResult]: ...
+ ) -> Sequence[EvaluatorResult] | EvaluationScenarioResult: ...
class ScenarioEvaluator[Value, **Args]:
@@ -84,16 +121,35 @@ async def __call__(
*args: Args.args,
**kwargs: Args.kwargs,
) -> ScenarioEvaluatorResult:
- return ScenarioEvaluatorResult(
- name=self.name,
- evaluations=tuple(
- await self._definition(
- value,
- *args,
- **kwargs,
- )
- ),
- )
+ try:
+ match await self._definition(
+ value,
+ *args,
+ **kwargs,
+ ):
+ case EvaluationScenarioResult() as result:
+ return ScenarioEvaluatorResult(
+ name=self.name,
+ evaluations=result.evaluations,
+ meta=result.meta,
+ )
+
+ case [*results]:
+ return ScenarioEvaluatorResult(
+ name=self.name,
+ evaluations=tuple(results),
+ )
+ except Exception as exc:
+ ctx.log_error(
+ f"Scenario evaluator `{self.name}` failed, using empty fallback result",
+ exception=exc,
+ )
+
+ return ScenarioEvaluatorResult(
+ name=self.name,
+ evaluations=(),
+ meta={"exception": str(exc)},
+ )
@overload
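
A sketch of how the new `EvaluationScenarioResult.evaluating` helper can be combined with the `evaluation_scenario` decorator, assuming the decorator accepts a `name` argument analogous to `evaluator`; the reference-binding closures below are illustrative:

from draive.evaluation import (
    EvaluationScenarioResult,
    EvaluatorResult,
    evaluation_scenario,
)
from draive.evaluators import coherence_evaluator, conciseness_evaluator


@evaluation_scenario(name="summary_quality")
async def summary_quality_scenario(value: str, /, reference: str) -> EvaluationScenarioResult:
    # bind the reference so each evaluator satisfies the PreparedEvaluator protocol
    async def coherence(evaluated: str) -> EvaluatorResult:
        return await coherence_evaluator(evaluated, reference=reference)

    async def conciseness(evaluated: str) -> EvaluatorResult:
        return await conciseness_evaluator(evaluated, reference=reference)

    return await EvaluationScenarioResult.evaluating(
        value,
        coherence,
        conciseness,
    )
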
diff --git a/src/draive/evaluation/score.py b/src/draive/evaluation/score.py
index 3082438..3acf94c 100644
--- a/src/draive/evaluation/score.py
+++ b/src/draive/evaluation/score.py
@@ -1,9 +1,6 @@
-from typing import Protocol, runtime_checkable
-
from draive.parameters import DataModel, Field
__all__ = [
- "Evaluation",
"EvaluationScore",
]
@@ -24,17 +21,3 @@ class EvaluationScore(DataModel):
description="Explanation of the score",
default=None,
)
-
-
-@runtime_checkable
-class Evaluation[Value, **Args](Protocol):
- @property
- def __name__(self) -> str: ...
-
- async def __call__(
- self,
- value: Value,
- /,
- *args: Args.args,
- **kwargs: Args.kwargs,
- ) -> EvaluationScore | float | bool: ...
diff --git a/src/draive/evaluation/suite.py b/src/draive/evaluation/suite.py
index bbe592d..9a45f2d 100644
--- a/src/draive/evaluation/suite.py
+++ b/src/draive/evaluation/suite.py
@@ -15,9 +15,10 @@
"evaluation_suite",
"EvaluationCaseResult",
"EvaluationSuite",
- "EvaluationSuiteCaseResult",
"EvaluationSuiteDefinition",
"EvaluationSuiteStorage",
+ "SuiteEvaluatorCaseResult",
+ "SuiteEvaluatorResult",
]
@@ -27,20 +28,33 @@ class EvaluationSuiteCase[CaseParameters: DataModel](DataModel):
comment: str | None = None
-class EvaluationSuiteCaseResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
+class SuiteEvaluatorCaseResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
case: EvaluationSuiteCase[CaseParameters] = Field(
description="Evaluated case",
)
value: Value = Field(
description="Evaluated value",
)
- results: frozenlist[ScenarioEvaluatorResult | EvaluatorResult] = Field(
+ results: frozenlist[ScenarioEvaluatorResult] = Field(
description="Evaluation results",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@property
def passed(self) -> bool:
- return all(result.passed for result in self.results)
+ # empty results is equivalent of failure
+ return len(self.results) > 0 and all(result.passed for result in self.results)
+
+
+class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
+ cases: list[SuiteEvaluatorCaseResult[CaseParameters, Value]]
+
+ @property
+ def passed(self) -> bool:
+ return all(case.passed for case in self.cases)
class EvaluationCaseResult[Value: DataModel | str](DataModel):
@@ -50,10 +64,30 @@ def of(
results: ScenarioEvaluatorResult | EvaluatorResult,
*_results: ScenarioEvaluatorResult | EvaluatorResult,
value: Value,
+ meta: dict[str, str | float | int | bool | None] | None = None,
) -> Self:
+ free_results: list[EvaluatorResult] = []
+ scenario_results: list[ScenarioEvaluatorResult] = []
+ for result in (results, *_results):
+ match result:
+ case ScenarioEvaluatorResult() as scenario_result:
+ scenario_results.append(scenario_result)
+
+ case EvaluatorResult() as evaluator_result:
+ free_results.append(evaluator_result)
+
+ if free_results:
+ scenario_results.append(
+ ScenarioEvaluatorResult(
+ name="EvaluationSuite",
+ evaluations=tuple(free_results),
+ )
+ )
+
return cls(
value=value,
- results=(results, *_results),
+ results=tuple(scenario_results),
+ meta=meta,
)
@classmethod
@@ -63,23 +97,27 @@ async def evaluating(
/,
evaluators: PreparedScenarioEvaluator[Value] | PreparedEvaluator[Value],
*_evaluators: PreparedScenarioEvaluator[Value] | PreparedEvaluator[Value],
+ meta: dict[str, str | float | int | bool | None] | None = None,
) -> Self:
- return cls(
- value=value,
- results=tuple(
- await gather(
- *[evaluator(value) for evaluator in [evaluators, *_evaluators]],
- return_exceptions=False,
- ),
+ return cls.of(
+ *await gather(
+ *[evaluator(value) for evaluator in [evaluators, *_evaluators]],
+ return_exceptions=False,
),
+ value=value,
+ meta=meta,
)
value: Value = Field(
description="Evaluated value",
)
- results: frozenlist[ScenarioEvaluatorResult | EvaluatorResult] = Field(
+ results: frozenlist[ScenarioEvaluatorResult] = Field(
description="Evaluation results",
)
+ meta: dict[str, str | float | int | bool | None] | None = Field(
+ description="Additional evaluation metadata",
+ default=None,
+ )
@runtime_checkable
@@ -124,7 +162,7 @@ async def __call__(
/,
*,
reload: bool = False,
- ) -> EvaluationSuiteCaseResult[CaseParameters, Value]: ...
+ ) -> SuiteEvaluatorCaseResult[CaseParameters, Value]: ...
@overload
async def __call__(
@@ -132,7 +170,7 @@ async def __call__(
/,
*,
reload: bool = False,
- ) -> list[EvaluationSuiteCaseResult[CaseParameters, Value]]: ...
+ ) -> SuiteEvaluatorResult[CaseParameters, Value]: ...
async def __call__(
self,
@@ -141,18 +179,20 @@ async def __call__(
*,
reload: bool = False,
) -> (
- list[EvaluationSuiteCaseResult[CaseParameters, Value]]
- | EvaluationSuiteCaseResult[CaseParameters, Value]
+ SuiteEvaluatorResult[CaseParameters, Value]
+ | SuiteEvaluatorCaseResult[CaseParameters, Value]
):
async with self._lock:
match parameters:
case None:
- return await gather(
- *[
- self._evaluate(case=case)
- for case in (await self._data(reload=reload)).cases
- ],
- return_exceptions=False,
+ return SuiteEvaluatorResult(
+ cases=await gather(
+ *[
+ self._evaluate(case=case)
+ for case in (await self._data(reload=reload)).cases
+ ],
+ return_exceptions=False,
+ )
)
case UUID() as identifier:
@@ -180,10 +220,10 @@ async def _evaluate(
self,
*,
case: EvaluationSuiteCase[CaseParameters],
- ) -> EvaluationSuiteCaseResult[CaseParameters, Value]:
+ ) -> SuiteEvaluatorCaseResult[CaseParameters, Value]:
result: EvaluationCaseResult[Value] = await self._definition(parameters=case.parameters)
- return EvaluationSuiteCaseResult[CaseParameters, Value](
+ return SuiteEvaluatorCaseResult[CaseParameters, Value](
case=case,
value=result.value,
results=result.results,
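
Since a suite invoked without a case identifier now returns a `SuiteEvaluatorResult` instead of a bare list, downstream code reads the per-case results from its `cases` attribute; a small reporting helper as a sketch:

from draive.evaluation import SuiteEvaluatorResult


def summarize_suite(result: SuiteEvaluatorResult) -> str:
    # SuiteEvaluatorResult.passed is True only when every case passed
    failed: int = sum(1 for case in result.cases if not case.passed)
    status: str = "PASSED" if result.passed else "FAILED"
    return f"{status}: {len(result.cases) - failed}/{len(result.cases)} cases passed"
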
diff --git a/src/draive/evaluators/__init__.py b/src/draive/evaluators/__init__.py
index 68d8735..923af8e 100644
--- a/src/draive/evaluators/__init__.py
+++ b/src/draive/evaluators/__init__.py
@@ -1,25 +1,27 @@
-from draive.evaluators.text_coherence import text_coherence_evaluator
-from draive.evaluators.text_conciseness import text_conciseness_evaluator
-from draive.evaluators.text_consistency import text_consistency_evaluator
-from draive.evaluators.text_coverage import text_coverage_evaluator
-from draive.evaluators.text_fluency import text_fluency_evaluator
-from draive.evaluators.text_keywords import text_keywords_evaluator
-from draive.evaluators.text_readability import text_readability_evaluator
-from draive.evaluators.text_relevance import text_relevance_evaluator
-from draive.evaluators.text_similarity import (
- text_similarity_evaluator,
+from draive.evaluators.coherence import coherence_evaluator
+from draive.evaluators.conciseness import conciseness_evaluator
+from draive.evaluators.consistency import consistency_evaluator
+from draive.evaluators.coverage import coverage_evaluator
+from draive.evaluators.fluency import fluency_evaluator
+from draive.evaluators.keywords import keywords_evaluator
+from draive.evaluators.readability import readability_evaluator
+from draive.evaluators.relevance import relevance_evaluator
+from draive.evaluators.similarity import (
+ image_vector_similarity_evaluator,
+ similarity_evaluator,
text_vector_similarity_evaluator,
)
__all__ = [
- "text_coherence_evaluator",
- "text_conciseness_evaluator",
- "text_consistency_evaluator",
- "text_coverage_evaluator",
- "text_fluency_evaluator",
- "text_readability_evaluator",
- "text_relevance_evaluator",
- "text_keywords_evaluator",
- "text_similarity_evaluator",
+ "coherence_evaluator",
+ "conciseness_evaluator",
+ "consistency_evaluator",
+ "coverage_evaluator",
+ "fluency_evaluator",
+ "image_vector_similarity_evaluator",
+ "keywords_evaluator",
+ "readability_evaluator",
+ "relevance_evaluator",
+ "similarity_evaluator",
"text_vector_similarity_evaluator",
]
diff --git a/src/draive/evaluators/coherence.py b/src/draive/evaluators/coherence.py
new file mode 100644
index 0000000..d80a58f
--- /dev/null
+++ b/src/draive/evaluators/coherence.py
@@ -0,0 +1,90 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "coherence_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is coherence - a collective quality of the content.
+We align this dimension with the DUC (Document Understanding Conference) quality question of \
+structure and coherence, whereby the content should be well-structured and well-organized.
+EVALUATED content should not just be a heap of related information, but should build from part
+to part into a coherent body of information about the topic.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a coherence score using value between 0.0 and 4.0 where:
+0.0 is very low coherence - the content is chaotic, lacking logical connections between parts.
+1.0 is low coherence - some connections are visible, but the overall structure is weak.
+2.0 is moderate coherence - the content has a noticeable structure, but with some shortcomings.
+3.0 is good coherence - the content is well-organized with minor imperfections.
+4.0 is excellent coherence - the content is exemplarily structured, with smooth transitions \
+between ideas.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="coherence")
+async def coherence_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
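
Each of the new evaluators is invoked the same way; a usage sketch for the coherence one, assuming text generation is already backed by an LMM provider in the active context (provider setup is omitted here):

import asyncio

from draive.evaluators import coherence_evaluator


async def main() -> None:
    # scores the evaluated content against the reference on the normalized 0..1 scale
    result = await coherence_evaluator(
        "Bees make honey, a natural sweetener with antibacterial properties.",
        reference="Honey is a natural sweetener produced by bees. It has antibacterial properties.",
    )
    print(result.score.value, result.passed)


asyncio.run(main())
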
diff --git a/src/draive/evaluators/conciseness.py b/src/draive/evaluators/conciseness.py
new file mode 100644
index 0000000..7fe01a2
--- /dev/null
+++ b/src/draive/evaluators/conciseness.py
@@ -0,0 +1,88 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "conciseness_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a conciseness metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is conciseness - the extent to which the EVALUATED content is brief and to the \
+point while still covering all key information.
+A concise content avoids unnecessary details and repetition, also avoiding being overly verbose \
+or including irrelevant information.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a conciseness score using value between 0.0 and 4.0 where:
+0.0 is very low conciseness - the content is excessively verbose with much irrelevant information.
+1.0 is low conciseness - the content contains unnecessary details and some irrelevant information.
+2.0 is moderate conciseness - the content is somewhat concise but could be more focused.
+3.0 is good conciseness - the content is mostly concise with minimal unnecessary information.
+4.0 is excellent conciseness - the content is highly concise, containing only essential information.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="conciseness")
+async def conciseness_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/consistency.py b/src/draive/evaluators/consistency.py
new file mode 100644
index 0000000..93fd4b5
--- /dev/null
+++ b/src/draive/evaluators/consistency.py
@@ -0,0 +1,90 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "consistency_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a consistency metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is consistency - a factual alignment between the REFERENCE and the EVALUATED content.
+A factually consistent content contains only elements that are entailed by the REFERENCE content.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a consistency score using value between 0.0 and 4.0 where:
+0.0 is very low consistency - the content contains multiple hallucinated facts \
+or significant misalignments with the reference content.
+1.0 is low consistency - the content has several instances of information not supported by \
+the reference content.
+2.0 is moderate consistency - the content is mostly consistent but contains a few unsupported \
+statements.
+3.0 is good consistency - the content is largely consistent with minor discrepancies.
+4.0 is excellent consistency - the content is fully consistent with the reference content, \
+containing only supported information.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+""" # noqa: E501
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="consistency")
+async def consistency_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/coverage.py b/src/draive/evaluators/coverage.py
new file mode 100644
index 0000000..c4713b4
--- /dev/null
+++ b/src/draive/evaluators/coverage.py
@@ -0,0 +1,88 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "coverage_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a coverage metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is coverage - the extent to which the EVALUATED content includes all \
+the key points from the REFERENCE content.
+EVALUATED content with good coverage includes all the important information from \
+the REFERENCE content without omitting critical points.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a coverage score using value between 0.0 and 4.0 where:
+0.0 is very low coverage - the content misses most key points from the reference content.
+1.0 is low coverage - the content includes some key points but omits several important ones.
+2.0 is moderate coverage - the content covers most key points but misses a few important details.
+3.0 is good coverage - the content includes nearly all key points with minor omissions.
+4.0 is excellent coverage - the content comprehensively covers all key points from the reference content.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+""" # noqa: E501
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="coverage")
+async def coverage_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/fluency.py b/src/draive/evaluators/fluency.py
new file mode 100644
index 0000000..74dac21
--- /dev/null
+++ b/src/draive/evaluators/fluency.py
@@ -0,0 +1,74 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "fluency_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Carefully examine provided CONTENT, then rate it using solely a \
+fluency metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is fluency - the quality of the content in terms of grammar, spelling, \
+punctuation, content choice, and overall structure.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a fluency score using value between 0.0 and 2.0 where:
+0.0 is poor fluency - the content has many errors that make it hard to understand or look unnatural.
+1.0 is fair fluency - the content has some errors that affect the clarity or smoothness, \
+but the main points are still comprehensible.
+2.0 is good fluency - the content has few or no errors and is easy to read and follow.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<CONTENT>",
+    ("content",),
+    "</CONTENT>",
+)
+
+
+@evaluator(name="fluency")
+async def fluency_evaluator(
+ content: Multimodal,
+ /,
+) -> EvaluationScore:
+ if not content:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ content=content,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 2,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/text_keywords.py b/src/draive/evaluators/keywords.py
similarity index 73%
rename from src/draive/evaluators/text_keywords.py
rename to src/draive/evaluators/keywords.py
index 737fa8e..1fa8163 100644
--- a/src/draive/evaluators/text_keywords.py
+++ b/src/draive/evaluators/keywords.py
@@ -1,23 +1,24 @@
from collections.abc import Callable, Sequence
from draive.evaluation import EvaluationScore, evaluator
+from draive.types import Multimodal, MultimodalContent
__all__ = [
- "text_keywords_evaluator",
+ "keywords_evaluator",
]
-@evaluator(name="text_keywords")
-async def text_keywords_evaluator(
- text: str,
+@evaluator(name="keywords")
+async def keywords_evaluator(
+ content: Multimodal,
/,
keywords: Sequence[str],
normalization: Callable[[str], str] | None = None,
) -> EvaluationScore:
- if not text:
+ if not content:
return EvaluationScore(
value=0,
- comment="Input text was empty!",
+ comment="Input was empty!",
)
if not keywords:
@@ -33,7 +34,7 @@ async def text_keywords_evaluator(
else:
text_normalization = _lowercased
- normalized_text: str = text_normalization(text)
+ normalized_text: str = text_normalization(MultimodalContent.of(content).as_string())
return EvaluationScore(
value=len(
[keyword for keyword in keywords if text_normalization(keyword) in normalized_text]
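
The keywords evaluator keeps its purely lexical behaviour but now accepts multimodal input, which it flattens to text before matching; a short usage sketch (the keyword list below is illustrative):

from draive.evaluators import keywords_evaluator


async def contains_required_terms(summary: str) -> bool:
    # no model call involved; matching is case-insensitive unless a custom
    # normalization callable is supplied
    result = await keywords_evaluator(
        summary,
        keywords=["solar", "electricity", "panels"],
    )
    return result.passed
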
diff --git a/src/draive/evaluators/readability.py b/src/draive/evaluators/readability.py
new file mode 100644
index 0000000..3a2b0b7
--- /dev/null
+++ b/src/draive/evaluators/readability.py
@@ -0,0 +1,80 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "readability_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Carefully examine provided CONTENT, then rate it using solely a \
+readability metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is readability - the ease with which a reader can understand the content.
+A readable content uses clear and concise language, is well-structured,
+and avoids complex or convoluted elements.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a readability score using value between 0.0 and 4.0 where:
+0.0 is very low readability - the content is extremely difficult to understand, \
+with complex language and convoluted structure.
+1.0 is low readability - the content is challenging to read, with frequent use of \
+complex sentences, unclear language or irrelevant parts.
+2.0 is moderate readability - the content is somewhat clear but has some areas \
+that are difficult to understand.
+3.0 is good readability - the content is mostly clear and easy to read, with minor instances \
+of complexity.
+4.0 is excellent readability - the content is highly clear, concise, and easy to understand throughout.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+""" # noqa: E501
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<CONTENT>",
+    ("content",),
+    "</CONTENT>",
+)
+
+
+@evaluator(name="readability")
+async def readability_evaluator(
+ content: Multimodal,
+ /,
+) -> EvaluationScore:
+ if not content:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ content=content,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/relevance.py b/src/draive/evaluators/relevance.py
new file mode 100644
index 0000000..ae62475
--- /dev/null
+++ b/src/draive/evaluators/relevance.py
@@ -0,0 +1,91 @@
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.types import Multimodal, MultimodalTemplate
+from draive.utils import xml_tag
+
+__all__ = [
+ "relevance_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a relevance metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is relevance - selection of important parts from the REFERENCE content.
+The EVALUATED content should include only important information from the REFERENCE avoiding \
+redundancies and excess information.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a relevance score using value between 0.0 and 4.0 where:
+0.0 is very low relevance - the content contains mostly irrelevant or redundant information.
+1.0 is low relevance - the content includes some important points but has \
+significant irrelevant parts.
+2.0 is moderate relevance - the content covers most important points but includes \
+some unnecessary information.
+3.0 is good relevance - the content focuses on important information with minor inclusions \
+of less relevant content.
+4.0 is excellent relevance - the content precisely captures only the most important information \
+from the reference.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="relevance")
+async def relevance_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 4,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
diff --git a/src/draive/evaluators/score.py b/src/draive/evaluators/score.py
index 127e498..3cb5d94 100644
--- a/src/draive/evaluators/score.py
+++ b/src/draive/evaluators/score.py
@@ -39,14 +39,14 @@ def _score_validator(
class CommonScoreModel(DataModel):
- score: float = Field(
- description="Decimal score value",
- validator=_score_validator,
- )
comment: str | None = Field(
description="Explanation of the score",
default=None,
)
+ score: float = Field(
+ description="Decimal score value",
+ validator=_score_validator,
+ )
def normalized(
self,
diff --git a/src/draive/evaluators/similarity.py b/src/draive/evaluators/similarity.py
new file mode 100644
index 0000000..6480956
--- /dev/null
+++ b/src/draive/evaluators/similarity.py
@@ -0,0 +1,133 @@
+from base64 import b64decode
+
+from draive.embedding import Embedded, embed_images, embed_texts
+from draive.evaluation import EvaluationScore, evaluator
+from draive.generation import generate_text
+from draive.similarity.score import vector_similarity_score
+from draive.types import (
+ ImageBase64Content,
+ Multimodal,
+ MultimodalTemplate,
+)
+from draive.utils import xml_tag
+
+__all__ = [
+ "similarity_evaluator",
+ "text_vector_similarity_evaluator",
+ "image_vector_similarity_evaluator",
+]
+
+
+INSTRUCTION: str = """\
+Assistant is an evaluator scoring the provided content.
+
+<INSTRUCTION>
+Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \
+the EVALUATED content using solely a similarity metric according to the EVALUATION_CRITERIA.
+Think step by step and provide explanation of the score before the final score.
+Use the explained RATING scale and the requested FORMAT to provide the result.
+</INSTRUCTION>
+
+<EVALUATION_CRITERIA>
+Evaluated metric is similarity - the degree of semantic similarity between the REFERENCE \
+and the EVALUATED content.
+</EVALUATION_CRITERIA>
+
+<RATING>
+Assign a similarity score using value between 0.0 and 2.0 where:
+0.0 is no similarity - the content is completely unrelated in meaning.
+1.0 is moderate similarity - the content shares some common themes or ideas.
+2.0 is high similarity - the content is very close in meaning \
+or conveys the same information.
+</RATING>
+
+<FORMAT>
+The final result containing only the numerical score value HAVE to be put inside a `RESULT` \
+xml tag within the result i.e. `<RESULT>score</RESULT>`.
+</FORMAT>
+"""
+
+
+INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of(
+    "<REFERENCE>",
+    ("reference",),
+    "</REFERENCE>",
+    "<EVALUATED>",
+    ("evaluated",),
+    "</EVALUATED>",
+)
+
+
+@evaluator(name="similarity")
+async def similarity_evaluator(
+ evaluated: Multimodal,
+ /,
+ reference: Multimodal,
+) -> EvaluationScore:
+ if not evaluated:
+ return EvaluationScore(
+ value=0,
+ comment="Input was empty!",
+ )
+
+ if not reference:
+ return EvaluationScore(
+ value=0,
+ comment="Reference was empty!",
+ )
+
+ if result := xml_tag(
+ "RESULT",
+ await generate_text(
+ instruction=INSTRUCTION,
+ input=INPUT_TEMPLATE.format(
+ reference=reference,
+ evaluated=evaluated,
+ ),
+ ),
+ ):
+ return EvaluationScore(
+ value=float(result) / 2,
+ comment=None,
+ )
+
+ else:
+ raise ValueError("Invalid result")
+
+
+@evaluator(name="text_vector_similarity")
+async def text_vector_similarity_evaluator(
+ evaluated: str,
+ /,
+ reference: str,
+) -> float:
+ embedding: list[Embedded[str]] = await embed_texts([reference, evaluated])
+
+ return vector_similarity_score(embedding[0].vector, embedding[1].vector)
+
+
+@evaluator(name="image_vector_similarity")
+async def image_vector_similarity_evaluator(
+ evaluated: ImageBase64Content | bytes,
+ /,
+ reference: ImageBase64Content | bytes,
+) -> float:
+ evaluated_data: bytes
+ match evaluated:
+ case ImageBase64Content() as base64_data:
+ evaluated_data = b64decode(base64_data.image_base64)
+
+ case raw_data:
+ evaluated_data = raw_data
+
+ reference_data: bytes
+ match reference:
+ case ImageBase64Content() as base64_data:
+ reference_data = b64decode(base64_data.image_base64)
+
+ case raw_data:
+ reference_data = raw_data
+
+ embedding: list[Embedded[bytes]] = await embed_images([reference_data, evaluated_data])
+
+ return vector_similarity_score(embedding[0].vector, embedding[1].vector)
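
The similarity module now exposes both an LLM-scored evaluator and embedding-based variants; a sketch combining them, assuming an embedding provider is configured in the active context:

from draive.evaluators import similarity_evaluator, text_vector_similarity_evaluator

# a stricter variant of the LLM-scored evaluator; thresholds must stay within 0..1
strict_similarity = similarity_evaluator.with_threshold(0.5)


async def vector_similarity(generated: str, source: str) -> float:
    # embedding-based comparison; returns the normalized similarity score
    result = await text_vector_similarity_evaluator(generated, reference=source)
    return result.score.value
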
diff --git a/src/draive/evaluators/text_coherence.py b/src/draive/evaluators/text_coherence.py
deleted file mode 100644
index f0d5464..0000000
--- a/src/draive/evaluators/text_coherence.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_coherence_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Coherence metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Coherence (0.0-4.0) - the collective quality of all sentences.
-We align this dimension with the DUC (Document Understanding Conference) quality question of \
-structure and coherence, whereby the text should be well-structured and well-organized.
-The compared text should not just be a heap of related information, but should build from sentence
-to sentence into a coherent body of information about a topic.
-
-Rating Scale:
-0.0: Very low coherence - the text is chaotic, lacking logical connections between sentences.
-1.0: Low coherence - some connections are visible, but the overall structure is weak.
-2.0: Moderate coherence - the text has a noticeable structure, but with some shortcomings.
-3.0: Good coherence - the text is well-organized with minor imperfections.
-4.0: Excellent coherence - the text is exemplarily structured, with smooth transitions \
-between ideas.
-
-Evaluation Steps:
-1. Read the reference text carefully and identify the main topic and key points.
-2. Read the compared text and compare it to the reference text.
-Check if the compared text covers the main topic and key points of the reference text, \
-and if it presents them in a clear and logical order.
-3. Assign a coherence score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-INPUT_TEMPLATE: str = """
-<REFERENCE_TEXT>
-{reference}
-</REFERENCE_TEXT>
-
-<COMPARED_TEXT>
-{compared}
-</COMPARED_TEXT>
-"""
-
-
-@evaluator(name="text_coherence")
-async def text_coherence_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Solar energy is a renewable energy source that is gaining popularity. "
- "Solar panels convert sunlight into electricity. "
- "This technology is environmentally friendly and can reduce electricity "
- "bills. However, installing solar panels requires an initial investment "
- "and is dependent on weather conditions."
- ),
- compared=(
- "Solar panels are on roofs. Energy is important. "
- "The sun shines brightly. Electricity bills can be high. "
- "Technology is developing fast. People like to save money."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular beverage worldwide. "
- "It's made from roasted coffee beans. Caffeine in coffee "
- "can boost energy and alertness. However, excessive consumption may "
- "lead to sleep issues."
- ),
- compared=(
- "Coffee is drunk by many people. It comes from beans that are roasted. "
- "Caffeine makes you feel more awake. "
- "Drinking too much coffee might make it hard to sleep. "
- "Some people add milk or sugar to their coffee."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Honey is a natural sweetener produced by bees. "
- "It has antibacterial properties and is rich in antioxidants. "
- "People use honey in cooking, as a spread, and for medicinal "
- "purposes. However, it's high in calories and should be consumed "
- "in moderation."
- ),
- compared=(
- "Bees create honey, a natural sweetener with multiple benefits. "
- "Its antibacterial and antioxidant-rich composition makes it valuable "
- "for culinary, nutritional, and medicinal uses. While versatile, "
- "honey's high caloric content necessitates mindful consumption."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_conciseness.py b/src/draive/evaluators/text_conciseness.py
deleted file mode 100644
index 0fc911e..0000000
--- a/src/draive/evaluators/text_conciseness.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_conciseness_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Conciseness metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Conciseness (0.0-4.0) - the extent to which the compared text is brief and to the point \
-while still covering all key information.
-A concise compared text avoids unnecessary details and repetition.
-Annotators should penalize compared texts that are overly verbose or include irrelevant information.
-
-Rating Scale:
-0.0: Very low conciseness - the text is excessively verbose with much irrelevant information.
-1.0: Low conciseness - the text contains unnecessary details and some irrelevant information.
-2.0: Moderate conciseness - the text is somewhat concise but could be more focused.
-3.0: Good conciseness - the text is mostly concise with minimal unnecessary information.
-4.0: Excellent conciseness - the text is highly concise, containing only essential information.
-
-Evaluation Steps:
-1. Read the derived text and the reference text carefully.
-2. Compare the compared text to the reference text and identify the main \
-points of the reference text.
-3. Assess how well the compared text covers the main points of the reference text, \
-and how much irrelevant or redundant information it contains.
-4. Assign a conciseness score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-<REFERENCE_TEXT>
-{reference}
-</REFERENCE_TEXT>
-
-<COMPARED_TEXT>
-{compared}
-</COMPARED_TEXT>
-"""
-
-
-@evaluator(name="text_conciseness")
-async def text_conciseness_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Solar energy is a renewable energy source that is gaining popularity. "
- "Solar panels convert sunlight into electricity. "
- "This technology is environmentally friendly and can reduce electricity "
- "bills. However,installing solar panels requires an initial investment and "
- "is dependent on weather conditions."
- ),
- compared=(
- "Did you know that solar energy is becoming super popular these days? "
- "It's this amazing, eco-friendly way to make electricity using "
- "the sun's rays. People are getting really excited about it! Basically, "
- "you put these special panels on your roof, and they soak up the sunlight "
- "like a sponge. Then, through some pretty cool science stuff, "
- "they turn that sunlight into electricity you can use in your house. "
- "It's pretty neat, right? And get this - it can actually help you save "
- "money on your electricity bills in the long run. But here's the thing: "
- "you've got to shell out some cash upfront to get those panels installed. "
- "It's kind of like buying a fancy coffee machine - costs a bit at first, "
- "but then you save on all those coffee shop visits."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular beverage worldwide. "
- "It's made from roasted coffee beans. Caffeine in coffee "
- "can boost energy and alertness. However, excessive consumption may "
- "lead to sleep issues."
- ),
- compared=(
- "Coffee is a widely consumed beverage made from roasted coffee beans. "
- "It contains caffeine, which can enhance energy and alertness. However, "
- "drinking too much coffee may cause sleep problems. "
- "People enjoy coffee for its taste and stimulating effects, but it's "
- "important to consume it in moderation."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "The water cycle, also known as the hydrologic cycle, "
- "describes the continuous movement of water within the Earth and "
- "atmosphere. It involves processes such as evaporation, condensation, "
- "precipitation, and runoff."
- ),
- compared=(
- "The water cycle is the continuous movement of water on Earth. "
- "It includes evaporation, condensation, precipitation, and runoff."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_consistency.py b/src/draive/evaluators/text_consistency.py
deleted file mode 100644
index 52582c7..0000000
--- a/src/draive/evaluators/text_consistency.py
+++ /dev/null
@@ -1,133 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_consistency_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Consistency metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Consistency(0.0-4.0) - the factual alignment between the reference text and the compared text.
-A factually consistent compared text contains only statements that are entailed \
-by the reference text.
-Annotators should penalize compared texts that contain hallucinated facts.
-
-Rating Scale:
-0.0: Very low consistency - the text contains multiple hallucinated facts \
-or significant misalignments with the reference text.
-1.0: Low consistency - the text has several instances of information not supported by \
-the reference text.
-2.0: Moderate consistency - the text is mostly consistent but contains a few unsupported statements.
-3.0: Good consistency - the text is largely consistent with minor discrepancies.
-4.0: Excellent consistency - the text is fully consistent with the reference text, \
-containing only supported information.
-
-Evaluation Steps:
-1. Read the compared text and the reference text carefully.
-2. Compare the compared text to the reference text and identify the main points \
-of the reference text.
-3. Assess how well the compared text covers the main points of the reference text \
-and how much irrelevant or redundant information it contains.
-4. Assign a consistency score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_consistency")
-async def text_consistency_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Dolphins are intelligent marine mammals. They use echolocation "
- "to navigate and hunt. Dolphins live in social groups called pods."
- ),
- compared=(
- "Dolphins are smart fish that can fly short distances. They use sonar "
- "to talk to whales. Dolphins live in families and go to school "
- "to learn hunting techniques."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular beverage worldwide. "
- "It's made from roasted coffee beans. Caffeine in coffee "
- "can boost energy and alertness. However, excessive consumption may "
- "lead to sleep issues."
- ),
- compared=(
- "Coffee is a widely consumed drink around the world. It's produced "
- "by roasting coffee beans. The caffeine in coffee can increase energy "
- "levels and improve alertness. However, drinking too much coffee might "
- "cause sleep problems. Coffee is also known to improve memory and reduce "
- "the risk of certain diseases."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Photosynthesis is the process by which plants use sunlight to "
- "produce energy. It requires water, carbon dioxide, and chlorophyll. "
- "Oxygen is released as a byproduct of photosynthesis."
- ),
- compared=(
- "Plants carry out photosynthesis to create energy from sunlight. "
- "This process needs water, carbon dioxide, and the green pigment "
- "chlorophyll. As plants photosynthesize, "
- "they release oxygen into the environment."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_coverage.py b/src/draive/evaluators/text_coverage.py
deleted file mode 100644
index 7de871d..0000000
--- a/src/draive/evaluators/text_coverage.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_coverage_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Coverage metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Coverage (0.0-4.0) - the extent to which the compared text includes all \
-the key points from the reference text.
-A compared text with good coverage includes all the important information from \
-the reference text without omitting critical points.
-Annotators should penalize compared texts that miss significant content.
-
-Rating Scale:
-0.0: Very low coverage - the text misses most key points from the reference text.
-1.0: Low coverage - the text includes some key points but omits several important ones.
-2.0: Moderate coverage - the text covers most key points but misses a few important details.
-3.0: Good coverage - the text includes nearly all key points with minor omissions.
-4.0: Excellent coverage - the text comprehensively covers all key points from the reference text.
-
-Evaluation Steps:
-1. Read the reference text carefully and identify all key points and important information.
-2. Read the compared text and compare it to the reference text. \
-Check if the compared text includes all the key points and important information \
-from the reference text.
-3. Assess how well the compared text covers the reference text, \
-and if any critical points are missing.
-4. Assign a coverage score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_coverage")
-async def text_coverage_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Smartphones are versatile devices. They can make calls, send messages, "
- "access the internet, take photos, and run various apps. "
- "Many people use smartphones for work and entertainment. "
- "However, excessive use can lead to addiction and sleep problems."
- ),
- compared=(
- "Smartphones can make calls and send messages. They are popular devices."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Recycling helps protect the environment. It reduces waste in landfills, "
- "conserves natural resources, and saves energy. Common recyclable items "
- "include paper, plastic, glass, and metal. Many cities have recycling "
- "programs, but individual participation is crucial for success."
- ),
- compared=(
- "Recycling is good for the environment. "
- "It reduces waste and saves resources. "
- "People can recycle things like paper and plastic. "
- "Many cities have recycling programs."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Regular exercise is important for health. It strengthens the heart, "
- "builds muscle, and improves flexibility. Exercise can also reduce stress "
- "and boost mood. Experts recommend at least 30 minutes of moderate "
- "activity most days of the week. Walking, swimming, and cycling are "
- "good options for many people."
- ),
- compared=(
- "Regular exercise is crucial for maintaining good health. "
- "It has many benefits, including strengthening the heart, "
- "building muscle, and enhancing flexibility. Exercise also has "
- "mental health benefits, such as reducing stress and improving mood. "
- "Health experts advise doing at least 30 minutes of moderate exercise "
- "on most days. Some popular and accessible forms of exercise "
- "include walking, swimming, and cycling."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
-
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_fluency.py b/src/draive/evaluators/text_fluency.py
deleted file mode 100644
index f184e3f..0000000
--- a/src/draive/evaluators/text_fluency.py
+++ /dev/null
@@ -1,89 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_fluency_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a text. Your task is to rate this text using only the Fluency metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Fluency (0.0-2.0) - the quality of the text in terms of grammar, spelling, punctuation, \
-word choice, and sentence structure.
-
-Rating Scale:
-0.0: Poor - the text has many errors that make it hard to understand or sound unnatural.
-1.0: Fair - the text has some errors that affect the clarity or smoothness of the text, \
-but the main points are still comprehensible.
-2.0: Good - the text has few or no errors and is easy to read and follow.
-
-Evaluation Steps:
-1. Read the text and evaluate its fluency based on the given criteria.
-2. Assign a fluency score from 0.0 to 2.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{text}
-
-"""
-
-
-@evaluator(name="text_fluency")
-async def text_fluency_evaluator(
- text: str,
- /,
-) -> EvaluationScore:
- if not text:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(text=text),
- examples=[
- (
- INPUT_TEMPLATE.format(
- text=(
- "The cat sitted on mat. It were very comfrotable. "
- "The sun shine bright in sky."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "The movie was good, but I didn't liked the ending. "
- "It left me feeling confuse and unsatisfied."
- ),
- ),
- CommonScoreModel(score=1.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "The concert last night was amazing. "
- "The band played all their hit songs, and the crowd was energetic "
- "throughout the performance."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- ],
- )
-
- return score.normalized(divider=2)
diff --git a/src/draive/evaluators/text_readability.py b/src/draive/evaluators/text_readability.py
deleted file mode 100644
index 4edccc0..0000000
--- a/src/draive/evaluators/text_readability.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_readability_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a text. Your task is to rate this text using only the Readability metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Readability (0.0-4.0) - the ease with which a reader can understand the text.
-A readable text uses clear and concise language, is well-structured,
-and avoids complex or convoluted sentences. Annotators should penalize texts that \
-are difficult to read or understand.
-
-Rating Scale:
-0.0: Very low readability - the text is extremely difficult to understand, \
-with complex language and convoluted structure.
-1.0: Low readability - the text is challenging to read, with frequent use of \
-complex sentences or unclear language.
-2.0: Moderate readability - the text is somewhat clear but has some areas \
-that are difficult to understand.
-3.0: Good readability - the text is mostly clear and easy to read, with minor instances \
-of complexity.
-4.0: Excellent readability - the text is highly clear, concise, and easy to understand throughout.
-
-Evaluation Steps:
-1. Read the text carefully and evaluate how easy it is to read and understand.
-2. Consider the language used in the text, including clarity, simplicity, and sentence structure.
-3. Assess whether the text is well-structured and free from complex or convoluted sentences.
-4. Assign a readability score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{text}
-
-"""
-
-
-@evaluator(name="text_readability")
-async def text_readability_evaluator(
- text: str,
- /,
-) -> EvaluationScore:
- if not text:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(text=text),
- examples=[
- (
- INPUT_TEMPLATE.format(
- text=(
- "The canine species, frequently domesticated for companionship purposes, "
- "exhibit characteristics of fidelity and ludic propensities that engender "
- "their widespread appeal among human populations as domestic "
- "animal companions."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "Pizza, a widely consumed dish, consists of a circular bread foundation "
- "adorned with various ingredients. Typically, it includes a layer of "
- "tomato-based sauce and cheese, though additional toppings may be "
- "incorporated to suit individual preferences."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- text=(
- "Exercise is good for health. It helps maintain fitness and reduces stress."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
-
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_relevance.py b/src/draive/evaluators/text_relevance.py
deleted file mode 100644
index 26fa6fd..0000000
--- a/src/draive/evaluators/text_relevance.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-
-__all__ = [
- "text_relevance_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given a reference text and a compared text based on the reference text.
-Your task is to rate the compared text using only the Relevance metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Relevance (0.0-4.0) - selection of important content from the reference text.
-The compared text should include only important information from the reference text.
-Annotators should penalize compared texts that contain redundancies and excess information.
-
-Rating Scale:
-0.0: Very low relevance - the text contains mostly irrelevant or redundant information.
-1.0: Low relevance - the text includes some important points but has \
-significant irrelevant content.
-2.0: Moderate relevance - the text covers most important points but includes \
-some unnecessary information.
-3.0: Good relevance - the text focuses on important information with minor inclusions \
-of less relevant content.
-4.0: Excellent relevance - the text precisely captures only the most important information \
-from the reference text.
-
-Evaluation Steps:
-1. Read the compared text and the reference text carefully.
-2. Compare the compared text to the reference text and identify \
-the main points of the reference text.
-3. Assess how well the compared text covers the main points of the reference text, \
-and note any irrelevant or redundant information it contains.
-4. Assign a relevance score from 0.0 to 4.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_relevance")
-async def text_relevance_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "The sun is the star at the center of our solar system. "
- "It provides light and heat to Earth."
- ),
- compared=(
- "Stars twinkle in the night sky. Some people believe in astrology. "
- "The moon orbits the Earth. Astronauts have been to space. "
- "Solar panels use energy from the sun."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Elephants are the largest land animals. They have long trunks and tusks. "
- "Elephants live in herds and are known for their intelligence."
- ),
- compared=(
- "Elephants are very big animals. They use their trunks to grab food "
- "and water. Elephants live together in groups. They're smart and have "
- "good memories. Some people ride elephants in zoos, "
- "but this can be harmful to the animals."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Bicycles are a popular mode of transportation. They are eco-friendly "
- "and provide exercise. However, cyclists need to follow "
- "traffic rules for safety."
- ),
- compared=(
- "Bicycles are widely used for travel. "
- "They don't pollute and help people stay fit. "
- "Cyclists must obey traffic laws to stay safe."
- ),
- ),
- CommonScoreModel(score=4.0),
- ),
- ],
- )
-
- return score.normalized(divider=4)
diff --git a/src/draive/evaluators/text_similarity.py b/src/draive/evaluators/text_similarity.py
deleted file mode 100644
index b329ff8..0000000
--- a/src/draive/evaluators/text_similarity.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from draive.embedding import Embedded, embed_texts
-from draive.evaluation import EvaluationScore, evaluator
-from draive.evaluators.score import CommonScoreModel
-from draive.generation import generate_model
-from draive.similarity.score import vector_similarity_score
-
-__all__ = [
- "text_similarity_evaluator",
- "text_vector_similarity_evaluator",
-]
-
-
-INSTRUCTION: str = """\
-You will be given two texts: a reference text and a compared text. \
-Your task is to rate the compared text using only the Similarity metric, \
-which is described in the Evaluation Criteria.
-Please make sure you read and understand these instructions very carefully.
-Keep this document open while reviewing, and refer to it as needed.
-
-Evaluation Criteria:
-Similarity (0.0-2.0) - the degree of semantic similarity between the reference text \
-and the compared text.
-
-Rating Scale:
-0.0: No similarity - the reference text and compared text are completely unrelated in meaning.
-1.0: Moderate similarity - the reference text and compared text share some common themes or ideas.
-2.0: High similarity - the reference text and compared text are very close in meaning \
-or convey the same information.
-
-Evaluation Steps:
-1. Read both the reference text and the compared text carefully.
-2. Compare the semantic meaning of the reference text and the compared text.
-3. Assign a similarity score from 0.0 to 2.0 based on the provided criteria.
-
-Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \
-do not exceed this value.
-"""
-
-
-INPUT_TEMPLATE: str = """
-
-{reference}
-
-
-
-{compared}
-
-"""
-
-
-@evaluator(name="text_similarity")
-async def text_similarity_evaluator(
- compared: str,
- /,
- reference: str,
-) -> EvaluationScore:
- if not compared:
- return EvaluationScore(
- value=0,
- comment="Input text was empty!",
- )
-
- if not reference:
- return EvaluationScore(
- value=0,
- comment="Reference text was empty!",
- )
-
- score: CommonScoreModel = await generate_model(
- CommonScoreModel,
- instruction=INSTRUCTION,
- input=INPUT_TEMPLATE.format(
- reference=reference,
- compared=compared,
- ),
- examples=[
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Cats are popular pets. They are independent and like to groom themselves."
- ),
- compared=(
- "Bananas are a healthy fruit. They are rich in potassium and easy to peel."
- ),
- ),
- CommonScoreModel(score=0.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "The beach is a great place for relaxation. "
- "People enjoy swimming and sunbathing."
- ),
- compared=(
- "Many people like to spend time outdoors. "
- "Parks are popular for picnics and walking."
- ),
- ),
- CommonScoreModel(score=1.0),
- ),
- (
- INPUT_TEMPLATE.format(
- reference=(
- "Coffee is a popular morning drink. It contains caffeine which helps "
- "people feel more alert."
- ),
- compared=(
- "Many people start their day with coffee. "
- "The caffeine in coffee can increase alertness and energy."
- ),
- ),
- CommonScoreModel(score=2.0),
- ),
- ],
- )
-
- return score.normalized(divider=2)
-
-
-@evaluator(name="text_vector_similarity")
-async def text_vector_similarity_evaluator(
- compared: str,
- /,
- reference: str,
-) -> float:
- embedding: list[Embedded[str]] = await embed_texts([reference, compared])
-
- return vector_similarity_score(embedding[0].vector, embedding[1].vector)
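
The removed `text_vector_similarity_evaluator` compared embedding vectors directly. As a rough sketch, `vector_similarity_score` presumably computes something like cosine similarity; the snippet below is illustrative only and may differ from draive's actual implementation:

```python
import math


def cosine_similarity(a: list[float], b: list[float]) -> float:
    # Cosine of the angle between two embedding vectors, in [-1.0, 1.0].
    dot = sum(x * y for x, y in zip(a, b, strict=True))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)
```
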
diff --git a/src/draive/generation/model/lmm.py b/src/draive/generation/model/lmm.py
index c529c7d..9bf8bbd 100644
--- a/src/draive/generation/model/lmm.py
+++ b/src/draive/generation/model/lmm.py
@@ -145,9 +145,11 @@ async def lmm_generate_model[Generated: DataModel]( # noqa: PLR0913, C901, PLR0
DEFAULT_INSTRUCTION_EXTENSION: str = """\
-The result have to be a JSON conforming to the following schema:
+
+The result has to be a JSON object conforming to the following schema:
```
{schema}
```
Provide ONLY a single, raw, valid JSON without any comments, formatting or additional elements.
+
"""
diff --git a/src/draive/types/__init__.py b/src/draive/types/__init__.py
index 87b6588..a13f7b9 100644
--- a/src/draive/types/__init__.py
+++ b/src/draive/types/__init__.py
@@ -21,6 +21,8 @@
MultimodalContent,
MultimodalContentConvertible,
MultimodalContentElement,
+ MultimodalContentPlaceholder,
+ MultimodalTemplate,
)
from draive.types.text import TextContent
from draive.types.video import VideoBase64Content, VideoContent, VideoURLContent
@@ -46,10 +48,12 @@
"LMMToolRequests",
"LMMToolResponse",
"Memory",
+ "Multimodal",
"MultimodalContent",
"MultimodalContentConvertible",
"MultimodalContentElement",
- "Multimodal",
+ "MultimodalContentPlaceholder",
+ "MultimodalTemplate",
"RateLimitError",
"TextContent",
"VideoBase64Content",
diff --git a/src/draive/types/multimodal.py b/src/draive/types/multimodal.py
index 91a607c..e4da41c 100644
--- a/src/draive/types/multimodal.py
+++ b/src/draive/types/multimodal.py
@@ -10,12 +10,15 @@
from draive.types.video import VideoBase64Content, VideoContent, VideoURLContent
__all__ = [
+ "Multimodal",
"MultimodalContent",
- "MultimodalContentElement",
"MultimodalContentConvertible",
- "Multimodal",
+ "MultimodalContentElement",
+ "MultimodalContentPlaceholder",
+ "MultimodalTemplate",
]
+
MultimodalContentElement = TextContent | ImageContent | AudioContent | VideoContent | DataModel
MultimodalContentConvertible = str | MultimodalContentElement
@@ -165,10 +168,70 @@ def extending(
def __bool__(self) -> bool:
return bool(self.parts) and any(self.parts)
+ def __str__(self) -> str:
+ return self.as_string()
+
Multimodal = MultimodalContent | MultimodalContentConvertible
+class MultimodalContentPlaceholder(DataModel):
+ identifier: str
+
+
+class MultimodalTemplate(DataModel):
+ @classmethod
+ def of(
+ cls,
+ *elements: Multimodal | MultimodalContentPlaceholder | tuple[str],
+ merge_text: bool = True,
+ skip_empty: bool = True,
+ meta: dict[str, str | float | int | bool | None] | None = None,
+ ) -> Self:
+ return cls(
+ parts=tuple(
+ [
+ MultimodalContentPlaceholder(identifier=element[0])
+ if isinstance(element, tuple)
+ else element
+ for element in elements
+ ]
+ ),
+ merge_text=merge_text,
+ skip_empty=skip_empty,
+ meta=meta,
+ )
+
+ parts: frozenlist[Multimodal | MultimodalContentPlaceholder]
+ merge_text: bool
+ skip_empty: bool
+ meta: dict[str, str | float | int | bool | None] | None
+
+ def format(
+ self,
+ **variables: Multimodal,
+ ) -> MultimodalContent:
+ parts: list[Multimodal] = []
+ for part in self.parts:
+ match part:
+ case MultimodalContentPlaceholder() as placeholder:
+                    if (value := variables.get(placeholder.identifier)) is not None:
+ parts.append(value)
+
+ else:
+                        raise ValueError(f"Missing format variable '{placeholder.identifier}'")
+
+ case part:
+ parts.append(part)
+
+ return MultimodalContent.of(
+ *parts,
+ merge_text=self.merge_text,
+ skip_empty=self.skip_empty,
+ meta=self.meta,
+ )
+
+
def _extract_parts( # noqa: PLR0911
element: Multimodal,
/,
@@ -315,12 +378,18 @@ def _merge_texts(
last_text_element: TextContent | None = None
while element := next(iterator, None):
match element:
- case TextContent() as text: # do not merge texts with different metadata
- if (last_text := last_text_element) and last_text.meta == text.meta:
- last_text_element = TextContent(
- text=last_text.text + text.text,
- meta=text.meta,
- )
+ case TextContent() as text:
+ # do not merge texts with different metadata
+ if last_text := last_text_element:
+ if last_text.meta == text.meta:
+ last_text_element = TextContent(
+ text=last_text.text + text.text,
+ meta=text.meta,
+ )
+
+ else:
+ result.append(last_text)
+ last_text_element = text
else:
last_text_element = text
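
Finally, a minimal usage sketch of the new `MultimodalTemplate`, based only on the code added above; the placeholder names and the top-level `draive` imports are assumptions:

```python
from draive import MultimodalContent, MultimodalTemplate  # assumed re-exports

# 1-tuples become MultimodalContentPlaceholder entries; plain strings stay as-is.
template = MultimodalTemplate.of(
    "Reference:\n",
    ("reference",),
    "\n\nCompared:\n",
    ("compared",),
)

content: MultimodalContent = template.format(
    reference="Coffee is a popular beverage worldwide.",
    compared="Coffee is widely consumed around the world.",
)
print(content)  # __str__ now delegates to as_string()
```
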