diff --git a/Makefile b/Makefile index c9310d3..9890709 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ ifndef INSTALL_OPTIONS endif ifndef UV_VERSION - UV_VERSION := 0.2.25 + UV_VERSION := 0.2.37 endif .PHONY: install venv sync lock update format lint test release diff --git a/constraints b/constraints index c0b444c..658ae02 100644 --- a/constraints +++ b/constraints @@ -57,7 +57,9 @@ idna==3.7 iniconfig==2.0.0 # via pytest jiter==0.5.0 - # via anthropic + # via + # anthropic + # openai loguru==0.7.2 # via fastembed markdown-it-py==3.0.0 @@ -80,9 +82,9 @@ numpy==1.26.4 # onnxruntime onnx==1.16.2 # via fastembed -onnxruntime==1.18.1 +onnxruntime==1.19.0 # via fastembed -openai==1.38.0 +openai==1.41.0 # via draive (pyproject.toml) packaging==24.1 # via @@ -107,7 +109,7 @@ pydantic-core==2.20.1 # via pydantic pygments==2.18.0 # via rich -pyright==1.1.374 +pyright==1.1.375 # via draive (pyproject.toml) pystemmer==2.2.0.1 # via fastembed @@ -120,7 +122,7 @@ pytest-asyncio==0.23.8 # via draive (pyproject.toml) pytest-cov==4.1.0 # via draive (pyproject.toml) -pyyaml==6.0.1 +pyyaml==6.0.2 # via # bandit # huggingface-hub @@ -133,7 +135,7 @@ requests==2.32.3 # tiktoken rich==13.7.1 # via bandit -ruff==0.5.6 +ruff==0.5.7 # via draive (pyproject.toml) sentencepiece==0.2.0 # via draive (pyproject.toml) @@ -147,11 +149,11 @@ snowballstemmer==2.2.0 # via fastembed stevedore==5.2.0 # via bandit -sympy==1.13.1 +sympy==1.13.2 # via onnxruntime tiktoken==0.7.0 # via draive (pyproject.toml) -tokenizers==0.19.1 +tokenizers==0.20.0 # via # anthropic # fastembed diff --git a/pyproject.toml b/pyproject.toml index 3f50e73..99d3772 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta" [project] name = "draive" description = "Framework designed to simplify and accelerate the development of LLM-based applications." 
-version = "0.25.0" +version = "0.26.0" readme = "README.md" maintainers = [ { name = "Kacper KaliƄski", email = "kacper.kalinski@miquido.com" }, diff --git a/src/draive/__init__.py b/src/draive/__init__.py index 585060a..5140164 100644 --- a/src/draive/__init__.py +++ b/src/draive/__init__.py @@ -148,6 +148,8 @@ MultimodalContent, MultimodalContentConvertible, MultimodalContentElement, + MultimodalContentPlaceholder, + MultimodalTemplate, RateLimitError, TextContent, VideoBase64Content, @@ -284,10 +286,12 @@ "ModelGeneration", "ModelGenerator", "ModelGeneratorDecoder", + "Multimodal", "MultimodalContent", "MultimodalContentConvertible", "MultimodalContentElement", - "Multimodal", + "MultimodalContentPlaceholder", + "MultimodalTemplate", "noop", "not_missing", "ParameterDefaultFactory", diff --git a/src/draive/evaluation/__init__.py b/src/draive/evaluation/__init__.py index 87f2a4b..b8e25e9 100644 --- a/src/draive/evaluation/__init__.py +++ b/src/draive/evaluation/__init__.py @@ -1,36 +1,39 @@ from draive.evaluation.evaluator import ( Evaluator, + EvaluatorDefinition, EvaluatorResult, PreparedEvaluator, evaluator, ) from draive.evaluation.scenario import ( + EvaluationScenarioResult, PreparedScenarioEvaluator, ScenarioEvaluator, ScenarioEvaluatorDefinition, ScenarioEvaluatorResult, evaluation_scenario, ) -from draive.evaluation.score import Evaluation, EvaluationScore +from draive.evaluation.score import EvaluationScore from draive.evaluation.suite import ( EvaluationCaseResult, EvaluationSuite, EvaluationSuiteCase, - EvaluationSuiteCaseResult, EvaluationSuiteDefinition, EvaluationSuiteStorage, + SuiteEvaluatorCaseResult, + SuiteEvaluatorResult, evaluation_suite, ) __all__ = [ "evaluation_scenario", "evaluation_suite", - "Evaluation", + "EvaluatorDefinition", "EvaluationCaseResult", + "EvaluationScenarioResult", "EvaluationScore", "EvaluationSuite", "EvaluationSuiteCase", - "EvaluationSuiteCaseResult", "EvaluationSuiteDefinition", "EvaluationSuiteStorage", "evaluator", @@ -41,4 +44,6 @@ "ScenarioEvaluator", "ScenarioEvaluatorDefinition", "ScenarioEvaluatorResult", + "SuiteEvaluatorCaseResult", + "SuiteEvaluatorResult", ] diff --git a/src/draive/evaluation/evaluator.py b/src/draive/evaluation/evaluator.py index 11ab5e2..c17be7b 100644 --- a/src/draive/evaluation/evaluator.py +++ b/src/draive/evaluation/evaluator.py @@ -1,8 +1,9 @@ from collections.abc import Callable from typing import Protocol, Self, cast, final, overload, runtime_checkable -from draive.evaluation.score import Evaluation, EvaluationScore +from draive.evaluation.score import EvaluationScore from draive.parameters import DataModel, Field, ParameterPath +from draive.scope import ctx from draive.utils import freeze __all__ = [ @@ -10,6 +11,7 @@ "Evaluator", "EvaluatorResult", "PreparedEvaluator", + "EvaluatorDefinition", ] @@ -23,12 +25,63 @@ class EvaluatorResult(DataModel): threshold: float = Field( description="Score threshold required to pass evaluation", ) + meta: dict[str, str | float | int | bool | None] | None = Field( + description="Additional evaluation metadata", + default=None, + ) @property def passed(self) -> bool: return self.score.value >= self.threshold +class EvaluationResult(DataModel): + @classmethod + async def of( + cls, + score: EvaluationScore | float | bool, + /, + meta: dict[str, str | float | int | bool | None] | None = None, + ) -> Self: + evaluation_score: EvaluationScore + match score: + case EvaluationScore() as score: + evaluation_score = score + + case float() as value: + evaluation_score = 
EvaluationScore(value=value) + + case passed: + evaluation_score = EvaluationScore(value=1.0 if passed else 0.0) + + return cls( + score=evaluation_score, + meta=meta, + ) + + score: EvaluationScore = Field( + description="Evaluation score", + ) + meta: dict[str, str | float | int | bool | None] | None = Field( + description="Additional evaluation metadata", + default=None, + ) + + +@runtime_checkable +class EvaluatorDefinition[Value, **Args](Protocol): + @property + def __name__(self) -> str: ... + + async def __call__( + self, + value: Value, + /, + *args: Args.args, + **kwargs: Args.kwargs, + ) -> EvaluationResult | EvaluationScore | float | bool: ... + + @runtime_checkable class PreparedEvaluator[Value](Protocol): async def __call__( @@ -43,14 +96,14 @@ class Evaluator[Value, **Args]: def __init__( self, name: str, - evaluation: Evaluation[Value, Args], + definition: EvaluatorDefinition[Value, Args], threshold: float | None, ) -> None: assert ( # nosec: B101 threshold is None or 0 <= threshold <= 1 ), "Evaluation threshold has to be between 0 and 1" - self._evaluation: Evaluation[Value, Args] = evaluation + self._definition: EvaluatorDefinition[Value, Args] = definition self.name: str = name self.threshold: float = threshold or 1 @@ -62,7 +115,7 @@ def with_threshold( ) -> Self: return self.__class__( name=self.name, - evaluation=self._evaluation, + definition=self._definition, threshold=threshold, ) @@ -102,8 +155,8 @@ async def evaluation( value: Mapped, *args: Args.args, **kwargs: Args.kwargs, - ) -> EvaluationScore | float | bool: - return await self._evaluation( + ) -> EvaluationResult | EvaluationScore | float | bool: + return await self._definition( mapper(value), *args, **kwargs, @@ -111,7 +164,7 @@ async def evaluation( return Evaluator[Mapped, Args]( name=self.name, - evaluation=evaluation, + definition=evaluation, threshold=self.threshold, ) @@ -123,34 +176,51 @@ async def __call__( **kwargs: Args.kwargs, ) -> EvaluatorResult: evaluation_score: EvaluationScore - match await self._evaluation( - value, - *args, - **kwargs, - ): - case float() as score_value: - evaluation_score = EvaluationScore(value=score_value) + evaluation_meta: dict[str, str | float | int | bool | None] | None + try: + match await self._definition( + value, + *args, + **kwargs, + ): + case EvaluationResult() as result: + evaluation_score = result.score + evaluation_meta = result.meta - case bool() as score_bool: - evaluation_score = EvaluationScore(value=1 if score_bool else 0) + case EvaluationScore() as score: + evaluation_score = score + evaluation_meta = None - case EvaluationScore() as score: - evaluation_score = score + case float() as score_value: + evaluation_score = EvaluationScore(value=score_value) + evaluation_meta = None + + case passed: + evaluation_score = EvaluationScore(value=1 if passed else 0) + evaluation_meta = None - # for whatever reason pyright wants int to be handled... 
- case int() as score_int: - evaluation_score = EvaluationScore(value=float(score_int)) + except Exception as exc: + ctx.log_error( + f"Evaluator `{self.name}` failed, using `0` score fallback result", + exception=exc, + ) + evaluation_score = EvaluationScore( + value=0, + comment="Evaluation failed", + ) + evaluation_meta = {"exception": str(exc)} return EvaluatorResult( evaluator=self.name, score=evaluation_score, threshold=self.threshold, + meta=evaluation_meta, ) @overload def evaluator[Value, **Args]( - evaluation: Evaluation[Value, Args] | None = None, + definition: EvaluatorDefinition[Value, Args] | None = None, /, ) -> Evaluator[Value, Args]: ... @@ -161,29 +231,29 @@ def evaluator[Value, **Args]( name: str | None = None, threshold: float | None = None, ) -> Callable[ - [Evaluation[Value, Args]], + [EvaluatorDefinition[Value, Args]], Evaluator[Value, Args], ]: ... def evaluator[Value, **Args]( - evaluation: Evaluation[Value, Args] | None = None, + evaluation: EvaluatorDefinition[Value, Args] | None = None, *, name: str | None = None, threshold: float | None = None, ) -> ( Callable[ - [Evaluation[Value, Args]], + [EvaluatorDefinition[Value, Args]], Evaluator[Value, Args], ] | Evaluator[Value, Args] ): def wrap( - evaluation: Evaluation[Value, Args], + definition: EvaluatorDefinition[Value, Args], ) -> Evaluator[Value, Args]: return Evaluator( - name=name or evaluation.__name__, - evaluation=evaluation, + name=name or definition.__name__, + definition=definition, threshold=threshold, ) diff --git a/src/draive/evaluation/scenario.py b/src/draive/evaluation/scenario.py index 84ae8b7..2952d78 100644 --- a/src/draive/evaluation/scenario.py +++ b/src/draive/evaluation/scenario.py @@ -1,8 +1,10 @@ +from asyncio import gather from collections.abc import Callable, Sequence -from typing import Protocol, overload, runtime_checkable +from typing import Protocol, Self, overload, runtime_checkable -from draive.evaluation.evaluator import EvaluatorResult +from draive.evaluation.evaluator import EvaluatorResult, PreparedEvaluator from draive.parameters import DataModel, Field +from draive.scope import ctx from draive.types import frozenlist from draive.utils import freeze @@ -11,6 +13,7 @@ "ScenarioEvaluator", "ScenarioEvaluatorDefinition", "ScenarioEvaluatorResult", + "EvaluationScenarioResult", ] @@ -21,10 +24,44 @@ class ScenarioEvaluatorResult(DataModel): evaluations: frozenlist[EvaluatorResult] = Field( description="Scenario evaluation results", ) + meta: dict[str, str | float | int | bool | None] | None = Field( + description="Additional evaluation metadata", + default=None, + ) @property def passed(self) -> bool: - return all(case.passed for case in self.evaluations) + # empty evaluations is equivalent of failure + return len(self.evaluations) > 0 and all(case.passed for case in self.evaluations) + + +class EvaluationScenarioResult(DataModel): + @classmethod + async def evaluating[Value]( + cls, + value: Value, + /, + evaluators: PreparedEvaluator[Value], + *_evaluators: PreparedEvaluator[Value], + meta: dict[str, str | float | int | bool | None] | None = None, + ) -> Self: + return cls( + evaluations=tuple( + await gather( + *[evaluator(value) for evaluator in [evaluators, *_evaluators]], + return_exceptions=False, + ), + ), + meta=meta, + ) + + evaluations: frozenlist[EvaluatorResult] = Field( + description="Scenario evaluation results", + ) + meta: dict[str, str | float | int | bool | None] | None = Field( + description="Additional evaluation metadata", + default=None, + ) 
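A brief usage sketch may help at this point (illustrative only, not part of the diff): the new `EvaluationScenarioResult.evaluating` classmethod gathers prepared evaluators concurrently and attaches optional metadata, so a scenario definition can simply return it. The `length_evaluator` and the keyword form of `evaluation_scenario` below are assumptions made for illustration; only the signatures visible in this diff are confirmed.

# Hedged sketch - assumes `evaluation_scenario` mirrors the keyword form of `evaluator`.
from draive.evaluation import EvaluationScenarioResult, evaluation_scenario, evaluator


@evaluator(name="length", threshold=0.5)
async def length_evaluator(value: str, /) -> float:
    # plain float scores are wrapped into EvaluationScore by Evaluator.__call__
    return min(len(value) / 100, 1.0)


@evaluation_scenario(name="basic_quality")
async def basic_quality_scenario(value: str, /) -> EvaluationScenarioResult:
    # gathers the prepared evaluators concurrently and carries extra metadata
    return await EvaluationScenarioResult.evaluating(
        value,
        length_evaluator,  # an Evaluator needing no extra args satisfies PreparedEvaluator
        meta={"source": "usage-sketch"},
    )
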
@runtime_checkable @@ -47,7 +84,7 @@ async def __call__( /, *args: Args.args, **kwargs: Args.kwargs, - ) -> Sequence[EvaluatorResult]: ... + ) -> Sequence[EvaluatorResult] | EvaluationScenarioResult: ... class ScenarioEvaluator[Value, **Args]: @@ -84,16 +121,35 @@ async def __call__( *args: Args.args, **kwargs: Args.kwargs, ) -> ScenarioEvaluatorResult: - return ScenarioEvaluatorResult( - name=self.name, - evaluations=tuple( - await self._definition( - value, - *args, - **kwargs, - ) - ), - ) + try: + match await self._definition( + value, + *args, + **kwargs, + ): + case EvaluationScenarioResult() as result: + return ScenarioEvaluatorResult( + name=self.name, + evaluations=result.evaluations, + meta=result.meta, + ) + + case [*results]: + return ScenarioEvaluatorResult( + name=self.name, + evaluations=tuple(results), + ) + except Exception as exc: + ctx.log_error( + f"Scenario evaluator `{self.name}` failed, using empty fallback result", + exception=exc, + ) + + return ScenarioEvaluatorResult( + name=self.name, + evaluations=(), + meta={"exception": str(exc)}, + ) @overload diff --git a/src/draive/evaluation/score.py b/src/draive/evaluation/score.py index 3082438..3acf94c 100644 --- a/src/draive/evaluation/score.py +++ b/src/draive/evaluation/score.py @@ -1,9 +1,6 @@ -from typing import Protocol, runtime_checkable - from draive.parameters import DataModel, Field __all__ = [ - "Evaluation", "EvaluationScore", ] @@ -24,17 +21,3 @@ class EvaluationScore(DataModel): description="Explanation of the score", default=None, ) - - -@runtime_checkable -class Evaluation[Value, **Args](Protocol): - @property - def __name__(self) -> str: ... - - async def __call__( - self, - value: Value, - /, - *args: Args.args, - **kwargs: Args.kwargs, - ) -> EvaluationScore | float | bool: ... 
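For the reworked evaluator contract above, a minimal sketch (illustrative, not part of the diff): a definition may now return an `EvaluationResult` carrying metadata, and `Evaluator.__call__` catches exceptions, logging them and falling back to a zero score with the exception recorded under `meta`. `EvaluationResult` is imported via its module path here because this diff does not show it being re-exported from the package `__init__`; the evaluator name below is made up for illustration.

# Hedged sketch of the new EvaluatorDefinition / EvaluationResult flow.
from draive.evaluation import evaluator
from draive.evaluation.evaluator import EvaluationResult


@evaluator(name="mentions_summary", threshold=1.0)
async def mentions_summary_evaluator(value: str, /) -> EvaluationResult:
    passed: bool = "summary" in value.lower()
    # bool scores map to 1.0 / 0.0; meta is surfaced on the resulting EvaluatorResult
    return await EvaluationResult.of(
        passed,
        meta={"checked_keyword": "summary"},
    )

# result = await mentions_summary_evaluator("... summary ...")
# result.passed -> True; result.meta -> {"checked_keyword": "summary"}
# If the definition raised instead, the Evaluator would log the error and return
# a zero score with {"exception": "..."} in meta rather than propagating.
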
diff --git a/src/draive/evaluation/suite.py b/src/draive/evaluation/suite.py index bbe592d..9a45f2d 100644 --- a/src/draive/evaluation/suite.py +++ b/src/draive/evaluation/suite.py @@ -15,9 +15,10 @@ "evaluation_suite", "EvaluationCaseResult", "EvaluationSuite", - "EvaluationSuiteCaseResult", "EvaluationSuiteDefinition", "EvaluationSuiteStorage", + "SuiteEvaluatorCaseResult", + "SuiteEvaluatorResult", ] @@ -27,20 +28,33 @@ class EvaluationSuiteCase[CaseParameters: DataModel](DataModel): comment: str | None = None -class EvaluationSuiteCaseResult[CaseParameters: DataModel, Value: DataModel | str](DataModel): +class SuiteEvaluatorCaseResult[CaseParameters: DataModel, Value: DataModel | str](DataModel): case: EvaluationSuiteCase[CaseParameters] = Field( description="Evaluated case", ) value: Value = Field( description="Evaluated value", ) - results: frozenlist[ScenarioEvaluatorResult | EvaluatorResult] = Field( + results: frozenlist[ScenarioEvaluatorResult] = Field( description="Evaluation results", ) + meta: dict[str, str | float | int | bool | None] | None = Field( + description="Additional evaluation metadata", + default=None, + ) @property def passed(self) -> bool: - return all(result.passed for result in self.results) + # empty results is equivalent of failure + return len(self.results) > 0 and all(result.passed for result in self.results) + + +class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](DataModel): + cases: list[SuiteEvaluatorCaseResult[CaseParameters, Value]] + + @property + def passed(self) -> bool: + return all(case.passed for case in self.cases) class EvaluationCaseResult[Value: DataModel | str](DataModel): @@ -50,10 +64,30 @@ def of( results: ScenarioEvaluatorResult | EvaluatorResult, *_results: ScenarioEvaluatorResult | EvaluatorResult, value: Value, + meta: dict[str, str | float | int | bool | None] | None = None, ) -> Self: + free_results: list[EvaluatorResult] = [] + scenario_results: list[ScenarioEvaluatorResult] = [] + for result in (results, *_results): + match result: + case ScenarioEvaluatorResult() as scenario_result: + scenario_results.append(scenario_result) + + case EvaluatorResult() as evaluator_result: + free_results.append(evaluator_result) + + if free_results: + scenario_results.append( + ScenarioEvaluatorResult( + name="EvaluationSuite", + evaluations=tuple(free_results), + ) + ) + return cls( value=value, - results=(results, *_results), + results=tuple(scenario_results), + meta=meta, ) @classmethod @@ -63,23 +97,27 @@ async def evaluating( /, evaluators: PreparedScenarioEvaluator[Value] | PreparedEvaluator[Value], *_evaluators: PreparedScenarioEvaluator[Value] | PreparedEvaluator[Value], + meta: dict[str, str | float | int | bool | None] | None = None, ) -> Self: - return cls( - value=value, - results=tuple( - await gather( - *[evaluator(value) for evaluator in [evaluators, *_evaluators]], - return_exceptions=False, - ), + return cls.of( + *await gather( + *[evaluator(value) for evaluator in [evaluators, *_evaluators]], + return_exceptions=False, ), + value=value, + meta=meta, ) value: Value = Field( description="Evaluated value", ) - results: frozenlist[ScenarioEvaluatorResult | EvaluatorResult] = Field( + results: frozenlist[ScenarioEvaluatorResult] = Field( description="Evaluation results", ) + meta: dict[str, str | float | int | bool | None] | None = Field( + description="Additional evaluation metadata", + default=None, + ) @runtime_checkable @@ -124,7 +162,7 @@ async def __call__( /, *, reload: bool = False, - ) -> 
EvaluationSuiteCaseResult[CaseParameters, Value]: ... + ) -> SuiteEvaluatorCaseResult[CaseParameters, Value]: ... @overload async def __call__( @@ -132,7 +170,7 @@ async def __call__( /, *, reload: bool = False, - ) -> list[EvaluationSuiteCaseResult[CaseParameters, Value]]: ... + ) -> SuiteEvaluatorResult[CaseParameters, Value]: ... async def __call__( self, @@ -141,18 +179,20 @@ async def __call__( *, reload: bool = False, ) -> ( - list[EvaluationSuiteCaseResult[CaseParameters, Value]] - | EvaluationSuiteCaseResult[CaseParameters, Value] + SuiteEvaluatorResult[CaseParameters, Value] + | SuiteEvaluatorCaseResult[CaseParameters, Value] ): async with self._lock: match parameters: case None: - return await gather( - *[ - self._evaluate(case=case) - for case in (await self._data(reload=reload)).cases - ], - return_exceptions=False, + return SuiteEvaluatorResult( + cases=await gather( + *[ + self._evaluate(case=case) + for case in (await self._data(reload=reload)).cases + ], + return_exceptions=False, + ) ) case UUID() as identifier: @@ -180,10 +220,10 @@ async def _evaluate( self, *, case: EvaluationSuiteCase[CaseParameters], - ) -> EvaluationSuiteCaseResult[CaseParameters, Value]: + ) -> SuiteEvaluatorCaseResult[CaseParameters, Value]: result: EvaluationCaseResult[Value] = await self._definition(parameters=case.parameters) - return EvaluationSuiteCaseResult[CaseParameters, Value]( + return SuiteEvaluatorCaseResult[CaseParameters, Value]( case=case, value=result.value, results=result.results, diff --git a/src/draive/evaluators/__init__.py b/src/draive/evaluators/__init__.py index 68d8735..923af8e 100644 --- a/src/draive/evaluators/__init__.py +++ b/src/draive/evaluators/__init__.py @@ -1,25 +1,27 @@ -from draive.evaluators.text_coherence import text_coherence_evaluator -from draive.evaluators.text_conciseness import text_conciseness_evaluator -from draive.evaluators.text_consistency import text_consistency_evaluator -from draive.evaluators.text_coverage import text_coverage_evaluator -from draive.evaluators.text_fluency import text_fluency_evaluator -from draive.evaluators.text_keywords import text_keywords_evaluator -from draive.evaluators.text_readability import text_readability_evaluator -from draive.evaluators.text_relevance import text_relevance_evaluator -from draive.evaluators.text_similarity import ( - text_similarity_evaluator, +from draive.evaluators.coherence import coherence_evaluator +from draive.evaluators.conciseness import conciseness_evaluator +from draive.evaluators.consistency import consistency_evaluator +from draive.evaluators.coverage import coverage_evaluator +from draive.evaluators.fluency import fluency_evaluator +from draive.evaluators.keywords import keywords_evaluator +from draive.evaluators.readability import readability_evaluator +from draive.evaluators.relevance import relevance_evaluator +from draive.evaluators.similarity import ( + image_vector_similarity_evaluator, + similarity_evaluator, text_vector_similarity_evaluator, ) __all__ = [ - "text_coherence_evaluator", - "text_conciseness_evaluator", - "text_consistency_evaluator", - "text_coverage_evaluator", - "text_fluency_evaluator", - "text_readability_evaluator", - "text_relevance_evaluator", - "text_keywords_evaluator", - "text_similarity_evaluator", + "coherence_evaluator", + "conciseness_evaluator", + "consistency_evaluator", + "coverage_evaluator", + "fluency_evaluator", + "image_vector_similarity_evaluator", + "keywords_evaluator", + "readability_evaluator", + "relevance_evaluator", + 
"similarity_evaluator", "text_vector_similarity_evaluator", ] diff --git a/src/draive/evaluators/coherence.py b/src/draive/evaluators/coherence.py new file mode 100644 index 0000000..d80a58f --- /dev/null +++ b/src/draive/evaluators/coherence.py @@ -0,0 +1,90 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "coherence_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \ +the EVALUATED content using solely a coherence metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is coherence - a collective quality of the content. +We align this dimension with the DUC (Document Understanding Conference) quality question of \ +structure and coherence, whereby the content should be well-structured and well-organized. +EVALUATED content should not just be a heap of related information, but should build from part +to part into a coherent body of information about the topic. + + + +Assign a coherence score using value between 0.0 and 4.0 where: +0.0 is very low coherence - the content is chaotic, lacking logical connections between parts. +1.0 is low coherence - some connections are visible, but the overall structure is weak. +2.0 is moderate coherence - the content has a noticeable structure, but with some shortcomings. +3.0 is good coherence - the content is well-organized with minor imperfections. +4.0 is excellent coherence - the content is exemplarily structured, with smooth transitions \ +between ideas. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. + +""" + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("reference",), + "", + "", + ("evaluated",), + "", +) + + +@evaluator(name="coherence") +async def coherence_evaluator( + evaluated: Multimodal, + /, + reference: Multimodal, +) -> EvaluationScore: + if not evaluated: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if not reference: + return EvaluationScore( + value=0, + comment="Reference was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + reference=reference, + evaluated=evaluated, + ), + ), + ): + return EvaluationScore( + value=float(result) / 4, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/conciseness.py b/src/draive/evaluators/conciseness.py new file mode 100644 index 0000000..7fe01a2 --- /dev/null +++ b/src/draive/evaluators/conciseness.py @@ -0,0 +1,88 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "conciseness_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \ +the EVALUATED content using solely a conciseness metric according to the EVALUATION_CRITERIA. 
+Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is conciseness - a extent to which the EVALUATED content is brief and to the \ +point while still covering all key information. +A concise content avoids unnecessary details and repetition, also avoiding being overly verbose \ +or include irrelevant information. + + + +Assign a conciseness score using value between 0.0 and 4.0 where: +0.0 is very low conciseness - the content is excessively verbose with much irrelevant information. +1.0 is low conciseness - the content contains unnecessary details and some irrelevant information. +2.0 is moderate conciseness - the content is somewhat concise but could be more focused. +3.0 is good conciseness - the content is mostly concise with minimal unnecessary information. +4.0 is excellent conciseness - the content is highly concise, containing only essential information. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. + +""" + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("reference",), + "", + "", + ("evaluated",), + "", +) + + +@evaluator(name="conciseness") +async def conciseness_evaluator( + evaluated: Multimodal, + /, + reference: Multimodal, +) -> EvaluationScore: + if not evaluated: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if not reference: + return EvaluationScore( + value=0, + comment="Reference was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + reference=reference, + evaluated=evaluated, + ), + ), + ): + return EvaluationScore( + value=float(result) / 4, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/consistency.py b/src/draive/evaluators/consistency.py new file mode 100644 index 0000000..93fd4b5 --- /dev/null +++ b/src/draive/evaluators/consistency.py @@ -0,0 +1,90 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "consistency_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \ +the EVALUATED content using solely a consistency metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is consistency - a factual alignment between the REFERENCE and the EVALUATED content. +A factually consistent content contains only elements that are entailed by the REFERENCE content. + + + +Assign a consistency score using value between 0.0 and 4.0 where: +0.0 is very low consistency - the content contains multiple hallucinated facts \ +or significant misalignments with the reference content. +1.0 is low consistency - the content has several instances of information not supported by \ +the reference content. +2.0 is moderate consistency - the content is mostly consistent but contains a few unsupported \ +statements. +3.0 is good consistency - the content is largely consistent with minor discrepancies. 
+4.0 is excellent consistency - the content is fully consistent with the reference content, \ +containing only supported information. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. + +""" # noqa: E501 + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("reference",), + "", + "", + ("evaluated",), + "", +) + + +@evaluator(name="consistency") +async def consistency_evaluator( + evaluated: Multimodal, + /, + reference: Multimodal, +) -> EvaluationScore: + if not evaluated: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if not reference: + return EvaluationScore( + value=0, + comment="Reference was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + reference=reference, + evaluated=evaluated, + ), + ), + ): + return EvaluationScore( + value=float(result) / 4, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/coverage.py b/src/draive/evaluators/coverage.py new file mode 100644 index 0000000..c4713b4 --- /dev/null +++ b/src/draive/evaluators/coverage.py @@ -0,0 +1,88 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "coverage_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \ +the EVALUATED content using solely a coverage metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is coverage - the extent to which the EVALUATED content includes all \ +the key points from the REFERENCE content. +EVALUATED content with good coverage includes all the important information from \ +the REFERENCE content without omitting critical points. + + + +Assign a coverage score using value between 0.0 and 4.0 where: +0.0 is very low coverage - the content misses most key points from the reference content. +1.0 is low coverage - the content includes some key points but omits several important ones. +2.0 is moderate coverage - the content covers most key points but misses a few important details. +3.0 is good coverage - the content includes nearly all key points with minor omissions. +4.0 is excellent coverage - the content comprehensively covers all key points from the reference content. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. 
+ +""" # noqa: E501 + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("reference",), + "", + "", + ("evaluated",), + "", +) + + +@evaluator(name="coverage") +async def coverage_evaluator( + evaluated: Multimodal, + /, + reference: Multimodal, +) -> EvaluationScore: + if not evaluated: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if not reference: + return EvaluationScore( + value=0, + comment="Reference was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + reference=reference, + evaluated=evaluated, + ), + ), + ): + return EvaluationScore( + value=float(result) / 4, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/fluency.py b/src/draive/evaluators/fluency.py new file mode 100644 index 0000000..74dac21 --- /dev/null +++ b/src/draive/evaluators/fluency.py @@ -0,0 +1,74 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "fluency_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Carefully examine provided CONTENT, then rate it using solely a \ +fluency metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is fluency - the quality of the content in terms of grammar, spelling, \ +punctuation, content choice, and overall structure. + + + +Assign a fluency score using value between 0.0 and 2.0 where: +0.0 is poor fluency - the content has many errors that make it hard to understand or look unnatural. +1.0 is fair fluency - the content has some errors that affect the clarity or smoothness, \ +but the main points are still comprehensible. +2.0 is good fluency - the content has few or no errors and is easy to read and follow. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. 
+ +""" + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("content",), + "", +) + + +@evaluator(name="fluency") +async def fluency_evaluator( + content: Multimodal, + /, +) -> EvaluationScore: + if not content: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + content=content, + ), + ), + ): + return EvaluationScore( + value=float(result) / 2, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/text_keywords.py b/src/draive/evaluators/keywords.py similarity index 73% rename from src/draive/evaluators/text_keywords.py rename to src/draive/evaluators/keywords.py index 737fa8e..1fa8163 100644 --- a/src/draive/evaluators/text_keywords.py +++ b/src/draive/evaluators/keywords.py @@ -1,23 +1,24 @@ from collections.abc import Callable, Sequence from draive.evaluation import EvaluationScore, evaluator +from draive.types import Multimodal, MultimodalContent __all__ = [ - "text_keywords_evaluator", + "keywords_evaluator", ] -@evaluator(name="text_keywords") -async def text_keywords_evaluator( - text: str, +@evaluator(name="keywords") +async def keywords_evaluator( + content: Multimodal, /, keywords: Sequence[str], normalization: Callable[[str], str] | None = None, ) -> EvaluationScore: - if not text: + if not content: return EvaluationScore( value=0, - comment="Input text was empty!", + comment="Input was empty!", ) if not keywords: @@ -33,7 +34,7 @@ async def text_keywords_evaluator( else: text_normalization = _lowercased - normalized_text: str = text_normalization(text) + normalized_text: str = text_normalization(MultimodalContent.of(content).as_string()) return EvaluationScore( value=len( [keyword for keyword in keywords if text_normalization(keyword) in normalized_text] diff --git a/src/draive/evaluators/readability.py b/src/draive/evaluators/readability.py new file mode 100644 index 0000000..3a2b0b7 --- /dev/null +++ b/src/draive/evaluators/readability.py @@ -0,0 +1,80 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "readability_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Carefully examine provided CONTENT, then rate it using solely a \ +readability metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is readability - the ease with which a reader can understand the content. +A readable content uses clear and concise language, is well-structured, +and avoids complex or convoluted elements. + + + +Assign a readability score using value between 0.0 and 4.0 where: +0.0 is very low readability - the content is extremely difficult to understand, \ +with complex language and convoluted structure. +1.0 is low readability - the content is challenging to read, with frequent use of \ +complex sentences, unclear language or irrelevant parts. +2.0 is moderate readability - the content is somewhat clear but has some areas \ +that are difficult to understand. +3.0 is good readability - the content is mostly clear and easy to read, with minor instances \ +of complexity. 
+4.0 is excellent readability - the content is highly clear, concise, and easy to understand throughout. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. + +""" # noqa: E501 + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("content",), + "", +) + + +@evaluator(name="readability") +async def readability_evaluator( + content: Multimodal, + /, +) -> EvaluationScore: + if not content: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + content=content, + ), + ), + ): + return EvaluationScore( + value=float(result) / 4, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/relevance.py b/src/draive/evaluators/relevance.py new file mode 100644 index 0000000..ae62475 --- /dev/null +++ b/src/draive/evaluators/relevance.py @@ -0,0 +1,91 @@ +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.types import Multimodal, MultimodalTemplate +from draive.utils import xml_tag + +__all__ = [ + "relevance_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \ +the EVALUATED content using solely a relevance metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is relevance - selection of important parts from the REFERENCE content. +The EVALUATED content should include only important information from the REFERENCE avoiding \ +redundancies and excess information. + + + +Assign a relevance score using value between 0.0 and 4.0 where: +0.0 is very low relevance - the content contains mostly irrelevant or redundant information. +1.0 is low coverage - the content includes some important points but has \ +significant irrelevant parts. +2.0 is moderate relevance - the content covers most important points but includes \ +some unnecessary information. +3.0 is good relevance - the content focuses on important information with minor inclusions \ +of less relevant content. +4.0 is excellent relevance - the content precisely captures only the most important information \ +from the reference. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. 
+ +""" + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("reference",), + "", + "", + ("evaluated",), + "", +) + + +@evaluator(name="relevance") +async def relevance_evaluator( + evaluated: Multimodal, + /, + reference: Multimodal, +) -> EvaluationScore: + if not evaluated: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if not reference: + return EvaluationScore( + value=0, + comment="Reference was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + reference=reference, + evaluated=evaluated, + ), + ), + ): + return EvaluationScore( + value=float(result) / 4, + comment=None, + ) + + else: + raise ValueError("Invalid result") diff --git a/src/draive/evaluators/score.py b/src/draive/evaluators/score.py index 127e498..3cb5d94 100644 --- a/src/draive/evaluators/score.py +++ b/src/draive/evaluators/score.py @@ -39,14 +39,14 @@ def _score_validator( class CommonScoreModel(DataModel): - score: float = Field( - description="Decimal score value", - validator=_score_validator, - ) comment: str | None = Field( description="Explanation of the score", default=None, ) + score: float = Field( + description="Decimal score value", + validator=_score_validator, + ) def normalized( self, diff --git a/src/draive/evaluators/similarity.py b/src/draive/evaluators/similarity.py new file mode 100644 index 0000000..6480956 --- /dev/null +++ b/src/draive/evaluators/similarity.py @@ -0,0 +1,133 @@ +from base64 import b64decode + +from draive.embedding import Embedded, embed_images, embed_texts +from draive.evaluation import EvaluationScore, evaluator +from draive.generation import generate_text +from draive.similarity.score import vector_similarity_score +from draive.types import ( + ImageBase64Content, + Multimodal, + MultimodalTemplate, +) +from draive.utils import xml_tag + +__all__ = [ + "similarity_evaluator", + "text_vector_similarity_evaluator", + "image_vector_similarity_evaluator", +] + + +INSTRUCTION: str = """\ +Assistant is an evaluator scoring the provided content. + + +Compare the REFERENCE and the EVALUATED content by carefully examining them, then rate \ +the EVALUATED content using solely a similarity metric according to the EVALUATION_CRITERIA. +Think step by step and provide explanation of the score before the final score. +Use the explained RATING scale and the requested FORMAT to provide the result. + + + +Evaluated metric is similarity - the degree of semantic similarity between the REFERENCE \ +and the EVALUATED content. + + + +Assign a similarity score using value between 0.0 and 2.0 where: +0.0 is no similarity - the content is completely unrelated in meaning. +1.0 is moderate similarity - the content share some common themes or ideas. +2.0 is high similarity - the content is very close in meaning \ +or convey the same information. + + + +The final result containing only the numerical score value HAVE to be put inside a `RESULT` \ +xml tag within the result i.e. `score`. 
+ +""" + + +INPUT_TEMPLATE: MultimodalTemplate = MultimodalTemplate.of( + "", + ("reference",), + "", + "", + ("evaluated",), + "", +) + + +@evaluator(name="similarity") +async def similarity_evaluator( + evaluated: Multimodal, + /, + reference: Multimodal, +) -> EvaluationScore: + if not evaluated: + return EvaluationScore( + value=0, + comment="Input was empty!", + ) + + if not reference: + return EvaluationScore( + value=0, + comment="Reference was empty!", + ) + + if result := xml_tag( + "RESULT", + await generate_text( + instruction=INSTRUCTION, + input=INPUT_TEMPLATE.format( + reference=reference, + evaluated=evaluated, + ), + ), + ): + return EvaluationScore( + value=float(result) / 2, + comment=None, + ) + + else: + raise ValueError("Invalid result") + + +@evaluator(name="text_vector_similarity") +async def text_vector_similarity_evaluator( + evaluated: str, + /, + reference: str, +) -> float: + embedding: list[Embedded[str]] = await embed_texts([reference, evaluated]) + + return vector_similarity_score(embedding[0].vector, embedding[1].vector) + + +@evaluator(name="image_vector_similarity") +async def image_vector_similarity_evaluator( + evaluated: ImageBase64Content | bytes, + /, + reference: ImageBase64Content | bytes, +) -> float: + evaluated_data: bytes + match evaluated: + case ImageBase64Content() as base64_data: + evaluated_data = b64decode(base64_data.image_base64) + + case raw_data: + evaluated_data = raw_data + + reference_data: bytes + match reference: + case ImageBase64Content() as base64_data: + reference_data = b64decode(base64_data.image_base64) + + case raw_data: + reference_data = raw_data + + embedding: list[Embedded[bytes]] = await embed_images([reference_data, evaluated_data]) + + return vector_similarity_score(embedding[0].vector, embedding[1].vector) diff --git a/src/draive/evaluators/text_coherence.py b/src/draive/evaluators/text_coherence.py deleted file mode 100644 index f0d5464..0000000 --- a/src/draive/evaluators/text_coherence.py +++ /dev/null @@ -1,134 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_coherence_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a reference text and a compared text based on the reference text. -Your task is to rate the compared text using only the Coherence metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Coherence (0.0-4.0) - the collective quality of all sentences. -We align this dimension with the DUC (Document Understanding Conference) quality question of \ -structure and coherence, whereby the text should be well-structured and well-organized. -The compared text should not just be a heap of related information, but should build from sentence -to sentence into a coherent body of information about a topic. - -Rating Scale: -0.0: Very low coherence - the text is chaotic, lacking logical connections between sentences. -1.0: Low coherence - some connections are visible, but the overall structure is weak. -2.0: Moderate coherence - the text has a noticeable structure, but with some shortcomings. -3.0: Good coherence - the text is well-organized with minor imperfections. -4.0: Excellent coherence - the text is exemplarily structured, with smooth transitions \ -between ideas. 
- -Evaluation Steps: -1. Read the reference text carefully and identify the main topic and key points. -2. Read the compared text and compare it to the reference text. -Check if the compared text covers the main topic and key points of the reference text, \ -and if it presents them in a clear and logical order. -3. Assign a coherence score from 0.0 to 4.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ -do not exceed this value. -""" - -INPUT_TEMPLATE: str = """ - -{reference} - - - -{compared} - -""" - - -@evaluator(name="text_coherence") -async def text_coherence_evaluator( - compared: str, - /, - reference: str, -) -> EvaluationScore: - if not compared: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - if not reference: - return EvaluationScore( - value=0, - comment="Reference text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format( - reference=reference, - compared=compared, - ), - examples=[ - ( - INPUT_TEMPLATE.format( - reference=( - "Solar energy is a renewable energy source that is gaining popularity. " - "Solar panels convert sunlight into electricity. " - "This technology is environmentally friendly and can reduce electricity " - "bills. However, installing solar panels requires an initial investment " - "and is dependent on weather conditions." - ), - compared=( - "Solar panels are on roofs. Energy is important. " - "The sun shines brightly. Electricity bills can be high. " - "Technology is developing fast. People like to save money." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Coffee is a popular beverage worldwide. " - "It's made from roasted coffee beans. Caffeine in coffee " - "can boost energy and alertness. However, excessive consumption may " - "lead to sleep issues." - ), - compared=( - "Coffee is drunk by many people. It comes from beans that are roasted. " - "Caffeine makes you feel more awake. " - "Drinking too much coffee might make it hard to sleep. " - "Some people add milk or sugar to their coffee." - ), - ), - CommonScoreModel(score=2.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Honey is a natural sweetener produced by bees. " - "It has antibacterial properties and is rich in antioxidants. " - "People use honey in cooking, as a spread, and for medicinal " - "purposes. However, it's high in calories and should be consumed " - "in moderation." - ), - compared=( - "Bees create honey, a natural sweetener with multiple benefits. " - "Its antibacterial and antioxidant-rich composition makes it valuable " - "for culinary, nutritional, and medicinal uses. While versatile, " - "honey's high caloric content necessitates mindful consumption." - ), - ), - CommonScoreModel(score=4.0), - ), - ], - ) - return score.normalized(divider=4) diff --git a/src/draive/evaluators/text_conciseness.py b/src/draive/evaluators/text_conciseness.py deleted file mode 100644 index 0fc911e..0000000 --- a/src/draive/evaluators/text_conciseness.py +++ /dev/null @@ -1,140 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_conciseness_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a reference text and a compared text based on the reference text. 
-Your task is to rate the compared text using only the Conciseness metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Conciseness (0.0-4.0) - the extent to which the compared text is brief and to the point \ -while still covering all key information. -A concise compared text avoids unnecessary details and repetition. -Annotators should penalize compared texts that are overly verbose or include irrelevant information. - -Rating Scale: -0.0: Very low conciseness - the text is excessively verbose with much irrelevant information. -1.0: Low conciseness - the text contains unnecessary details and some irrelevant information. -2.0: Moderate conciseness - the text is somewhat concise but could be more focused. -3.0: Good conciseness - the text is mostly concise with minimal unnecessary information. -4.0: Excellent conciseness - the text is highly concise, containing only essential information. - -Evaluation Steps: -1. Read the derived text and the reference text carefully. -2. Compare the compared text to the reference text and identify the main \ -points of the reference text. -3. Assess how well the compared text covers the main points of the reference text, \ -and how much irrelevant or redundant information it contains. -4. Assign a conciseness score from 0.0 to 4.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ -do not exceed this value. -""" - - -INPUT_TEMPLATE: str = """ - -{reference} - - - -{compared} - -""" - - -@evaluator(name="text_conciseness") -async def text_conciseness_evaluator( - compared: str, - /, - reference: str, -) -> EvaluationScore: - if not compared: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - if not reference: - return EvaluationScore( - value=0, - comment="Reference text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format( - reference=reference, - compared=compared, - ), - examples=[ - ( - INPUT_TEMPLATE.format( - reference=( - "Solar energy is a renewable energy source that is gaining popularity. " - "Solar panels convert sunlight into electricity. " - "This technology is environmentally friendly and can reduce electricity " - "bills. However,installing solar panels requires an initial investment and " - "is dependent on weather conditions." - ), - compared=( - "Did you know that solar energy is becoming super popular these days? " - "It's this amazing, eco-friendly way to make electricity using " - "the sun's rays. People are getting really excited about it! Basically, " - "you put these special panels on your roof, and they soak up the sunlight " - "like a sponge. Then, through some pretty cool science stuff, " - "they turn that sunlight into electricity you can use in your house. " - "It's pretty neat, right? And get this - it can actually help you save " - "money on your electricity bills in the long run. But here's the thing: " - "you've got to shell out some cash upfront to get those panels installed. " - "It's kind of like buying a fancy coffee machine - costs a bit at first, " - "but then you save on all those coffee shop visits." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Coffee is a popular beverage worldwide. 
" - "It's made from roasted coffee beans. Caffeine in coffee " - "can boost energy and alertness. However, excessive consumption may " - "lead to sleep issues." - ), - compared=( - "Coffee is a widely consumed beverage made from roasted coffee beans. " - "It contains caffeine, which can enhance energy and alertness. However, " - "drinking too much coffee may cause sleep problems. " - "People enjoy coffee for its taste and stimulating effects, but it's " - "important to consume it in moderation." - ), - ), - CommonScoreModel(score=2.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "The water cycle, also known as the hydrologic cycle, " - "describes the continuous movement of water within the Earth and " - "atmosphere. It involves processes such as evaporation, condensation, " - "precipitation, and runoff." - ), - compared=( - "The water cycle is the continuous movement of water on Earth. " - "It includes evaporation, condensation, precipitation, and runoff." - ), - ), - CommonScoreModel(score=4.0), - ), - ], - ) - return score.normalized(divider=4) diff --git a/src/draive/evaluators/text_consistency.py b/src/draive/evaluators/text_consistency.py deleted file mode 100644 index 52582c7..0000000 --- a/src/draive/evaluators/text_consistency.py +++ /dev/null @@ -1,133 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_consistency_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a reference text and a compared text based on the reference text. -Your task is to rate the compared text using only the Consistency metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Consistency(0.0-4.0) - the factual alignment between the reference text and the compared text. -A factually consistent compared text contains only statements that are entailed \ -by the reference text. -Annotators should penalize compared texts that contain hallucinated facts. - -Rating Scale: -0.0: Very low consistency - the text contains multiple hallucinated facts \ -or significant misalignments with the reference text. -1.0: Low consistency - the text has several instances of information not supported by \ -the reference text. -2.0: Moderate consistency - the text is mostly consistent but contains a few unsupported statements. -3.0: Good consistency - the text is largely consistent with minor discrepancies. -4.0: Excellent consistency - the text is fully consistent with the reference text, \ -containing only supported information. - -Evaluation Steps: -1. Read the compared text and the reference text carefully. -2. Compare the compared text to the reference text and identify the main points \ -of the reference text. -3. Assess how well the compared text covers the main points of the reference text \ -and how much irrelevant or redundant information it contains. -4. Assign a consistency score from 0.0 to 4.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ -do not exceed this value. 
-""" - - -INPUT_TEMPLATE: str = """ - -{reference} - - - -{compared} - -""" - - -@evaluator(name="text_consistency") -async def text_consistency_evaluator( - compared: str, - /, - reference: str, -) -> EvaluationScore: - if not compared: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - if not reference: - return EvaluationScore( - value=0, - comment="Reference text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format( - reference=reference, - compared=compared, - ), - examples=[ - ( - INPUT_TEMPLATE.format( - reference=( - "Dolphins are intelligent marine mammals. They use echolocation " - "to navigate and hunt. Dolphins live in social groups called pods." - ), - compared=( - "Dolphins are smart fish that can fly short distances. They use sonar " - "to talk to whales. Dolphins live in families and go to school " - "to learn hunting techniques." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Coffee is a popular beverage worldwide. " - "It's made from roasted coffee beans. Caffeine in coffee " - "can boost energy and alertness. However, excessive consumption may " - "lead to sleep issues." - ), - compared=( - "Coffee is a widely consumed drink around the world. It's produced " - "by roasting coffee beans. The caffeine in coffee can increase energy " - "levels and improve alertness. However, drinking too much coffee might " - "cause sleep problems. Coffee is also known to improve memory and reduce " - "the risk of certain diseases." - ), - ), - CommonScoreModel(score=2.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Photosynthesis is the process by which plants use sunlight to " - "produce energy. It requires water, carbon dioxide, and chlorophyll. " - "Oxygen is released as a byproduct of photosynthesis." - ), - compared=( - "Plants carry out photosynthesis to create energy from sunlight. " - "This process needs water, carbon dioxide, and the green pigment " - "chlorophyll. As plants photosynthesize, " - "they release oxygen into the environment." - ), - ), - CommonScoreModel(score=4.0), - ), - ], - ) - return score.normalized(divider=4) diff --git a/src/draive/evaluators/text_coverage.py b/src/draive/evaluators/text_coverage.py deleted file mode 100644 index 7de871d..0000000 --- a/src/draive/evaluators/text_coverage.py +++ /dev/null @@ -1,137 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_coverage_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a reference text and a compared text based on the reference text. -Your task is to rate the compared text using only the Coverage metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Coverage (0.0-4.0) - the extent to which the compared text includes all \ -the key points from the reference text. -A compared text with good coverage includes all the important information from \ -the reference text without omitting critical points. -Annotators should penalize compared texts that miss significant content. - -Rating Scale: -0.0: Very low coverage - the text misses most key points from the reference text. 
-1.0: Low coverage - the text includes some key points but omits several important ones. -2.0: Moderate coverage - the text covers most key points but misses a few important details. -3.0: Good coverage - the text includes nearly all key points with minor omissions. -4.0: Excellent coverage - the text comprehensively covers all key points from the reference text. - -Evaluation Steps: -1. Read the reference text carefully and identify all key points and important information. -2. Read the compared text and compare it to the reference text. \ -Check if the compared text includes all the key points and important information \ -from the reference text. -3. Assess how well the compared text covers the reference text, \ -and if any critical points are missing. -4. Assign a coverage score from 0.0 to 4.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ -do not exceed this value. -""" - - -INPUT_TEMPLATE: str = """ - -{reference} - - - -{compared} - -""" - - -@evaluator(name="text_coverage") -async def text_coverage_evaluator( - compared: str, - /, - reference: str, -) -> EvaluationScore: - if not compared: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - if not reference: - return EvaluationScore( - value=0, - comment="Reference text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format( - reference=reference, - compared=compared, - ), - examples=[ - ( - INPUT_TEMPLATE.format( - reference=( - "Smartphones are versatile devices. They can make calls, send messages, " - "access the internet, take photos, and run various apps. " - "Many people use smartphones for work and entertainment. " - "However, excessive use can lead to addiction and sleep problems." - ), - compared=( - "Smartphones can make calls and send messages. They are popular devices." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Recycling helps protect the environment. It reduces waste in landfills, " - "conserves natural resources, and saves energy. Common recyclable items " - "include paper, plastic, glass, and metal. Many cities have recycling " - "programs, but individual participation is crucial for success." - ), - compared=( - "Recycling is good for the environment. " - "It reduces waste and saves resources. " - "People can recycle things like paper and plastic. " - "Many cities have recycling programs." - ), - ), - CommonScoreModel(score=2.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Regular exercise is important for health. It strengthens the heart, " - "builds muscle, and improves flexibility. Exercise can also reduce stress " - "and boost mood. Experts recommend at least 30 minutes of moderate " - "activity most days of the week. Walking, swimming, and cycling are " - "good options for many people." - ), - compared=( - "Regular exercise is crucial for maintaining good health. " - "It has many benefits, including strengthening the heart, " - "building muscle, and enhancing flexibility. Exercise also has " - "mental health benefits, such as reducing stress and improving mood. " - "Health experts advise doing at least 30 minutes of moderate exercise " - "on most days. Some popular and accessible forms of exercise " - "include walking, swimming, and cycling." 
- ), - ), - CommonScoreModel(score=4.0), - ), - ], - ) - - return score.normalized(divider=4) diff --git a/src/draive/evaluators/text_fluency.py b/src/draive/evaluators/text_fluency.py deleted file mode 100644 index f184e3f..0000000 --- a/src/draive/evaluators/text_fluency.py +++ /dev/null @@ -1,89 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_fluency_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a text. Your task is to rate this text using only the Fluency metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Fluency (0.0-2.0) - the quality of the text in terms of grammar, spelling, punctuation, \ -word choice, and sentence structure. - -Rating Scale: -0.0: Poor - the text has many errors that make it hard to understand or sound unnatural. -1.0: Fair - the text has some errors that affect the clarity or smoothness of the text, \ -but the main points are still comprehensible. -2.0: Good - the text has few or no errors and is easy to read and follow. - -Evaluation Steps: -1. Read the text and evaluate its fluency based on the given criteria. -2. Assign a fluency score from 0.0 to 2.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \ -do not exceed this value. -""" - - -INPUT_TEMPLATE: str = """ - -{text} - -""" - - -@evaluator(name="text_fluency") -async def text_fluency_evaluator( - text: str, - /, -) -> EvaluationScore: - if not text: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format(text=text), - examples=[ - ( - INPUT_TEMPLATE.format( - text=( - "The cat sitted on mat. It were very comfrotable. " - "The sun shine bright in sky." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - text=( - "The movie was good, but I didn't liked the ending. " - "It left me feeling confuse and unsatisfied." - ), - ), - CommonScoreModel(score=1.0), - ), - ( - INPUT_TEMPLATE.format( - text=( - "The concert last night was amazing. " - "The band played all their hit songs, and the crowd was energetic " - "throughout the performance." - ), - ), - CommonScoreModel(score=2.0), - ), - ], - ) - - return score.normalized(divider=2) diff --git a/src/draive/evaluators/text_readability.py b/src/draive/evaluators/text_readability.py deleted file mode 100644 index 4edccc0..0000000 --- a/src/draive/evaluators/text_readability.py +++ /dev/null @@ -1,100 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_readability_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a text. Your task is to rate this text using only the Readability metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Readability (0.0-4.0) - the ease with which a reader can understand the text. 
-A readable text uses clear and concise language, is well-structured, -and avoids complex or convoluted sentences. Annotators should penalize texts that \ -are difficult to read or understand. - -Rating Scale: -0.0: Very low readability - the text is extremely difficult to understand, \ -with complex language and convoluted structure. -1.0: Low readability - the text is challenging to read, with frequent use of \ -complex sentences or unclear language. -2.0: Moderate readability - the text is somewhat clear but has some areas \ -that are difficult to understand. -3.0: Good readability - the text is mostly clear and easy to read, with minor instances \ -of complexity. -4.0: Excellent readability - the text is highly clear, concise, and easy to understand throughout. - -Evaluation Steps: -1. Read the text carefully and evaluate how easy it is to read and understand. -2. Consider the language used in the text, including clarity, simplicity, and sentence structure. -3. Assess whether the text is well-structured and free from complex or convoluted sentences. -4. Assign a readability score from 0.0 to 4.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ -do not exceed this value. -""" - - -INPUT_TEMPLATE: str = """ - -{text} - -""" - - -@evaluator(name="text_readability") -async def text_readability_evaluator( - text: str, - /, -) -> EvaluationScore: - if not text: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format(text=text), - examples=[ - ( - INPUT_TEMPLATE.format( - text=( - "The canine species, frequently domesticated for companionship purposes, " - "exhibit characteristics of fidelity and ludic propensities that engender " - "their widespread appeal among human populations as domestic " - "animal companions." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - text=( - "Pizza, a widely consumed dish, consists of a circular bread foundation " - "adorned with various ingredients. Typically, it includes a layer of " - "tomato-based sauce and cheese, though additional toppings may be " - "incorporated to suit individual preferences." - ), - ), - CommonScoreModel(score=2.0), - ), - ( - INPUT_TEMPLATE.format( - text=( - "Exercise is good for health. It helps maintain fitness and reduces stress." - ), - ), - CommonScoreModel(score=4.0), - ), - ], - ) - - return score.normalized(divider=4) diff --git a/src/draive/evaluators/text_relevance.py b/src/draive/evaluators/text_relevance.py deleted file mode 100644 index 26fa6fd..0000000 --- a/src/draive/evaluators/text_relevance.py +++ /dev/null @@ -1,130 +0,0 @@ -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model - -__all__ = [ - "text_relevance_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given a reference text and a compared text based on the reference text. -Your task is to rate the compared text using only the Relevance metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Relevance (0.0-4.0) - selection of important content from the reference text. 
-The compared text should include only important information from the reference text. -Annotators should penalize compared texts that contain redundancies and excess information. - -Rating Scale: -0.0: Very low relevance - the text contains mostly irrelevant or redundant information. -1.0: Low relevance - the text includes some important points but has \ -significant irrelevant content. -2.0: Moderate relevance - the text covers most important points but includes \ -some unnecessary information. -3.0: Good relevance - the text focuses on important information with minor inclusions \ -of less relevant content. -4.0: Excellent relevance - the text precisely captures only the most important information \ -from the reference text. - -Evaluation Steps: -1. Read the compared text and the reference text carefully. -2. Compare the compared text to the reference text and identify \ -the main points of the reference text. -3. Assess how well the compared text covers the main points of the reference text, \ -and note any irrelevant or redundant information it contains. -4. Assign a relevance score from 0.0 to 4.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 4.0. 4.0 is the maximum, \ -do not exceed this value. -""" - - -INPUT_TEMPLATE: str = """ - -{reference} - - - -{compared} - -""" - - -@evaluator(name="text_relevance") -async def text_relevance_evaluator( - compared: str, - /, - reference: str, -) -> EvaluationScore: - if not compared: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - if not reference: - return EvaluationScore( - value=0, - comment="Reference text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format( - reference=reference, - compared=compared, - ), - examples=[ - ( - INPUT_TEMPLATE.format( - reference=( - "The sun is the star at the center of our solar system. " - "It provides light and heat to Earth." - ), - compared=( - "Stars twinkle in the night sky. Some people believe in astrology. " - "The moon orbits the Earth. Astronauts have been to space. " - "Solar panels use energy from the sun." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Elephants are the largest land animals. They have long trunks and tusks. " - "Elephants live in herds and are known for their intelligence." - ), - compared=( - "Elephants are very big animals. They use their trunks to grab food " - "and water. Elephants live together in groups. They're smart and have " - "good memories. Some people ride elephants in zoos, " - "but this can be harmful to the animals." - ), - ), - CommonScoreModel(score=2.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Bicycles are a popular mode of transportation. They are eco-friendly " - "and provide exercise. However, cyclists need to follow " - "traffic rules for safety." - ), - compared=( - "Bicycles are widely used for travel. " - "They don't pollute and help people stay fit. " - "Cyclists must obey traffic laws to stay safe." 
- ), - ), - CommonScoreModel(score=4.0), - ), - ], - ) - - return score.normalized(divider=4) diff --git a/src/draive/evaluators/text_similarity.py b/src/draive/evaluators/text_similarity.py deleted file mode 100644 index b329ff8..0000000 --- a/src/draive/evaluators/text_similarity.py +++ /dev/null @@ -1,128 +0,0 @@ -from draive.embedding import Embedded, embed_texts -from draive.evaluation import EvaluationScore, evaluator -from draive.evaluators.score import CommonScoreModel -from draive.generation import generate_model -from draive.similarity.score import vector_similarity_score - -__all__ = [ - "text_similarity_evaluator", - "text_vector_similarity_evaluator", -] - - -INSTRUCTION: str = """\ -You will be given two texts: a reference text and a compared text. \ -Your task is to rate the compared text using only the Similarity metric, \ -which is described in the Evaluation Criteria. -Please make sure you read and understand these instructions very carefully. -Keep this document open while reviewing, and refer to it as needed. - -Evaluation Criteria: -Similarity (0.0-2.0) - the degree of semantic similarity between the reference text \ -and the compared text. - -Rating Scale: -0.0: No similarity - the reference text and compared text are completely unrelated in meaning. -1.0: Moderate similarity - the reference text and compared text share some common themes or ideas. -2.0: High similarity - the reference text and compared text are very close in meaning \ -or convey the same information. - -Evaluation Steps: -1. Read both the reference text and the compared text carefully. -2. Compare the semantic meaning of the reference text and the compared text. -3. Assign a similarity score from 0.0 to 2.0 based on the provided criteria. - -Important: The score must be a decimal number from 0.0 to 2.0. 2.0 is the maximum, \ -do not exceed this value. -""" - - -INPUT_TEMPLATE: str = """ - -{reference} - - - -{compared} - -""" - - -@evaluator(name="text_similarity") -async def text_similarity_evaluator( - compared: str, - /, - reference: str, -) -> EvaluationScore: - if not compared: - return EvaluationScore( - value=0, - comment="Input text was empty!", - ) - - if not reference: - return EvaluationScore( - value=0, - comment="Reference text was empty!", - ) - - score: CommonScoreModel = await generate_model( - CommonScoreModel, - instruction=INSTRUCTION, - input=INPUT_TEMPLATE.format( - reference=reference, - compared=compared, - ), - examples=[ - ( - INPUT_TEMPLATE.format( - reference=( - "Cats are popular pets. They are independent and like to groom themselves." - ), - compared=( - "Bananas are a healthy fruit. They are rich in potassium and easy to peel." - ), - ), - CommonScoreModel(score=0.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "The beach is a great place for relaxation. " - "People enjoy swimming and sunbathing." - ), - compared=( - "Many people like to spend time outdoors. " - "Parks are popular for picnics and walking." - ), - ), - CommonScoreModel(score=1.0), - ), - ( - INPUT_TEMPLATE.format( - reference=( - "Coffee is a popular morning drink. It contains caffeine which helps " - "people feel more alert." - ), - compared=( - "Many people start their day with coffee. " - "The caffeine in coffee can increase alertness and energy." 
- ), - ), - CommonScoreModel(score=2.0), - ), - ], - ) - - return score.normalized(divider=2) - - -@evaluator(name="text_vector_similarity") -async def text_vector_similarity_evaluator( - compared: str, - /, - reference: str, -) -> float: - embedding: list[Embedded[str]] = await embed_texts([reference, compared]) - - return vector_similarity_score(embedding[0].vector, embedding[1].vector) diff --git a/src/draive/generation/model/lmm.py b/src/draive/generation/model/lmm.py index c529c7d..9bf8bbd 100644 --- a/src/draive/generation/model/lmm.py +++ b/src/draive/generation/model/lmm.py @@ -145,9 +145,11 @@ async def lmm_generate_model[Generated: DataModel]( # noqa: PLR0913, C901, PLR0 DEFAULT_INSTRUCTION_EXTENSION: str = """\ -The result have to be a JSON conforming to the following schema: + +The result have to be a JSON object conforming to the following schema: ``` {schema} ``` Provide ONLY a single, raw, valid JSON without any comments, formatting or additional elements. + """ diff --git a/src/draive/types/__init__.py b/src/draive/types/__init__.py index 87b6588..a13f7b9 100644 --- a/src/draive/types/__init__.py +++ b/src/draive/types/__init__.py @@ -21,6 +21,8 @@ MultimodalContent, MultimodalContentConvertible, MultimodalContentElement, + MultimodalContentPlaceholder, + MultimodalTemplate, ) from draive.types.text import TextContent from draive.types.video import VideoBase64Content, VideoContent, VideoURLContent @@ -46,10 +48,12 @@ "LMMToolRequests", "LMMToolResponse", "Memory", + "Multimodal", "MultimodalContent", "MultimodalContentConvertible", "MultimodalContentElement", - "Multimodal", + "MultimodalContentPlaceholder", + "MultimodalTemplate", "RateLimitError", "TextContent", "VideoBase64Content", diff --git a/src/draive/types/multimodal.py b/src/draive/types/multimodal.py index 91a607c..e4da41c 100644 --- a/src/draive/types/multimodal.py +++ b/src/draive/types/multimodal.py @@ -10,12 +10,15 @@ from draive.types.video import VideoBase64Content, VideoContent, VideoURLContent __all__ = [ + "Multimodal", "MultimodalContent", - "MultimodalContentElement", "MultimodalContentConvertible", - "Multimodal", + "MultimodalContentElement", + "MultimodalContentPlaceholder", + "MultimodalTemplate", ] + MultimodalContentElement = TextContent | ImageContent | AudioContent | VideoContent | DataModel MultimodalContentConvertible = str | MultimodalContentElement @@ -165,10 +168,70 @@ def extending( def __bool__(self) -> bool: return bool(self.parts) and any(self.parts) + def __str__(self) -> str: + return self.as_string() + Multimodal = MultimodalContent | MultimodalContentConvertible +class MultimodalContentPlaceholder(DataModel): + identifier: str + + +class MultimodalTemplate(DataModel): + @classmethod + def of( + cls, + *elements: Multimodal | MultimodalContentPlaceholder | tuple[str], + merge_text: bool = True, + skip_empty: bool = True, + meta: dict[str, str | float | int | bool | None] | None = None, + ) -> Self: + return cls( + parts=tuple( + [ + MultimodalContentPlaceholder(identifier=element[0]) + if isinstance(element, tuple) + else element + for element in elements + ] + ), + merge_text=merge_text, + skip_empty=skip_empty, + meta=meta, + ) + + parts: frozenlist[Multimodal | MultimodalContentPlaceholder] + merge_text: bool + skip_empty: bool + meta: dict[str, str | float | int | bool | None] | None + + def format( + self, + **variables: Multimodal, + ) -> MultimodalContent: + parts: list[Multimodal] = [] + for part in self.parts: + match part: + case MultimodalContentPlaceholder() as 
placeholder:
+                    if value := variables.get(placeholder.identifier):
+                        parts.append(value)
+
+                    else:
+                        raise ValueError(f"Missing format variable '{placeholder.identifier}'")
+
+                case part:
+                    parts.append(part)
+
+        return MultimodalContent.of(
+            *parts,
+            merge_text=self.merge_text,
+            skip_empty=self.skip_empty,
+            meta=self.meta,
+        )
+
+
 def _extract_parts(  # noqa: PLR0911
     element: Multimodal,
     /,
@@ -315,12 +378,18 @@ def _merge_texts(
     last_text_element: TextContent | None = None
     while element := next(iterator, None):
         match element:
-            case TextContent() as text:  # do not merge texts with different metadata
-                if (last_text := last_text_element) and last_text.meta == text.meta:
-                    last_text_element = TextContent(
-                        text=last_text.text + text.text,
-                        meta=text.meta,
-                    )
+            case TextContent() as text:
+                # do not merge texts with different metadata
+                if last_text := last_text_element:
+                    if last_text.meta == text.meta:
+                        last_text_element = TextContent(
+                            text=last_text.text + text.text,
+                            meta=text.meta,
+                        )
+
+                    else:
+                        result.append(last_text)
+                        last_text_element = text
 
                 else:
                     last_text_element = text
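
For context, a minimal usage sketch of the new MultimodalTemplate API introduced above, based only on the code added in this diff: tuple elements passed to MultimodalTemplate.of become named placeholders, and format(**variables) substitutes them to produce a regular MultimodalContent. The prompt text and placeholder names below are illustrative, not taken from the library; the import relies on the top-level exports added in src/draive/__init__.py.

# Illustrative sketch only; names and prompt text are made up for this example.
from draive import MultimodalContent, MultimodalTemplate

# Tuples turn into MultimodalContentPlaceholder entries; plain strings stay as text parts.
template: MultimodalTemplate = MultimodalTemplate.of(
    "Summarize the following document:\n",
    ("document",),   # placeholder named "document"
    "\nRespond in ",
    ("language",),   # placeholder named "language"
    ".",
)

# format() substitutes each placeholder with the matching keyword argument and
# returns a MultimodalContent; a missing variable raises ValueError.
content: MultimodalContent = template.format(
    document="Long source text goes here...",
    language="English",
)

# With the default merge_text=True, adjacent text parts sharing the same metadata
# should be merged, so str() yields one continuous string (via the new __str__).
print(content)

A note on the accompanying _merge_texts change: the reworked branch now appends the previous text element to the result when the metadata differs instead of silently dropping it, so text parts with distinct metadata are preserved as separate elements while same-metadata runs still merge.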