# NOTE(review): this file is a whitespace-mangled git diff — its newlines were
# lost, so it is no longer applicable as a patch. Below is a reconstruction of
# the Python definitions the patch adds/changes in
# src/draive/evaluation/evaluator.py (plus the report() method it adds to
# ScenarioEvaluatorResult in scenario.py), re-emitted as properly formatted
# code with concrete defects fixed. Enclosing-class membership is inferred
# from the diff's hunk context markers. The patch also bumps the package
# version in pyproject.toml from 0.28.2 to 0.28.3.


# --- EvaluatorResult: new report() method ------------------------------------

def report(self) -> str:
    """Return a human-readable summary of this evaluator result.

    Includes pass/fail status, the achieved score, the required threshold,
    the optional score comment, and any metadata key/value pairs.
    """
    # Render meta as "key: value" lines; "N/A" when no metadata is present.
    meta_values: str = (
        "\n".join(f"{key}: {value}" for key, value in self.meta.items())
        if self.meta
        else "N/A"
    )
    # BUG FIX: the patch used  f"'{self.score.comment}'" or 'N/A'  — the
    # f-string result is never falsy (it always contains the quote
    # characters), so 'N/A' was unreachable and a missing comment rendered
    # as the literal 'None'. Test the comment itself instead. This also
    # avoids nested same-quote f-strings, which require Python >= 3.12.
    comment: str = f"'{self.score.comment}'" if self.score.comment else "N/A"
    return (
        f"{self.evaluator} {'passed' if self.passed else 'failed'}"
        f" with score {self.score.value},"
        f" required {self.threshold},"
        f" comment: {comment},"
        f" meta:\n{meta_values}"
    )


# --- Evaluator: constructor gains an optional `meta` mapping -----------------

def __init__(
    self,
    name: str,
    definition: EvaluatorDefinition[Value, Args],
    threshold: float | None,
    meta: dict[str, str | float | int | bool | None] | None = None,
) -> None:
    assert (  # nosec: B101
        threshold is None or 0 <= threshold <= 1
    )  # NOTE(review): the original assert message line is not visible in the
    #   mangled diff — restore it from the repository when re-applying.
    self._definition: EvaluatorDefinition[Value, Args] = definition
    self.name: str = name
    # BUG FIX: the original `threshold or 1` silently promoted an explicit
    # threshold of 0 (or 0.0) to 1 because 0 is falsy; only None should
    # fall back to the default of 1.
    self.threshold: float = 1 if threshold is None else threshold
    self.meta: dict[str, str | float | int | bool | None] | None = meta

    freeze(self)


# --- Evaluator: meta-preserving copy helpers ---------------------------------

def with_threshold(
    self,
    value: float,
    /,
) -> Self:
    """Return a copy of this evaluator with a new passing threshold."""
    return self.__class__(
        name=self.name,
        definition=self._definition,
        threshold=threshold_value(value),
        meta=self.meta,  # carry existing metadata over to the copy
    )


def with_meta(
    self,
    meta: dict[str, str | float | int | bool | None],
    /,
) -> Self:
    """Return a copy of this evaluator with `meta` merged over existing metadata."""
    return self.__class__(
        name=self.name,
        definition=self._definition,
        threshold=self.threshold,
        # New keys win on conflict; keep the old mapping when nothing was set.
        meta=self.meta | meta if self.meta else meta,
    )


# --- ScenarioEvaluatorResult: new report() method ----------------------------

def report(self) -> str:
    """Summarize this scenario result: failures in detail, otherwise one line."""
    # Local renamed from `report` to avoid shadowing the method's own name.
    failures: str = "\n- ".join(
        result.report() for result in self.evaluations if not result.passed
    )

    if failures:  # non-empty only when some evaluation failed
        meta_values: str = (
            "\n".join(f"{key}: {value}" for key, value in self.meta.items())
            if self.meta
            else "N/A"
        )
        return f"Scenario {self.name}, meta:\n{meta_values}\n---\n{failures}"

    elif not self.evaluations:
        # No evaluations at all is reported (and treated) as a failure.
        return f"Scenario {self.name} empty!"

    else:
        return f"Scenario {self.name} passed!"

# NOTE(review): the patch's meta-merge tail of Evaluator.__call__ (combining
# self.meta with the per-evaluation exception meta) is only partially visible
# in this mangled diff and is not reconstructed here; the whole merge can be
# expressed as:  (self.meta or {}) | (evaluation_meta or {}) or None
+ class EvaluationScenarioResult(DataModel): @classmethod @@ -93,9 +112,11 @@ def __init__( self, name: str, definition: ScenarioEvaluatorDefinition[Value, Args], + meta: dict[str, str | float | int | bool | None] | None = None, ) -> None: self.name: str = name self._definition: ScenarioEvaluatorDefinition[Value, Args] = definition + self.meta: dict[str, str | float | int | bool | None] | None = meta freeze(self) @@ -115,6 +136,17 @@ async def evaluate( return evaluate + def with_meta( + self, + meta: dict[str, str | float | int | bool | None], + /, + ) -> Self: + return self.__class__( + name=self.name, + definition=self._definition, + meta=self.meta | meta if self.meta else meta, + ) + def contra_map[Mapped]( self, mapping: Callable[[Mapped], Value] | ParameterPath[Mapped, Value] | Value, @@ -162,17 +194,33 @@ async def __call__( **kwargs, ): case EvaluationScenarioResult() as result: + meta: dict[str, str | float | int | bool | None] | None + if self.meta: + if result.meta: + meta = self.meta | result.meta + + else: + meta = self.meta + + elif result.meta: + meta = result.meta + + else: + meta = None + return ScenarioEvaluatorResult( name=self.name, evaluations=result.evaluations, - meta=result.meta, + meta=meta, ) case [*results]: return ScenarioEvaluatorResult( name=self.name, evaluations=tuple(results), + meta=self.meta, ) + except Exception as exc: ctx.log_error( f"Scenario evaluator `{self.name}` failed, using empty fallback result", @@ -182,7 +230,7 @@ async def __call__( return ScenarioEvaluatorResult( name=self.name, evaluations=(), - meta={"exception": str(exc)}, + meta=(self.meta or {}) | {"exception": str(exc)}, ) diff --git a/src/draive/evaluation/suite.py b/src/draive/evaluation/suite.py index 287ed61..1826ed8 100644 --- a/src/draive/evaluation/suite.py +++ b/src/draive/evaluation/suite.py @@ -49,6 +49,24 @@ def passed(self) -> bool: # empty results is equivalent of failure return len(self.results) > 0 and all(result.passed for result in 
self.results) + def report(self) -> str: + report: str = "\n---\n".join( + [result.report() for result in self.results if not result.passed] + ) + + if report: # nonempty report contains failing reports + return ( + f"Evaluation case {self.case.identifier}:" + f"\nvalue: {self.value}" + f"\n{self.case.parameters}\n---\n{report}" + ) + + elif not self.results: + return f"Evaluation case {self.case.identifier} empty!" + + else: + return f"Evaluation case {self.case.identifier} passed!" + class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](DataModel): cases: list[SuiteEvaluatorCaseResult[CaseParameters, Value]] @@ -57,6 +75,20 @@ class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](Da def passed(self) -> bool: return all(case.passed for case in self.cases) + def report(self) -> str: + report: str = "\n---\n".join( + [result.report() for result in self.cases if not result.passed] + ) + + if report: # nonempty report contains failing reports + return f"Evaluation suite failed:\n\n{report}" + + elif not self.cases: + return "Evaluation suite empty!" + + else: + return "Evaluation suite passed!" + class EvaluationCaseResult[Value: DataModel | str](DataModel): @classmethod