Skip to content

Commit

Permalink
Add evaluation reports
Browse files Browse the repository at this point in the history
  • Loading branch information
KaQuMiQ authored Aug 30, 2024
1 parent abf0476 commit 90f2eae
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "draive"
description = "Framework designed to simplify and accelerate the development of LLM-based applications."
version = "0.28.2"
version = "0.28.3"
readme = "README.md"
maintainers = [
{ name = "Kacper Kaliński", email = "[email protected]" },
Expand Down
45 changes: 44 additions & 1 deletion src/draive/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ class EvaluatorResult(DataModel):
def passed(self) -> bool:
return self.score.value >= self.threshold

def report(self) -> str:
    """Render a human-readable, single-evaluator summary of this result.

    The text names the evaluator, states whether it passed, and lists the
    score value, the required threshold, the score comment and the meta
    values attached to the result.

    Returns:
        The formatted report. "N/A" is used in place of a missing comment
        and in place of missing or empty meta.
    """
    # NOTE(review): `passed` is read as an attribute here, so it is
    # presumably declared as a property on the class - confirm.
    status: str = "passed" if self.passed else "failed"

    # Quote only an actual comment. Quoting first and applying `or 'N/A'`
    # afterwards never falls back, because the quoted text is never empty
    # (a missing comment used to be rendered as 'None').
    comment: str = f"'{self.score.comment}'" if self.score.comment else "N/A"

    meta_values: str
    if self.meta:
        # The leading newline puts every "key: value" pair on its own line.
        meta_values = "\n" + "\n".join(
            f"{key}: {value}" for key, value in self.meta.items()
        )

    else:
        meta_values = "N/A"

    return (
        f"{self.evaluator} {status}"
        f" with score {self.score.value},"
        f" required {self.threshold},"
        f" comment: {comment}"
        f" meta:\n{meta_values}"
    )


class EvaluationResult(DataModel):
@classmethod
Expand Down Expand Up @@ -111,6 +125,7 @@ def __init__(
name: str,
definition: EvaluatorDefinition[Value, Args],
threshold: float | None,
meta: dict[str, str | float | int | bool | None] | None = None,
) -> None:
assert ( # nosec: B101
threshold is None or 0 <= threshold <= 1
Expand All @@ -119,6 +134,7 @@ def __init__(
self._definition: EvaluatorDefinition[Value, Args] = definition
self.name: str = name
self.threshold: float = threshold or 1
self.meta: dict[str, str | float | int | bool | None] | None = meta

freeze(self)

Expand All @@ -131,6 +147,19 @@ def with_threshold(
name=self.name,
definition=self._definition,
threshold=threshold_value(value),
meta=self.meta,
)

def with_meta(
self,
meta: dict[str, str | float | int | bool | None],
/,
) -> Self:
return self.__class__(
name=self.name,
definition=self._definition,
threshold=self.threshold,
meta=self.meta | meta if self.meta else meta,
)

def prepared(
Expand Down Expand Up @@ -225,11 +254,25 @@ async def __call__(
)
evaluation_meta = {"exception": str(exc)}

result_meta: dict[str, str | float | int | bool | None] | None
if self.meta:
if evaluation_meta:
result_meta = self.meta | evaluation_meta

else:
result_meta = self.meta

elif evaluation_meta:
result_meta = evaluation_meta

else:
result_meta = None

return EvaluatorResult(
evaluator=self.name,
score=evaluation_score,
threshold=self.threshold,
meta=evaluation_meta,
meta=result_meta,
)


Expand Down
52 changes: 50 additions & 2 deletions src/draive/evaluation/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,25 @@ def passed(self) -> bool:
# empty evaluations are equivalent to failure
return len(self.evaluations) > 0 and all(case.passed for case in self.evaluations)

def report(self) -> str:
    """Render a human-readable summary of this scenario result.

    Only failing evaluations are listed, one bullet per evaluation, below
    a header with the scenario name and its meta values.

    Returns:
        The failure listing when at least one evaluation failed, an
        "empty" notice when there are no evaluations at all, and a
        "passed" notice otherwise.
    """
    # Prefix every entry with its own bullet. Joining with "\n- " left the
    # first failing entry without one.
    failing: list[str] = [
        f"- {result.report()}" for result in self.evaluations if not result.passed
    ]

    if failing:
        meta_values: str
        if self.meta:
            # The leading newline puts every "key: value" pair on its own line.
            meta_values = "\n" + "\n".join(
                f"{key}: {value}" for key, value in self.meta.items()
            )

        else:
            meta_values = "N/A"

        listing: str = "\n".join(failing)
        return f"Scenario {self.name}, meta: {meta_values}\n---\n{listing}"

    if not self.evaluations:
        return f"Scenario {self.name} empty!"

    return f"Scenario {self.name} passed!"


class EvaluationScenarioResult(DataModel):
@classmethod
Expand Down Expand Up @@ -93,9 +112,11 @@ def __init__(
self,
name: str,
definition: ScenarioEvaluatorDefinition[Value, Args],
meta: dict[str, str | float | int | bool | None] | None = None,
) -> None:
self.name: str = name
self._definition: ScenarioEvaluatorDefinition[Value, Args] = definition
self.meta: dict[str, str | float | int | bool | None] | None = meta

freeze(self)

Expand All @@ -115,6 +136,17 @@ async def evaluate(

return evaluate

def with_meta(
self,
meta: dict[str, str | float | int | bool | None],
/,
) -> Self:
return self.__class__(
name=self.name,
definition=self._definition,
meta=self.meta | meta if self.meta else meta,
)

def contra_map[Mapped](
self,
mapping: Callable[[Mapped], Value] | ParameterPath[Mapped, Value] | Value,
Expand Down Expand Up @@ -162,17 +194,33 @@ async def __call__(
**kwargs,
):
case EvaluationScenarioResult() as result:
meta: dict[str, str | float | int | bool | None] | None
if self.meta:
if result.meta:
meta = self.meta | result.meta

else:
meta = self.meta

elif result.meta:
meta = result.meta

else:
meta = None

return ScenarioEvaluatorResult(
name=self.name,
evaluations=result.evaluations,
meta=result.meta,
meta=meta,
)

case [*results]:
return ScenarioEvaluatorResult(
name=self.name,
evaluations=tuple(results),
meta=self.meta,
)

except Exception as exc:
ctx.log_error(
f"Scenario evaluator `{self.name}` failed, using empty fallback result",
Expand All @@ -182,7 +230,7 @@ async def __call__(
return ScenarioEvaluatorResult(
name=self.name,
evaluations=(),
meta={"exception": str(exc)},
meta=(self.meta or {}) | {"exception": str(exc)},
)


Expand Down
32 changes: 32 additions & 0 deletions src/draive/evaluation/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,24 @@ def passed(self) -> bool:
# empty results are equivalent to failure
return len(self.results) > 0 and all(result.passed for result in self.results)

def report(self) -> str:
    """Summarise this evaluation case as human-readable text.

    Returns:
        When any result failed: the case identifier, the evaluated value
        and the case parameters, followed by the reports of the failing
        results separated by "---" lines. Otherwise a one-line notice
        saying the case is empty (no results) or passed.
    """
    failures: str = "\n---\n".join(
        item.report() for item in self.results if not item.passed
    )

    if not failures:
        # nothing failed: either nothing was evaluated or all of it passed
        outcome: str = "passed" if self.results else "empty"
        return f"Evaluation case {self.case.identifier} {outcome}!"

    header: str = (
        f"Evaluation case {self.case.identifier}:"
        f"\nvalue: {self.value}"
        f"\n{self.case.parameters}"
    )
    return f"{header}\n---\n{failures}"


class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](DataModel):
cases: list[SuiteEvaluatorCaseResult[CaseParameters, Value]]
Expand All @@ -57,6 +75,20 @@ class SuiteEvaluatorResult[CaseParameters: DataModel, Value: DataModel | str](Da
def passed(self) -> bool:
return all(case.passed for case in self.cases)

def report(self) -> str:
    """Summarise the whole evaluation suite as human-readable text.

    Returns:
        The reports of all failing cases separated by "---" lines under a
        "failed" header, or a one-line notice saying the suite is empty
        (no cases) or passed.
    """
    failures: str = "\n---\n".join(
        case.report() for case in self.cases if not case.passed
    )

    if failures:  # any text here comes from failing cases only
        return f"Evaluation suite failed:\n\n{failures}"

    return "Evaluation suite passed!" if self.cases else "Evaluation suite empty!"


class EvaluationCaseResult[Value: DataModel | str](DataModel):
@classmethod
Expand Down

0 comments on commit 90f2eae

Please sign in to comment.