diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 4715a36c65..90457ce293 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -45,7 +45,7 @@ from .evaluators.spec import EvaluatorSpec from .otel import SpanTree from .otel._context_subtree import context_subtree -from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate, ReportCaseFailure +from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate, ReportCaseFailure, ReportCaseMultiRun if TYPE_CHECKING: from pydantic_ai.retries import RetryConfig @@ -264,6 +264,7 @@ async def evaluate( retry_task: RetryConfig | None = None, retry_evaluators: RetryConfig | None = None, *, + runs: int = 1, task_name: str | None = None, metadata: dict[str, Any] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: @@ -282,6 +283,7 @@ async def evaluate( progress: Whether to show a progress bar for the evaluation. Defaults to `True`. retry_task: Optional retry configuration for the task execution. retry_evaluators: Optional retry configuration for evaluator execution. + runs: The number of times to run each case. Defaults to 1. task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. @@ -291,7 +293,7 @@ async def evaluate( """ task_name = task_name or get_unwrapped_function_name(task) name = name or task_name - total_cases = len(self.cases) + total_cases = len(self.cases) * runs progress_bar = Progress() if progress else None limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack() @@ -306,20 +308,75 @@ async def evaluate( task_name=task_name, dataset_name=self.name, n_cases=len(self.cases), + runs=runs, **extra_attributes, ) as eval_span, progress_bar or nullcontext(), ): task_id = progress_bar.add_task(f'Evaluating {task_name}', total=total_cases) if progress_bar else None - async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str): - async with limiter: - result = await _run_task_and_evaluators( - task, case, report_case_name, self.evaluators, retry_task, retry_evaluators + async def _handle_case( + case: Case[InputsT, OutputT, MetadataT], report_case_name: str + ) -> list[ + ReportCase[InputsT, OutputT, MetadataT] + | ReportCaseMultiRun[InputsT, OutputT, MetadataT] + | ReportCaseFailure[InputsT, OutputT, MetadataT] + ]: + # If we are running multiple times, create a parent span for the case + cm = logfire_span('case: {name}', name=report_case_name) if runs > 1 else nullcontext() + + results: list[ + ReportCase[InputsT, OutputT, MetadataT] + | ReportCaseMultiRun[InputsT, OutputT, MetadataT] + | ReportCaseFailure[InputsT, OutputT, MetadataT] + ] = [] + with cm as span: + trace_id = None + span_id = None + if span and span.context: + trace_id = f'{span.context.trace_id:032x}' + span_id = f'{span.context.span_id:016x}' + + for i in range(runs): + run_name = f'{report_case_name} (run {i + 1})' if runs > 1 else report_case_name + async with limiter: + result = await _run_task_and_evaluators( + task, case, run_name, self.evaluators, retry_task, retry_evaluators + ) + results.append(result) + if progress_bar and task_id is not None: # pragma: no branch + progress_bar.update(task_id, advance=1) + + if runs == 1: + return results + + # Separate successes and failures + successes: list[ReportCase] = [r for r in results if isinstance(r, 
ReportCase)] + failures: list[ReportCaseFailure] = [r for r in results if isinstance(r, ReportCaseFailure)] + + output_results: list[ + ReportCase[InputsT, OutputT, MetadataT] + | ReportCaseMultiRun[InputsT, OutputT, MetadataT] + | ReportCaseFailure[InputsT, OutputT, MetadataT] + ] = [] + output_results.extend(failures) + + if successes: + # Aggregate successes into a MultiRun + aggregate = ReportCaseAggregate.average(successes, name=report_case_name) + output_results.append( + ReportCaseMultiRun( + name=report_case_name, + inputs=case.inputs, + metadata=case.metadata, + expected_output=case.expected_output, + runs=successes, + aggregate=aggregate, + trace_id=trace_id, + span_id=span_id, + ) ) - if progress_bar and task_id is not None: # pragma: no branch - progress_bar.update(task_id, advance=1) - return result + return output_results if (context := eval_span.context) is None: # pragma: no cover trace_id = None @@ -327,19 +384,25 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name else: trace_id = f'{context.trace_id:032x}' span_id = f'{context.span_id:016x}' - cases_and_failures = await task_group_gather( + + # task_group_gather returns a list of results from each _handle_case call + nested_results = await task_group_gather( [ lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}') for i, case in enumerate(self.cases, 1) ] ) - cases: list[ReportCase] = [] - failures: list[ReportCaseFailure] = [] - for item in cases_and_failures: - if isinstance(item, ReportCase): - cases.append(item) - else: - failures.append(item) + + # Flatten results + cases: list[ReportCase[InputsT, OutputT, MetadataT] | ReportCaseMultiRun[InputsT, OutputT, MetadataT]] = [] + failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = [] + for group_result in nested_results: + for item in group_result: + if isinstance(item, (ReportCase, ReportCaseMultiRun)): + cases.append(item) + else: + failures.append(item) + report = EvaluationReport( name=name, cases=cases, @@ -367,6 +430,7 @@ def evaluate_sync( retry_task: RetryConfig | None = None, retry_evaluators: RetryConfig | None = None, *, + runs: int = 1, task_name: str | None = None, metadata: dict[str, Any] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: @@ -384,6 +448,7 @@ def evaluate_sync( progress: Whether to show a progress bar for the evaluation. Defaults to `True`. retry_task: Optional retry configuration for the task execution. retry_evaluators: Optional retry configuration for evaluator execution. + runs: The number of times to run each case. Defaults to 1. task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. 
@@ -399,6 +464,7 @@ def evaluate_sync( progress=progress, retry_task=retry_task, retry_evaluators=retry_evaluators, + runs=runs, task_name=task_name, metadata=metadata, ) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 622e70970f..29bc7f6965 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations as _annotations from collections import defaultdict -from collections.abc import Callable, Mapping +from collections.abc import Callable, Mapping, Sequence from dataclasses import dataclass, field from io import StringIO from typing import Any, Generic, Literal, Protocol, cast @@ -35,6 +35,7 @@ 'RenderValueConfig', 'RenderNumberConfig', 'ReportCaseAggregate', + 'ReportCaseMultiRun', ) from ..evaluators.evaluator import EvaluatorFailure @@ -122,12 +123,12 @@ class ReportCaseAggregate(BaseModel): total_duration: float @staticmethod - def average(cases: list[ReportCase]) -> ReportCaseAggregate: + def average(cases: Sequence[ReportCase | ReportCaseMultiRun], name: str = 'Averages') -> ReportCaseAggregate: """Produce a synthetic "summary" case by averaging quantitative attributes.""" num_cases = len(cases) if num_cases == 0: return ReportCaseAggregate( - name='Averages', + name=name, scores={}, labels={}, metrics={}, @@ -136,47 +137,23 @@ def average(cases: list[ReportCase]) -> ReportCaseAggregate: total_duration=0.0, ) - def _scores_averages(scores_by_name: list[dict[str, int | float | bool]]) -> dict[str, float]: - counts_by_name: dict[str, int] = defaultdict(int) - sums_by_name: dict[str, float] = defaultdict(float) - for sbn in scores_by_name: - for name, score in sbn.items(): - counts_by_name[name] += 1 - sums_by_name[name] += score - return {name: sums_by_name[name] / counts_by_name[name] for name in sums_by_name} - - def _labels_averages(labels_by_name: list[dict[str, str]]) -> dict[str, dict[str, float]]: - counts_by_name: dict[str, int] = defaultdict(int) - sums_by_name: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float)) - for lbn in labels_by_name: - for name, label in lbn.items(): - counts_by_name[name] += 1 - sums_by_name[name][label] += 1 - return { - name: {value: count / counts_by_name[name] for value, count in sums_by_name[name].items()} - for name in sums_by_name - } - - average_task_duration = sum(case.task_duration for case in cases) / num_cases - average_total_duration = sum(case.total_duration for case in cases) / num_cases - - # average_assertions: dict[str, float] = _scores_averages([{k: v.value for k, v in case.scores.items()} for case in cases]) - average_scores: dict[str, float] = _scores_averages( - [{k: v.value for k, v in case.scores.items()} for case in cases] - ) - average_labels: dict[str, dict[str, float]] = _labels_averages( - [{k: v.value for k, v in case.labels.items()} for case in cases] + average_scores = _calculate_average_scores(cases) + average_labels = _calculate_average_labels(cases) + average_metrics = _calculate_average_metrics(cases) + average_assertions = _calculate_average_assertions(cases) + + average_task_duration = ( + sum(c.aggregate.task_duration if isinstance(c, ReportCaseMultiRun) else c.task_duration for c in cases) + / num_cases ) - average_metrics: dict[str, float] = _scores_averages([case.metrics for case in cases]) - average_assertions: float | None = None - n_assertions = sum(len(case.assertions) for case in cases) - if n_assertions 
> 0: - n_passing = sum(1 for case in cases for assertion in case.assertions.values() if assertion.value) - average_assertions = n_passing / n_assertions + average_total_duration = ( + sum(c.aggregate.total_duration if isinstance(c, ReportCaseMultiRun) else c.total_duration for c in cases) + / num_cases + ) return ReportCaseAggregate( - name='Averages', + name=name, scores=average_scores, labels=average_labels, metrics=average_metrics, @@ -186,6 +163,103 @@ def _labels_averages(labels_by_name: list[dict[str, str]]) -> dict[str, dict[str ) +def _scores_averages(scores_by_name: list[dict[str, int | float | bool]]) -> dict[str, float]: + counts_by_name: dict[str, int] = defaultdict(int) + sums_by_name: dict[str, float] = defaultdict(float) + for sbn in scores_by_name: + for name, score in sbn.items(): + counts_by_name[name] += 1 + sums_by_name[name] += score + return {name: sums_by_name[name] / counts_by_name[name] for name in sums_by_name} + + +def _labels_averages(labels_by_name: list[dict[str, str]]) -> dict[str, dict[str, float]]: + counts_by_name: dict[str, int] = defaultdict(int) + sums_by_name: dict[str, dict[str, float]] = defaultdict(lambda: defaultdict(float)) + for lbn in labels_by_name: + for name, label in lbn.items(): + counts_by_name[name] += 1 + sums_by_name[name][label] += 1 + return { + name: {value: count / counts_by_name[name] for value, count in sums_by_name[name].items()} + for name in sums_by_name + } + + +def _calculate_average_scores(cases: Sequence[ReportCase | ReportCaseMultiRun]) -> dict[str, float]: + scores_list: list[dict[str, int | float]] = [] + for c in cases: + if isinstance(c, ReportCaseMultiRun): + scores_list.append(c.aggregate.scores) + else: + scores_list.append({k: v.value for k, v in c.scores.items()}) + + return _scores_averages(cast(list[dict[str, int | float | bool]], scores_list)) + + +def _calculate_average_metrics(cases: Sequence[ReportCase | ReportCaseMultiRun]) -> dict[str, float]: + metrics_list: list[dict[str, int | float]] = [] + for c in cases: + if isinstance(c, ReportCaseMultiRun): + metrics_list.append(c.aggregate.metrics) + else: + metrics_list.append(c.metrics) + + return _scores_averages(cast(list[dict[str, int | float | bool]], metrics_list)) + + +def _calculate_average_labels(cases: Sequence[ReportCase | ReportCaseMultiRun]) -> dict[str, dict[str, float]]: + all_labels_list: list[dict[str, str]] = [] + for c in cases: + if isinstance(c, ReportCaseMultiRun): + for run in c.runs: + all_labels_list.append({k: v.value for k, v in run.labels.items()}) + else: + all_labels_list.append({k: v.value for k, v in c.labels.items()}) + + return _labels_averages(all_labels_list) + + +def _calculate_average_assertions(cases: Sequence[ReportCase | ReportCaseMultiRun]) -> float | None: + n_assertions = 0 + n_passing = 0 + for case in cases: + if isinstance(case, ReportCaseMultiRun): + n_assertions += sum(len(r.assertions) for r in case.runs) + n_passing += sum(1 for r in case.runs for a in r.assertions.values() if a.value) + else: + n_assertions += len(case.assertions) + n_passing += sum(1 for assertion in case.assertions.values() if assertion.value) + + if n_assertions > 0: + return n_passing / n_assertions + return None + + +@dataclass(kw_only=True) +class ReportCaseMultiRun(Generic[InputsT, OutputT, MetadataT]): + """A case in an evaluation report that represents multiple runs of the same test case.""" + + name: str + """The name of the [case][pydantic_evals.Case].""" + inputs: InputsT + """The inputs to the task, from 
[`Case.inputs`][pydantic_evals.Case.inputs].""" + metadata: MetadataT | None + """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata].""" + expected_output: OutputT | None + """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output].""" + + runs: list[ReportCase[InputsT, OutputT, MetadataT]] + """The individual runs of the test case.""" + aggregate: ReportCaseAggregate + """Aggregated statistics for the runs.""" + + trace_id: str | None = None + """The trace ID of the case span (parent of individual runs).""" + span_id: str | None = None + """The span ID of the case span (parent of individual runs).""" + + @dataclass(kw_only=True) class EvaluationReport(Generic[InputsT, OutputT, MetadataT]): """A report of the results of evaluating a model on a set of cases.""" @@ -193,7 +267,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]): name: str """The name of the report.""" - cases: list[ReportCase[InputsT, OutputT, MetadataT]] + cases: list[ReportCase[InputsT, OutputT, MetadataT] | ReportCaseMultiRun[InputsT, OutputT, MetadataT]] """The cases in the report.""" failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list) """The failures in the report. These are cases where task execution raised an exception.""" @@ -832,8 +906,11 @@ def build_failures_table(self, title: str) -> Table: table.add_column('Error Stacktrace', overflow='fold') return table - def build_row(self, case: ReportCase) -> list[str]: + def build_row(self, case: ReportCase | ReportCaseMultiRun) -> list[str]: """Build a table row for a single case.""" + if isinstance(case, ReportCaseMultiRun): + return self.build_multi_run_row(case) + row = [case.name] if self.include_input: @@ -868,6 +945,51 @@ def build_row(self, case: ReportCase) -> list[str]: return row + def build_multi_run_row(self, case: ReportCaseMultiRun) -> list[str]: + """Build a table row for a multi-run case.""" + # For multi-run cases, show the aggregated statistics + aggregate = case.aggregate + row = [f'[i]{case.name} ({len(case.runs)} runs)[/]'] + + if self.include_input: + row.append(self.input_renderer.render_value(None, case.inputs) or EMPTY_CELL_STR) + + if self.include_metadata: + row.append(self.metadata_renderer.render_value(None, case.metadata) or EMPTY_CELL_STR) + + if self.include_expected_output: + row.append(self.output_renderer.render_value(None, case.expected_output) or EMPTY_CELL_STR) + + if self.include_output: + row.append(EMPTY_CELL_STR) + + if self.include_scores: + row.append(self._render_dict(aggregate.scores, self.score_renderers)) + + if self.include_labels: + row.append(self._render_dict(aggregate.labels, self.label_renderers)) + + if self.include_metrics: + row.append(self._render_dict(aggregate.metrics, self.metric_renderers)) + + if self.include_assertions: + row.append(self._render_aggregate_assertions(aggregate.assertions)) + + if self.include_evaluator_failures: + # Gather unique failure messages from runs + unique_failures: dict[tuple[str, str], EvaluatorFailure] = {} + for run in case.runs: + for f in run.evaluator_failures: + key = (f.name, f.error_message) + if key not in unique_failures: + unique_failures[key] = f + row.append(self._render_evaluator_failures(list(unique_failures.values()))) + + if self.include_durations: + row.append(self._render_durations(aggregate)) + + return row + def build_aggregate_row(self, aggregate: ReportCaseAggregate) -> list[str]: """Build a table row for an aggregated 
case.""" row = [f'[b i]{aggregate.name}[/]'] @@ -906,67 +1028,128 @@ def build_aggregate_row(self, aggregate: ReportCaseAggregate) -> list[str]: def build_diff_row( self, - new_case: ReportCase, - baseline: ReportCase, + new_case: ReportCase | ReportCaseMultiRun, + baseline: ReportCase | ReportCaseMultiRun, ) -> list[str]: """Build a table row for a given case ID.""" assert baseline.name == new_case.name, 'This should only be called for matching case IDs' row = [baseline.name] - if self.include_input: # pragma: no branch + row.extend(self._build_diff_row_basics(baseline, new_case)) + row.extend(self._build_diff_row_evals(baseline, new_case)) + row.extend(self._build_diff_row_status(baseline, new_case)) + + return row + + def _build_diff_row_basics( + self, baseline: ReportCase | ReportCaseMultiRun, new_case: ReportCase | ReportCaseMultiRun + ) -> list[str]: + row: list[str] = [] + if self.include_input: input_diff = self.input_renderer.render_diff(None, baseline.inputs, new_case.inputs) or EMPTY_CELL_STR row.append(input_diff) - if self.include_metadata: # pragma: no branch + if self.include_metadata: metadata_diff = ( self.metadata_renderer.render_diff(None, baseline.metadata, new_case.metadata) or EMPTY_CELL_STR ) row.append(metadata_diff) - if self.include_expected_output: # pragma: no branch + if self.include_expected_output: expected_output_diff = ( self.output_renderer.render_diff(None, baseline.expected_output, new_case.expected_output) or EMPTY_CELL_STR ) row.append(expected_output_diff) - if self.include_output: # pragma: no branch - output_diff = self.output_renderer.render_diff(None, baseline.output, new_case.output) or EMPTY_CELL_STR + if self.include_output: + old_output = baseline.output if isinstance(baseline, ReportCase) else None + new_output = new_case.output if isinstance(new_case, ReportCase) else None + output_diff = self.output_renderer.render_diff(None, old_output, new_output) or EMPTY_CELL_STR row.append(output_diff) - if self.include_scores: # pragma: no branch + return row + + def _build_diff_row_evals( + self, baseline: ReportCase | ReportCaseMultiRun, new_case: ReportCase | ReportCaseMultiRun + ) -> list[str]: + row: list[str] = [] + + # Helpers + def get_scores(c: ReportCase | ReportCaseMultiRun) -> dict[str, int | float]: + if isinstance(c, ReportCaseMultiRun): + return c.aggregate.scores + return {k: v.value for k, v in c.scores.items()} + + def get_labels(c: ReportCase | ReportCaseMultiRun) -> dict[str, str]: + if isinstance(c, ReportCaseMultiRun): + return {} + return {k: v.value for k, v in c.labels.items()} + + def get_metrics(c: ReportCase | ReportCaseMultiRun) -> dict[str, int | float]: + if isinstance(c, ReportCaseMultiRun): + return c.aggregate.metrics + return c.metrics + + if self.include_scores: scores_diff = self._render_dicts_diff( - {k: v.value for k, v in baseline.scores.items()}, - {k: v.value for k, v in new_case.scores.items()}, + get_scores(baseline), + get_scores(new_case), self.score_renderers, ) row.append(scores_diff) - if self.include_labels: # pragma: no branch + if self.include_labels: labels_diff = self._render_dicts_diff( - {k: v.value for k, v in baseline.labels.items()}, - {k: v.value for k, v in new_case.labels.items()}, + get_labels(baseline), + get_labels(new_case), self.label_renderers, ) row.append(labels_diff) - if self.include_metrics: # pragma: no branch - metrics_diff = self._render_dicts_diff(baseline.metrics, new_case.metrics, self.metric_renderers) + if self.include_metrics: + metrics_diff = 
self._render_dicts_diff(get_metrics(baseline), get_metrics(new_case), self.metric_renderers) row.append(metrics_diff) - if self.include_assertions: # pragma: no branch - assertions_diff = self._render_assertions_diff( - list(baseline.assertions.values()), list(new_case.assertions.values()) - ) + return row + + def _build_diff_row_status( + self, baseline: ReportCase | ReportCaseMultiRun, new_case: ReportCase | ReportCaseMultiRun + ) -> list[str]: + row: list[str] = [] + + if self.include_assertions: + if isinstance(baseline, ReportCase) and isinstance(new_case, ReportCase): + assertions_diff = self._render_assertions_diff( + list(baseline.assertions.values()), list(new_case.assertions.values()) + ) + else: + old_val = ( + baseline.aggregate.assertions + if isinstance(baseline, ReportCaseMultiRun) + else _calculate_average_assertions([baseline]) + ) + new_val = ( + new_case.aggregate.assertions + if isinstance(new_case, ReportCaseMultiRun) + else _calculate_average_assertions([new_case]) + ) + assertions_diff = self._render_aggregate_assertions_diff(old_val, new_val) row.append(assertions_diff) - if self.include_evaluator_failures: # pragma: no branch + if self.include_evaluator_failures: + + def get_failures(c: ReportCase | ReportCaseMultiRun) -> list[EvaluatorFailure]: + if isinstance(c, ReportCaseMultiRun): + return [f for r in c.runs for f in r.evaluator_failures] + return c.evaluator_failures + evaluator_failures_diff = self._render_evaluator_failures_diff( - baseline.evaluator_failures, new_case.evaluator_failures + get_failures(baseline), get_failures(new_case) ) row.append(evaluator_failures_diff) - if self.include_durations: # pragma: no branch + if self.include_durations: durations_diff = self._render_durations_diff(baseline, new_case) row.append(durations_diff) @@ -1039,11 +1222,14 @@ def build_failure_row(self, case: ReportCaseFailure) -> list[str]: return row - def _render_durations(self, case: ReportCase | ReportCaseAggregate) -> str: + def _render_durations(self, case: ReportCase | ReportCaseMultiRun | ReportCaseAggregate) -> str: """Build the diff string for a duration value.""" - case_durations: dict[str, float] = {'task': case.task_duration} + task_duration = case.aggregate.task_duration if isinstance(case, ReportCaseMultiRun) else case.task_duration + total_duration = case.aggregate.total_duration if isinstance(case, ReportCaseMultiRun) else case.total_duration + + case_durations: dict[str, float] = {'task': task_duration} if self.include_total_duration: - case_durations['total'] = case.total_duration + case_durations['total'] = total_duration return self._render_dict( case_durations, {'task': self.duration_renderer, 'total': self.duration_renderer}, @@ -1052,15 +1238,33 @@ def _render_durations(self, case: ReportCase | ReportCaseAggregate) -> str: def _render_durations_diff( self, - base_case: ReportCase | ReportCaseAggregate, - new_case: ReportCase | ReportCaseAggregate, + base_case: ReportCase | ReportCaseMultiRun | ReportCaseAggregate, + new_case: ReportCase | ReportCaseMultiRun | ReportCaseAggregate, ) -> str: """Build the diff string for a duration value.""" - base_case_durations: dict[str, float] = {'task': base_case.task_duration} - new_case_durations: dict[str, float] = {'task': new_case.task_duration} + base_task = ( + base_case.aggregate.task_duration if isinstance(base_case, ReportCaseMultiRun) else base_case.task_duration + ) + new_task = ( + new_case.aggregate.task_duration if isinstance(new_case, ReportCaseMultiRun) else new_case.task_duration + ) + + 
base_case_durations: dict[str, float] = {'task': base_task} + new_case_durations: dict[str, float] = {'task': new_task} + if self.include_total_duration: # pragma: no branch - base_case_durations['total'] = base_case.total_duration - new_case_durations['total'] = new_case.total_duration + base_total = ( + base_case.aggregate.total_duration + if isinstance(base_case, ReportCaseMultiRun) + else base_case.total_duration + ) + new_total = ( + new_case.aggregate.total_duration + if isinstance(new_case, ReportCaseMultiRun) + else new_case.total_duration + ) + base_case_durations['total'] = base_total + new_case_durations['total'] = new_total return self._render_dicts_diff( base_case_durations, new_case_durations, @@ -1210,29 +1414,50 @@ class EvaluationRenderer: include_evaluator_failures: bool def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None): - return any(case.scores for case in self._all_cases(report, baseline)) + return any( + case.aggregate.scores if isinstance(case, ReportCaseMultiRun) else case.scores + for case in self._all_cases(report, baseline) + ) def include_labels(self, report: EvaluationReport, baseline: EvaluationReport | None = None): - return any(case.labels for case in self._all_cases(report, baseline)) + return any( + case.aggregate.labels if isinstance(case, ReportCaseMultiRun) else case.labels + for case in self._all_cases(report, baseline) + ) def include_metrics(self, report: EvaluationReport, baseline: EvaluationReport | None = None): - return any(case.metrics for case in self._all_cases(report, baseline)) + return any( + case.aggregate.metrics if isinstance(case, ReportCaseMultiRun) else case.metrics + for case in self._all_cases(report, baseline) + ) def include_assertions(self, report: EvaluationReport, baseline: EvaluationReport | None = None): - return any(case.assertions for case in self._all_cases(report, baseline)) + return any( + case.aggregate.assertions is not None if isinstance(case, ReportCaseMultiRun) else bool(case.assertions) + for case in self._all_cases(report, baseline) + ) def include_evaluator_failures_column(self, report: EvaluationReport, baseline: EvaluationReport | None = None): - return self.include_evaluator_failures and any( - case.evaluator_failures for case in self._all_cases(report, baseline) + if not self.include_evaluator_failures: + return False + return any( + any(bool(run.evaluator_failures) for run in case.runs) + if isinstance(case, ReportCaseMultiRun) + else bool(case.evaluator_failures) + for case in self._all_cases(report, baseline) ) - def _all_cases(self, report: EvaluationReport, baseline: EvaluationReport | None) -> list[ReportCase]: + def _all_cases( + self, report: EvaluationReport, baseline: EvaluationReport | None + ) -> list[ReportCase | ReportCaseMultiRun]: if not baseline: return report.cases else: return report.cases + self._baseline_cases_to_include(report, baseline) - def _baseline_cases_to_include(self, report: EvaluationReport, baseline: EvaluationReport) -> list[ReportCase]: + def _baseline_cases_to_include( + self, report: EvaluationReport, baseline: EvaluationReport + ) -> list[ReportCase | ReportCaseMultiRun]: if self.include_removed_cases: return baseline.cases report_case_names = {case.name for case in report.cases} @@ -1247,9 +1472,12 @@ def _get_case_renderer( score_renderers = self._infer_score_renderers(report, baseline) label_renderers = self._infer_label_renderers(report, baseline) metric_renderers = self._infer_metric_renderers(report, baseline) - 
duration_renderer = _NumberRenderer.infer_from_config( - self.duration_config, 'duration', [x.task_duration for x in self._all_cases(report, baseline)] - ) + + all_cases = self._all_cases(report, baseline) + durations = [ + x.aggregate.task_duration if isinstance(x, ReportCaseMultiRun) else x.task_duration for x in all_cases + ] + duration_renderer = _NumberRenderer.infer_from_config(self.duration_config, 'duration', durations) return ReportCaseRenderer( include_input=self.include_input, @@ -1321,9 +1549,9 @@ def build_diff_table( report_cases_by_id = {case.name: case for case in report_cases} baseline_cases_by_id = {case.name: case for case in baseline_cases} - diff_cases: list[tuple[ReportCase, ReportCase]] = [] - removed_cases: list[ReportCase] = [] - added_cases: list[ReportCase] = [] + diff_cases: list[tuple[ReportCase | ReportCaseMultiRun, ReportCase | ReportCaseMultiRun]] = [] + removed_cases: list[ReportCase | ReportCaseMultiRun] = [] + added_cases: list[ReportCase | ReportCaseMultiRun] = [] for case_id in sorted(set(baseline_cases_by_id.keys()) | set(report_cases_by_id.keys())): maybe_baseline_case = baseline_cases_by_id.get(case_id) @@ -1377,8 +1605,14 @@ def _infer_score_renderers( values_by_name: dict[str, list[float | int]] = {} for case in all_cases: - for k, score in case.scores.items(): - values_by_name.setdefault(k, []).append(score.value) + if isinstance(case, ReportCaseMultiRun): + agg_scores = case.aggregate.scores + for k, score in agg_scores.items(): + values_by_name.setdefault(k, []).append(score) + else: + case_scores = case.scores + for k, score_result in case_scores.items(): + values_by_name.setdefault(k, []).append(score_result.value) all_renderers: dict[str, _NumberRenderer] = {} for name, values in values_by_name.items(): @@ -1393,8 +1627,10 @@ def _infer_label_renderers( all_cases = self._all_cases(report, baseline) all_names: set[str] = set() for case in all_cases: - for k in case.labels: - all_names.add(k) + if isinstance(case, ReportCaseMultiRun): + all_names.update(case.aggregate.labels.keys()) + else: + all_names.update(case.labels.keys()) all_renderers: dict[str, _ValueRenderer] = {} for name in all_names: @@ -1410,7 +1646,11 @@ def _infer_metric_renderers( values_by_name: dict[str, list[float | int]] = {} for case in all_cases: - for k, v in case.metrics.items(): + if isinstance(case, ReportCaseMultiRun): + metrics = case.aggregate.metrics + else: + metrics = case.metrics + for k, v in metrics.items(): values_by_name.setdefault(k, []).append(v) all_renderers: dict[str, _NumberRenderer] = {} @@ -1424,7 +1664,14 @@ def _infer_duration_renderer( self, report: EvaluationReport, baseline: EvaluationReport | None ) -> _NumberRenderer: # pragma: no cover all_cases = self._all_cases(report, baseline) - all_durations = [x.task_duration for x in all_cases] - if self.include_total_duration: - all_durations += [x.total_duration for x in all_cases] + all_durations: list[float] = [] + for case in all_cases: + if isinstance(case, ReportCaseMultiRun): + all_durations.append(case.aggregate.task_duration) + if self.include_total_duration: + all_durations.append(case.aggregate.total_duration) + else: + all_durations.append(case.task_duration) + if self.include_total_duration: + all_durations.append(case.total_duration) return _NumberRenderer.infer_from_config(self.duration_config, 'duration', all_durations) diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index 8c3c25a42d..49ed44b130 100644 --- a/tests/evals/test_dataset.py +++ 
b/tests/evals/test_dataset.py @@ -220,6 +220,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: assert report is not None assert len(report.cases) == 2 + assert isinstance(report.cases[0], ReportCase) assert ReportCaseAdapter.dump_python(report.cases[0]) == snapshot( { 'assertions': { @@ -273,6 +274,7 @@ def mock_sync_task(inputs: TaskInput) -> TaskOutput: assert report is not None assert len(report.cases) == 2 + assert isinstance(report.cases[0], ReportCase) assert ReportCaseAdapter.dump_python(report.cases[0]) == snapshot( { 'assertions': { @@ -354,6 +356,7 @@ def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]): assert report is not None assert len(report.cases) == 2 + assert isinstance(report.cases[0], ReportCase) assert ReportCaseAdapter.dump_python(report.cases[0]) == snapshot( { 'assertions': { @@ -407,6 +410,7 @@ async def mock_task(inputs: TaskInput) -> TaskOutput: assert report is not None assert len(report.cases) == 2 + assert isinstance(report.cases[0], ReportCase) assert ReportCaseAdapter.dump_python(report.cases[0]) == snapshot( { 'assertions': { @@ -1376,6 +1380,7 @@ async def task(inputs: TaskInput) -> TaskOutput: report = await example_dataset.evaluate(task) assert len(report.cases) == 2 + assert isinstance(report.cases[0], ReportCase) assert len(report.cases[0].scores) == 2 @@ -1556,6 +1561,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'dataset_name': {}, 'gen_ai.operation.name': {}, 'n_cases': {}, + 'runs': {}, 'name': {}, 'metadata': {'type': 'object'}, 'logfire.experiment.metadata': { @@ -1572,6 +1578,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: }, 'type': 'object', }, + 'runs': 1, 'logfire.msg': 'evaluate mock_async_task', 'metadata': {'key': 'value'}, 'logfire.msg_template': 'evaluate {name}', diff --git a/tests/evals/test_repeated_runs.py b/tests/evals/test_repeated_runs.py new file mode 100644 index 0000000000..a835b890f6 --- /dev/null +++ b/tests/evals/test_repeated_runs.py @@ -0,0 +1,215 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +import pytest +from pydantic import BaseModel + +from ..conftest import try_import + +with try_import() as imports_successful: + from pydantic_evals import Case, Dataset + from pydantic_evals.evaluators import Evaluator, EvaluatorContext + from pydantic_evals.reporting import ReportCase, ReportCaseMultiRun + + from .utils import render_table + + +pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio] + + +class Input(BaseModel): + val: int + + +class Output(BaseModel): + val: int + + +@dataclass +class RandomScore(Evaluator[Input, Output, Any]): + """Evaluator that returns a score based on output.""" + + def evaluate(self, ctx: EvaluatorContext[Input, Output, Any]) -> float: + return float(ctx.output.val) + + +async def test_repeated_runs_success(): + """Test running a dataset with runs > 1.""" + + # Task that returns the input value + async def task(input: Input) -> Output: + return Output(val=input.val) + + dataset = Dataset[Input, Output, Any]( + cases=[ + Case(name='c1', inputs=Input(val=10)), + Case(name='c2', inputs=Input(val=20)), + ], + evaluators=[RandomScore()], + ) + + report = await dataset.evaluate(task, runs=3) + + assert len(report.cases) == 2 + + # Check first case + c1 = report.cases[0] + assert isinstance(c1, ReportCaseMultiRun) + assert c1.name == 'c1' + assert len(c1.runs) == 3 + # Check that individual runs are correct 
+ for run in c1.runs: + assert isinstance(run, ReportCase) + assert run.output.val == 10 + assert run.scores['RandomScore'].value == 10.0 + + # Check aggregate + assert c1.aggregate.scores['RandomScore'] == 10.0 + + # Check second case + c2 = report.cases[1] + assert isinstance(c2, ReportCaseMultiRun) + assert len(c2.runs) == 3 + assert c2.aggregate.scores['RandomScore'] == 20.0 + + +async def test_repeated_runs_mixed_failures(): + """Test repeated runs where some fail.""" + + count = 0 + + async def task(input: Input) -> Output: + nonlocal count + count += 1 + # Fail on every 2nd call + if count % 2 == 0: + raise ValueError('Flaky failure') + return Output(val=input.val) + + dataset = Dataset[Input, Output, Any]( + cases=[ + Case(name='c1', inputs=Input(val=10)), + ], + evaluators=[RandomScore()], + ) + + # Run 4 times: 1(ok), 2(fail), 3(ok), 4(fail) + report = await dataset.evaluate(task, runs=4) + + # We expect 2 failures and 1 MultiRun (with 2 successes) + assert len(report.failures) == 2 + assert len(report.cases) == 1 + + c1 = report.cases[0] + assert isinstance(c1, ReportCaseMultiRun) + assert c1.name == 'c1' + assert len(c1.runs) == 2 # 2 successful runs + + # Check aggregates are based on successes + assert c1.aggregate.scores['RandomScore'] == 10.0 + + +async def test_repeated_runs_all_failures(): + """Test repeated runs where all fail.""" + + async def task(input: Input) -> Output: + raise ValueError('Always fail') + + dataset = Dataset[Input, Output, Any]( + cases=[ + Case(name='c1', inputs=Input(val=10)), + ], + ) + + report = await dataset.evaluate(task, runs=3) + + assert len(report.cases) == 0 + assert len(report.failures) == 3 + assert report.failures[0].error_message == 'ValueError: Always fail' + + +async def test_single_run_compat(): + """Test that runs=1 (default) still produces ReportCase.""" + + async def task(input: Input) -> Output: + return Output(val=input.val) + + dataset = Dataset[Input, Output, Any]( + cases=[ + Case(name='c1', inputs=Input(val=10)), + ], + ) + + report = await dataset.evaluate(task) # Default runs=1 + + assert len(report.cases) == 1 + assert isinstance(report.cases[0], ReportCase) + # Should NOT be ReportCaseMultiRun + assert not isinstance(report.cases[0], ReportCaseMultiRun) + + +def test_evaluate_sync_repeated(): + """Test evaluate_sync with runs > 1.""" + + def task(input: Input) -> Output: + return Output(val=input.val) + + dataset = Dataset[Input, Output, Any]( + cases=[Case(name='c1', inputs=Input(val=10))], + evaluators=[RandomScore()], + ) + + report = dataset.evaluate_sync(task, runs=2) + + assert len(report.cases) == 1 + assert isinstance(report.cases[0], ReportCaseMultiRun) + assert len(report.cases[0].runs) == 2 + + +async def test_rendering_repeated_runs(): + """Test rendering of a report with repeated runs.""" + + async def task(input: Input) -> Output: + return Output(val=input.val) + + dataset = Dataset[Input, Output, Any]( + cases=[Case(name='c1', inputs=Input(val=10))], + evaluators=[RandomScore()], + ) + + report = await dataset.evaluate(task, runs=2, name='repeated_test') + + table = report.console_table() + rendered = render_table(table) + + # Check for the multi-run indicator in the Case ID column + assert 'c1 (2 runs)' in rendered + # Check for score + assert 'RandomScore: 10.0' in rendered + + +async def test_rendering_diff_single_vs_multi(): + """Test rendering a diff between a single run baseline and multi-run new report.""" + + async def task(input: Input) -> Output: + return Output(val=input.val) + + dataset = 
Dataset[Input, Output, Any]( + cases=[Case(name='c1', inputs=Input(val=10))], + evaluators=[RandomScore()], + ) + + # Baseline: single run + baseline = await dataset.evaluate(task, runs=1, name='baseline') + + # New: 2 runs + new_report = await dataset.evaluate(task, runs=2, name='new') + + table = new_report.console_table(baseline=baseline) + rendered = render_table(table) + + # Check that it renders a diff + # The score should be identical (10.0 -> 10.0) + assert 'RandomScore: 10.0' in rendered diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 8e575e4bfc..3930c0137c 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -1,6 +1,7 @@ from __future__ import annotations as _annotations from dataclasses import dataclass +from typing import cast import pytest from inline_snapshot import snapshot @@ -183,6 +184,7 @@ async def test_evaluation_renderer_with_reasons(sample_report: EvaluationReport) async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport): """Test EvaluationRenderer with baseline comparison.""" + sample_case = cast(ReportCase, sample_report.cases[0]) baseline_report = EvaluationReport( cases=[ ReportCase( @@ -198,7 +200,7 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport name='MockEvaluator', value=2.5, reason=None, - source=sample_report.cases[0].scores['score1'].source, + source=sample_case.scores['score1'].source, ) }, labels={ @@ -206,7 +208,7 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport name='MockEvaluator', value='hello', reason=None, - source=sample_report.cases[0].labels['label1'].source, + source=sample_case.labels['label1'].source, ) }, assertions={}, @@ -218,7 +220,6 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport ], name='baseline_report', ) - renderer = EvaluationRenderer( include_input=True, include_metadata=True, @@ -988,13 +989,19 @@ async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: │ temperature: 0.7 │ │ prompt_version: v2 │ ╰───────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ {'query': 'What is 2+2?'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -├───────────┼───────────────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ -│ Averages │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ -└───────────┴───────────────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┓ +┃ ┃ ┃ ┃ ┃ ┃ Assertio ┃ ┃ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ ns ┃ Duration ┃ +┡━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_ca… │ {'query' │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ : 'What │ 2.50 │ hello │ 0.950 │ │ │ +│ │ is │ │ │ │ │ │ +│ │ 2+2?'} │ │ │ │ │ │ +├──────────┼──────────┼───────────┼──────────┼───────────┼──────────┼──────────┤ +│ Averages │ │ score1: │ label1: │ accuracy: │ 100.0% ✔ │ 100.0ms │ +│ │ │ 2.50 │ {'hello' │ 0.950 │ │ │ +│ │ │ │ : 1.0} │ │ │ │ 
+└──────────┴──────────┴───────────┴──────────┴───────────┴──────────┴──────────┘ """) @@ -1048,11 +1055,12 @@ async def test_evaluation_renderer_with_long_experiment_metadata(sample_report_c │ frequency_penalty: 0.1 │ │ presence_penalty: 0.1 │ ╰────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1098,13 +1106,16 @@ async def test_evaluation_renderer_diff_with_experiment_metadata(sample_report_c │ model: gpt-4 → gpt-4o │ │ temperature: 0.5 → 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -├───────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ -│ Averages │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ -└───────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +├───────────┼─────────────┼──────────────┼─────────────┼────────────┼──────────┤ +│ Averages │ score1: │ label1: │ accuracy: │ 100.0% ✔ │ 100.0ms │ +│ │ 2.50 │ {'hello': │ 0.950 │ │ │ +│ │ │ 1.0} │ │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1150,11 +1161,12 @@ async def test_evaluation_renderer_diff_with_only_new_metadata(sample_report_cas │ + model: gpt-4o │ │ + temperature: 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ 
+└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1200,11 +1212,12 @@ async def test_evaluation_renderer_diff_with_only_baseline_metadata(sample_repor │ - model: gpt-4 │ │ - temperature: 0.5 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1251,11 +1264,12 @@ async def test_evaluation_renderer_diff_with_same_metadata(sample_report_case: R │ model: gpt-4o │ │ temperature: 0.7 │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1311,11 +1325,12 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case │ preserved-key: preserved value │ │ updated-key: original value → updated value │ ╰─────────────────────────────────────────────────╯ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) @@ -1355,10 +1370,11 @@ async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: Rep include_errors=False, # Prevent failures table from being added ) assert output == snapshot("""\ - Evaluation Diff: baseline_report → new_report \n\ -┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ -┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ 
Assertions ┃ Duration ┃ -┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ -│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ -└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ + Evaluation Diff: baseline_report → new_report \n\ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: │ label1: │ accuracy: │ ✔ │ 100.0ms │ +│ │ 2.50 │ hello │ 0.950 │ │ │ +└───────────┴─────────────┴──────────────┴─────────────┴────────────┴──────────┘ """) diff --git a/tests/evals/test_reports.py b/tests/evals/test_reports.py index 6bec610e17..dc550917ff 100644 --- a/tests/evals/test_reports.py +++ b/tests/evals/test_reports.py @@ -204,6 +204,7 @@ async def test_report_with_error(mock_evaluator: Evaluator[TaskInput, TaskOutput name='error_report', ) + assert isinstance(report.cases[0], ReportCase) assert ReportCaseAdapter.dump_python(report.cases[0]) == snapshot( { 'assertions': {
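
For reviewers, a minimal usage sketch of the behavior this patch adds (not part of the patch itself). It mirrors tests/evals/test_repeated_runs.py and assumes pydantic-evals with this change applied; the `ValueScore` evaluator and `echo` task names are illustrative.

from __future__ import annotations

from dataclasses import dataclass
from typing import Any

from pydantic import BaseModel

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext
from pydantic_evals.reporting import ReportCaseMultiRun


class Input(BaseModel):
    val: int


class Output(BaseModel):
    val: int


@dataclass
class ValueScore(Evaluator[Input, Output, Any]):
    """Illustrative evaluator: uses the output value as the score for each run."""

    def evaluate(self, ctx: EvaluatorContext[Input, Output, Any]) -> float:
        return float(ctx.output.val)


def echo(inputs: Input) -> Output:
    # Trivial task: return the input value unchanged.
    return Output(val=inputs.val)


dataset = Dataset[Input, Output, Any](
    cases=[Case(name='c1', inputs=Input(val=10))],
    evaluators=[ValueScore()],
)

# With runs > 1, each case that has at least one successful run is reported as a
# ReportCaseMultiRun: the individual ReportCase results live in `.runs`, and
# `.aggregate` holds a ReportCaseAggregate averaging their scores, labels,
# metrics, assertions, and durations. Failed runs are collected in `report.failures`.
report = dataset.evaluate_sync(echo, runs=3)
multi = report.cases[0]
assert isinstance(multi, ReportCaseMultiRun)
assert len(multi.runs) == 3
assert multi.aggregate.scores['ValueScore'] == 10.0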