
Commit e062379

Author: Aidan Daly
Update to strands-agents-evals 0.1.0 API
1 parent: 68f25d0

10 files changed: 39 additions, 42 deletions


src/bedrock_agentcore/evaluation/integrations/strands_agents_evals/README.md

Lines changed: 11 additions & 9 deletions
@@ -28,7 +28,7 @@ Evaluate Strands agents during local development and testing. The integration ca
 
 ```python
 from strands import Agent, tool
-from strands_evals import Dataset, Case
+from strands_evals import Experiment, Case
 from strands_evals.telemetry import StrandsEvalsTelemetry
 from bedrock_agentcore.evaluation import create_strands_evaluator
 
@@ -82,12 +82,13 @@ cases = [
 evaluator = create_strands_evaluator("Builtin.Helpfulness")
 
 # Run evaluations
-dataset = Dataset(cases=cases, evaluator=evaluator)
-report = dataset.run_evaluations(task_fn)
+experiment = Experiment(cases=cases, evaluators=[evaluator])
+reports = experiment.run_evaluations(task_fn)
+report = reports[0]
 
 # View results
-print(f"Average score: {report.average_score:.2f}")
-print(f"Pass rate: {report.pass_rate:.1%}")
+print(f"Overall score: {report.overall_score:.2f}")
+print(f"Pass rate: {sum(report.test_passes) / len(report.test_passes):.1%}")
 ```
 
 ## Production Evaluation with CloudWatch Spans
@@ -128,7 +129,7 @@ spans = fetch_spans_from_cloudwatch(
 ### Evaluation Workflow
 
 ```python
-from strands_evals import Case, Dataset
+from strands_evals import Case, Experiment
 from bedrock_agentcore.evaluation import create_strands_evaluator, fetch_spans_from_cloudwatch
 import time
 
@@ -173,10 +174,11 @@ def task_fn(case):
 }
 
 evaluator = create_strands_evaluator("Builtin.Helpfulness")
-dataset = Dataset(cases=cases, evaluator=evaluator)
-report = dataset.run_evaluations(task_fn)
+experiment = Experiment(cases=cases, evaluators=[evaluator])
+reports = experiment.run_evaluations(task_fn)
+report = reports[0]
 
-print(f"Average score: {report.average_score:.2f}")
+print(f"Overall score: {report.overall_score:.2f}")
 ```
 
 ## Available Evaluators
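
For reference, a minimal sketch of the 0.1.0-style local flow these README hunks migrate to. Only the `Case`/`Experiment`/`create_strands_evaluator` usage and the report fields come from the changed lines; the single case and the `task_fn` body here are illustrative assumptions, since the diff does not show them.

```python
from strands_evals import Case, Experiment

from bedrock_agentcore.evaluation import create_strands_evaluator

# One illustrative case; the README builds a larger `cases` list that the hunk elides.
cases = [Case(input="What is 2+2?", expected_output="4")]


def task_fn(case):
    # Placeholder task: in the README this runs a Strands agent and returns its
    # response; echoing the expected output keeps the sketch self-contained
    # (assumes Case exposes its constructor fields as attributes).
    return case.expected_output


evaluator = create_strands_evaluator("Builtin.Helpfulness")

# 0.1.0 API: Experiment takes a list of evaluators and run_evaluations returns
# one report per evaluator, so a single evaluator yields reports[0].
experiment = Experiment(cases=cases, evaluators=[evaluator])
reports = experiment.run_evaluations(task_fn)
report = reports[0]

print(f"Overall score: {report.overall_score:.2f}")
print(f"Pass rate: {sum(report.test_passes) / len(report.test_passes):.1%}")
```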

src/bedrock_agentcore/evaluation/integrations/strands_agents_evals/evaluator.py

Lines changed: 0 additions & 2 deletions
@@ -11,9 +11,7 @@
 from typing_extensions import TypeVar
 
 from bedrock_agentcore._utils.endpoints import DEFAULT_REGION
-
 from bedrock_agentcore.evaluation.span_to_adot_serializer import convert_strands_to_adot
-from bedrock_agentcore.evaluation.utils.cloudwatch_span_helper import fetch_spans_from_cloudwatch
 
 logger = logging.getLogger(__name__)

src/bedrock_agentcore/evaluation/span_to_adot_serializer/strands_converter.py

Lines changed: 0 additions & 2 deletions
@@ -15,8 +15,6 @@
 from .adot_models import (
     ADOTDocumentBuilder,
     ConversationTurn,
-    ResourceInfo,
-    SpanMetadata,
     SpanParser,
     ToolExecution,
 )

src/bedrock_agentcore/evaluation/utils/cloudwatch_span_helper.py

Lines changed: 3 additions & 3 deletions
@@ -75,15 +75,15 @@ def query_log_group(
 
         # Poll for completion with exponential backoff
         backoff = initial_backoff
-        for attempt in range(max_attempts):
+        for _attempt in range(max_attempts):
            result = self.logs_client.get_query_results(queryId=query_id)
 
            if result["status"] == "Complete":
                # Check if we hit the 10K result limit
                statistics = result.get("statistics", {})
                records_matched = statistics.get("recordsMatched", 0)
                records_returned = len(result.get("results", []))
-
+
                if records_matched > 10000:
                    logger.warning(
                        "CloudWatch query matched %d records but can only return 10,000. "
@@ -92,7 +92,7 @@ def query_log_group(
                        records_matched,
                        log_group_name,
                    )
-
+
                logger.debug(
                    "CloudWatch query completed: %d results returned, %d records matched",
                    records_returned,
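
The rename above sits inside the polling loop of `query_log_group`. For context, a stripped-down sketch of the poll-with-backoff pattern that loop implements; the helper name and the `max_attempts`/`initial_backoff` defaults here are assumptions, not the module's actual signature.

```python
import time

import boto3


def wait_for_query(logs_client, query_id, max_attempts=10, initial_backoff=1.0):
    """Poll CloudWatch Logs Insights until a started query completes.

    Sketch of the pattern used in CloudWatchSpanHelper.query_log_group:
    repeatedly call get_query_results, sleeping longer between attempts.
    """
    backoff = initial_backoff
    for _attempt in range(max_attempts):
        result = logs_client.get_query_results(queryId=query_id)
        if result["status"] == "Complete":
            return result.get("results", [])
        time.sleep(backoff)
        backoff *= 2  # exponential backoff between polls
    raise TimeoutError(f"Query {query_id} did not complete in {max_attempts} attempts")


# Usage (assumes a query was already started with logs_client.start_query):
# logs_client = boto3.client("logs")
# rows = wait_for_query(logs_client, query_id)
```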

tests/bedrock_agentcore/evaluation/integrations/strands_agents_evals/test_end_to_end.py

Lines changed: 9 additions & 7 deletions
@@ -4,7 +4,7 @@
 
 import pytest
 from strands import Agent, tool
-from strands_evals import Case, Dataset
+from strands_evals import Case, Experiment
 from strands_evals.telemetry import StrandsEvalsTelemetry
 
 from bedrock_agentcore.evaluation import create_strands_evaluator
@@ -51,8 +51,9 @@ def task_fn(case):
     # Create evaluator with mocked client
     with patch("boto3.client", return_value=mock_boto_client):
         evaluator = create_strands_evaluator("Builtin.Helpfulness")
-        dataset = Dataset(cases=cases, evaluator=evaluator)
-        report = dataset.run_evaluations(task_fn)
+        experiment = Experiment(cases=cases, evaluators=[evaluator])
+        reports = experiment.run_evaluations(task_fn)
+        report = reports[0]
 
     # Verify results
     assert report.overall_score == 0.85
@@ -80,8 +81,8 @@ def task_fn(case):
 
     with patch("boto3.client", return_value=mock_boto_client):
         evaluator = create_strands_evaluator("Builtin.Helpfulness")
-        dataset = Dataset(cases=cases, evaluator=evaluator)
-        dataset.run_evaluations(task_fn)
+        experiment = Experiment(cases=cases, evaluators=[evaluator])
+        experiment.run_evaluations(task_fn)
 
     # Verify ADOT spans passed through without conversion
     call_args = mock_boto_client.evaluate.call_args[1]
@@ -96,8 +97,9 @@ def task_fn(case):
 
     with patch("boto3.client", return_value=mock_boto_client):
         evaluator = create_strands_evaluator("Builtin.Helpfulness")
-        dataset = Dataset(cases=cases, evaluator=evaluator)
-        report = dataset.run_evaluations(task_fn)
+        experiment = Experiment(cases=cases, evaluators=[evaluator])
+        reports = experiment.run_evaluations(task_fn)
+        report = reports[0]
 
     # Should return 0 score for empty trajectory
     assert report.overall_score == 0.0
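
The three tests above share one arrangement; a hypothetical skeleton of it under the 0.1.0 API is shown below. The helper name is invented, and `mock_boto_client` must already be configured with a canned `evaluate` response, which the diff does not show.

```python
from unittest.mock import patch

from strands_evals import Case, Experiment

from bedrock_agentcore.evaluation import create_strands_evaluator


def run_experiment_with_mock(mock_boto_client, cases, task_fn):
    # Patch boto3.client so create_strands_evaluator binds to the mock instead
    # of a real AWS client, then run a single-evaluator Experiment.
    with patch("boto3.client", return_value=mock_boto_client):
        evaluator = create_strands_evaluator("Builtin.Helpfulness")
        experiment = Experiment(cases=cases, evaluators=[evaluator])
        reports = experiment.run_evaluations(task_fn)
    # One evaluator -> one report; callers can also inspect
    # mock_boto_client.evaluate.call_args to see what payload was sent.
    return reports[0]
```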

tests/bedrock_agentcore/evaluation/integrations/strands_agents_evals/test_evaluator.py

Lines changed: 0 additions & 4 deletions
@@ -1,6 +1,5 @@
 """Tests for Strands AgentCore Evaluator."""
 
-from datetime import datetime, timezone
 from unittest.mock import Mock, patch
 
 import pytest
@@ -408,6 +407,3 @@ def test_not_a_dict(self):
         """Test non-dict is invalid."""
         assert _is_valid_adot_document("not a dict") is False
         assert _is_valid_adot_document(None) is False
-
-
-

tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_adot_models.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@
     ToolExecution,
 )
 
-
 # ==============================================================================
 # Fixtures
 # ==============================================================================

tests/bedrock_agentcore/evaluation/span_to_adot_serializer/test_strands_converter.py

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@
     StrandsToADOTConverter,
 )
 
-
 # ==============================================================================
 # Fixtures
 # ==============================================================================

tests/bedrock_agentcore/evaluation/utils/test_cloudwatch_span_helper.py

Lines changed: 0 additions & 2 deletions
@@ -3,8 +3,6 @@
 from datetime import datetime, timezone
 from unittest.mock import Mock, patch
 
-import pytest
-
 from bedrock_agentcore.evaluation.utils.cloudwatch_span_helper import (
     CloudWatchSpanHelper,
     _is_valid_adot_document,

tests_integ/evaluation/integrations/strands_agents_evals/test_strands_evaluation.py

Lines changed: 16 additions & 11 deletions
@@ -9,7 +9,7 @@
 
 import pytest
 from strands import Agent, tool
-from strands_evals import Case, Dataset
+from strands_evals import Case, Experiment
 from strands_evals.telemetry import StrandsEvalsTelemetry
 
 from bedrock_agentcore.evaluation import create_strands_evaluator
@@ -59,8 +59,9 @@ def task_fn(case):
     cases = [Case(input="What is 2+2?", expected_output="4")]
 
     evaluator = create_strands_evaluator("Builtin.Helpfulness", region=REGION)
-    dataset = Dataset(cases=cases, evaluator=evaluator)
-    report = dataset.run_evaluations(task_fn)
+    experiment = Experiment(cases=cases, evaluators=[evaluator])
+    reports = experiment.run_evaluations(task_fn)
+    report = reports[0]
 
     # Verify results
     assert report.overall_score >= 0.0
@@ -84,8 +85,9 @@ def task_fn(case):
     cases = [Case(input="Calculate 5 + 3", expected_output="8")]
 
     evaluator = create_strands_evaluator("Builtin.Accuracy", region=REGION)
-    dataset = Dataset(cases=cases, evaluator=evaluator)
-    report = dataset.run_evaluations(task_fn)
+    experiment = Experiment(cases=cases, evaluators=[evaluator])
+    reports = experiment.run_evaluations(task_fn)
+    report = reports[0]
 
     assert report.overall_score >= 0.0
     assert report.overall_score <= 1.0
@@ -112,8 +114,9 @@ def task_fn(case):
     ]
 
     evaluator = create_strands_evaluator("Builtin.Helpfulness", region=REGION, test_pass_score=0.6)
-    dataset = Dataset(cases=cases, evaluator=evaluator)
-    report = dataset.run_evaluations(task_fn)
+    experiment = Experiment(cases=cases, evaluators=[evaluator])
+    reports = experiment.run_evaluations(task_fn)
+    report = reports[0]
 
     assert report.overall_score >= 0.0
     assert report.overall_score <= 1.0
@@ -131,8 +134,9 @@ def task_fn(case):
     cases = [Case(input="Test", expected_output="Response")]
 
     evaluator = create_strands_evaluator("Builtin.Helpfulness", region=REGION)
-    dataset = Dataset(cases=cases, evaluator=evaluator)
-    report = dataset.run_evaluations(task_fn)
+    experiment = Experiment(cases=cases, evaluators=[evaluator])
+    reports = experiment.run_evaluations(task_fn)
+    report = reports[0]
 
     # Should return 0 score for empty trajectory
     assert report.overall_score == 0.0
@@ -156,8 +160,9 @@ def task_fn(case):
 
     # Test with high threshold
     evaluator = create_strands_evaluator("Builtin.Helpfulness", region=REGION, test_pass_score=0.9)
-    dataset = Dataset(cases=cases, evaluator=evaluator)
-    report = dataset.run_evaluations(task_fn)
+    experiment = Experiment(cases=cases, evaluators=[evaluator])
+    reports = experiment.run_evaluations(task_fn)
+    report = reports[0]
 
     assert report.overall_score >= 0.0
     assert report.overall_score <= 1.0
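
These integration tests exercise the `test_pass_score` threshold. Assuming `report.test_passes` is a per-case list of booleans, which is what the README's new pass-rate expression implies, the threshold and pass rate relate as in this small sketch (the scores are illustrative only; real ones come from the Builtin evaluators):

```python
# Illustrative numbers only; real scores come from the Builtin.Helpfulness evaluator.
test_pass_score = 0.6
case_scores = [0.85, 0.40, 0.72]

# Each case passes if its score clears the threshold, mirroring test_pass_score.
test_passes = [score >= test_pass_score for score in case_scores]

pass_rate = sum(test_passes) / len(test_passes)
print(f"Pass rate: {pass_rate:.1%}")  # -> Pass rate: 66.7%
```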
