
Commit 1b223aa

authored by lievan
feat(llmobs): support joining custom evaluations via tags (#11535)
This PR implements the `LLMObs.submit_evaluation_for` method, which gives users two options for joining custom evaluations:

- by tag, via the `span_with_tag_value` argument, which accepts a dictionary containing a tag key/value pair
- by span, via the `span` argument, which accepts a dictionary containing `span_id` and `trace_id` keys

There are also a couple of behavior differences between `submit_evaluation_for` and `submit_evaluation`. In the new method, we:

- throw whenever a required argument has the wrong value or type
- remove the `metadata` argument
- move the warning log for a missing API key to the eval metric writer's `periodic` method

Other changes:

#### Eval metric writer

Updates the eval metric writer to write to the `v2` eval metric endpoint. The main difference with this endpoint is that it accepts a `join_on` field that holds joining information instead of top-level trace and span ID fields.

#### Deprecate `submit_evaluation`

Deprecates `submit_evaluation`. **I've set the removal version to `3.0.0`.**

## Checklist

- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist

- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
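A minimal usage sketch of the two joining modes, based on the signature added in the diff below (the labels, values, and tag names here are illustrative, not from the commit):

```python
from ddtrace.llmobs import LLMObs

# Join by explicit span/trace IDs, e.g. exported from the current active span.
span_context = LLMObs.export_span()  # returns a dict like {"span_id": "...", "trace_id": "..."}
LLMObs.submit_evaluation_for(
    label="sentiment",
    metric_type="score",
    value=0.9,
    span=span_context,
)

# Join by a tag key/value pair that is expected to uniquely identify a single span.
LLMObs.submit_evaluation_for(
    label="rating",
    metric_type="categorical",
    value="positive",
    span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"},
)
```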
1 parent 5e68823 commit 1b223aa

15 files changed: +648 −162 lines

ddtrace/llmobs/_llmobs.py (+132 −3)
@@ -28,6 +28,7 @@
 from ddtrace.internal.service import ServiceStatusError
 from ddtrace.internal.telemetry import telemetry_writer
 from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
+from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
 from ddtrace.internal.utils.formats import asbool
 from ddtrace.internal.utils.formats import parse_tags_str
 from ddtrace.llmobs import _constants as constants
@@ -66,6 +67,7 @@
 from ddtrace.llmobs.utils import ExportedLLMObsSpan
 from ddtrace.llmobs.utils import Messages
 from ddtrace.propagation.http import HTTPPropagator
+from ddtrace.vendor.debtcollector import deprecate
 
 
 log = get_logger(__name__)
@@ -904,6 +906,127 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None:
             return
         span._set_ctx_item(METRICS, metrics)
 
+    @classmethod
+    def submit_evaluation_for(
+        cls,
+        label: str,
+        metric_type: str,
+        value: Union[str, int, float],
+        span: Optional[dict] = None,
+        span_with_tag_value: Optional[Dict[str, str]] = None,
+        tags: Optional[Dict[str, str]] = None,
+        ml_app: Optional[str] = None,
+        timestamp_ms: Optional[int] = None,
+    ) -> None:
+        """
+        Submits a custom evaluation metric for a given span.
+
+        :param str label: The name of the evaluation metric.
+        :param str metric_type: The type of the evaluation metric. One of "categorical", "score".
+        :param value: The value of the evaluation metric.
+                      Must be a string (categorical), integer (score), or float (score).
+        :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
+                          the span associated with this evaluation.
+        :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str}
+                                         uniquely identifying the span associated with this evaluation.
+        :param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
+        :param str ml_app: The name of the ML application.
+        :param int timestamp_ms: The unix timestamp in milliseconds when the evaluation metric result was generated.
+                                 If not set, the current time will be used.
+        """
+        if cls.enabled is False:
+            log.debug(
+                "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. "
+                "Evaluation metric data will not be sent."
+            )
+            return
+
+        has_exactly_one_joining_key = (span is not None) ^ (span_with_tag_value is not None)
+
+        if not has_exactly_one_joining_key:
+            raise ValueError(
+                "Exactly one of `span` or `span_with_tag_value` must be specified to submit an evaluation metric."
+            )
+
+        join_on = {}
+        if span is not None:
+            if (
+                not isinstance(span, dict)
+                or not isinstance(span.get("span_id"), str)
+                or not isinstance(span.get("trace_id"), str)
+            ):
+                raise TypeError(
+                    "`span` must be a dictionary containing both span_id and trace_id keys. "
+                    "LLMObs.export_span() can be used to generate this dictionary from a given span."
+                )
+            join_on["span"] = span
+        elif span_with_tag_value is not None:
+            if (
+                not isinstance(span_with_tag_value, dict)
+                or not isinstance(span_with_tag_value.get("tag_key"), str)
+                or not isinstance(span_with_tag_value.get("tag_value"), str)
+            ):
+                raise TypeError(
+                    "`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values"
+                )
+            join_on["tag"] = {
+                "key": span_with_tag_value.get("tag_key"),
+                "value": span_with_tag_value.get("tag_value"),
+            }
+
+        timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)
+
+        if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
+            raise ValueError("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")
+
+        if not label:
+            raise ValueError("label must be the specified name of the evaluation metric.")
+
+        metric_type = metric_type.lower()
+        if metric_type not in ("categorical", "score"):
+            raise ValueError("metric_type must be one of 'categorical' or 'score'.")
+
+        if metric_type == "categorical" and not isinstance(value, str):
+            raise TypeError("value must be a string for a categorical metric.")
+        if metric_type == "score" and not isinstance(value, (int, float)):
+            raise TypeError("value must be an integer or float for a score metric.")
+
+        if tags is not None and not isinstance(tags, dict):
+            log.warning("tags must be a dictionary of string key-value pairs.")
+            tags = {}
+
+        evaluation_tags = {
+            "ddtrace.version": ddtrace.__version__,
+            "ml_app": ml_app,
+        }
+
+        if tags:
+            for k, v in tags.items():
+                try:
+                    evaluation_tags[ensure_text(k)] = ensure_text(v)
+                except TypeError:
+                    log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
+
+        ml_app = ml_app if ml_app else config._llmobs_ml_app
+        if not ml_app:
+            log.warning(
+                "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
+                "Ensure this configuration is set before running your application."
+            )
+            return
+
+        evaluation_metric = {
+            "join_on": join_on,
+            "label": str(label),
+            "metric_type": metric_type,
+            "timestamp_ms": timestamp_ms,
+            "{}_value".format(metric_type): value,
+            "ml_app": ml_app,
+            "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
+        }
+
+        cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)
+
     @classmethod
     def submit_evaluation(
         cls,
@@ -916,6 +1039,13 @@ def submit_evaluation(
         timestamp_ms: Optional[int] = None,
         metadata: Optional[Dict[str, object]] = None,
     ) -> None:
+        deprecate(
+            "Using `LLMObs.submit_evaluation` is deprecated",
+            message="Please use `LLMObs.submit_evaluation_for` instead.",
+            removal_version="3.0.0",
+            category=DDTraceDeprecationWarning,
+        )
+
         """
         Submits a custom evaluation metric for a given span ID and trace ID.
 
@@ -931,7 +1061,7 @@ def submit_evaluation(
             evaluation metric.
         """
         if cls.enabled is False:
-            log.warning(
+            log.debug(
                 "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent."
             )
             return
@@ -1007,8 +1137,7 @@ def submit_evaluation(
                     log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")
 
         evaluation_metric = {
-            "span_id": span_id,
-            "trace_id": trace_id,
+            "join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
             "label": str(label),
             "metric_type": metric_type.lower(),
             "timestamp_ms": timestamp_ms,

ddtrace/llmobs/_writer.py (+9 −3)
@@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict):
 
 
 class LLMObsEvaluationMetricEvent(TypedDict, total=False):
-    span_id: str
-    trace_id: str
+    join_on: Dict[str, Dict[str, str]]
     metric_type: str
     label: str
     categorical_value: str
@@ -107,6 +106,13 @@ def periodic(self) -> None:
         events = self._buffer
         self._buffer = []
 
+        if not self._headers.get("DD-API-KEY"):
+            logger.warning(
+                "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. "
+                "Ensure this configuration is set before running your application."
+            )
+            return
+
         data = self._data(events)
         enc_llm_events = safe_json(data)
         conn = httplib.HTTPSConnection(self._intake, 443, timeout=self._timeout)
@@ -154,7 +160,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) ->
         super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout)
         self._event_type = "evaluation_metric"
         self._buffer = []
-        self._endpoint = "/api/intake/llm-obs/v1/eval-metric"
+        self._endpoint = "/api/intake/llm-obs/v2/eval-metric"
         self._intake = "api.%s" % self._site  # type: str
 
     def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None:
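For reference, the shape of a `v2` evaluation-metric event as enqueued by the writer, reconstructed from this diff and the cassette bodies below (the tag values and version string are illustrative):

```python
event = {
    "join_on": {
        "span": {"span_id": "12345678902", "trace_id": "98765432102"},
        # or, for tag-based joining:
        # "tag": {"key": "message_id", "value": "dummy_message_id"},
    },
    "metric_type": "score",
    "label": "sentiment",
    "score_value": 0.9,  # a categorical metric carries "categorical_value" instead
    "ml_app": "dummy-ml-app",
    "timestamp_ms": 1732568298743,
    "tags": ["ddtrace.version:2.x.x", "ml_app:dummy-ml-app"],  # version is illustrative
}
```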
New release note file (+17)

@@ -0,0 +1,17 @@
+---
+features:
+  - |
+    LLM Observability: This introduces the `LLMObs.submit_evaluation_for` method, which provides the ability to join a custom evaluation
+    to a span using a tag key-value pair on the span. The tag key-value pair is expected to uniquely identify a single span.
+    Tag-based joining is an alternative to the existing method of joining evaluations to spans using trace and span IDs.
+    Example usage:
+    - Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"}, label="rating", ...)`.
+    - Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`.
+deprecations:
+  - |
+    LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in ddtrace 3.0.0.
+    As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead.
+    To migrate, replace `LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)` with
+    `LLMObs.submit_evaluation_for(span={"span_id": ..., "trace_id": ...}, ...)`.
+    You may also join an evaluation to a span using a tag key-value pair like so:
+    `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": ..., "tag_value": ...}, ...)`.

tests/llmobs/_utils.py (+10 −6)
@@ -210,11 +210,13 @@ def _get_llmobs_parent_id(span: Span):
 
 
 def _expected_llmobs_eval_metric_event(
-    span_id,
-    trace_id,
     metric_type,
     label,
     ml_app,
+    tag_key=None,
+    tag_value=None,
+    span_id=None,
+    trace_id=None,
     timestamp_ms=None,
     categorical_value=None,
     score_value=None,
@@ -223,15 +225,18 @@ def _expected_llmobs_eval_metric_event(
     metadata=None,
 ):
     eval_metric_event = {
-        "span_id": span_id,
-        "trace_id": trace_id,
+        "join_on": {},
         "metric_type": metric_type,
         "label": label,
         "tags": [
             "ddtrace.version:{}".format(ddtrace.__version__),
             "ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"),
         ],
     }
+    if tag_key is not None and tag_value is not None:
+        eval_metric_event["join_on"]["tag"] = {"key": tag_key, "value": tag_value}
+    if span_id is not None and trace_id is not None:
+        eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id}
     if categorical_value is not None:
         eval_metric_event["categorical_value"] = categorical_value
     if score_value is not None:
@@ -542,8 +547,7 @@ def run_and_submit_evaluation(self, span):
 
 def _dummy_evaluator_eval_metric_event(span_id, trace_id):
     return LLMObsEvaluationMetricEvent(
-        span_id=span_id,
-        trace_id=trace_id,
+        join_on={"span": {"span_id": span_id, "trace_id": trace_id}},
         score_value=1.0,
         ml_app="unnamed-ml-app",
         timestamp_ms=mock.ANY,
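A sketch of how the updated helper might be called in a test asserting a tag-joined expectation, per the new keyword arguments above (all argument values here are illustrative):

```python
expected_event = _expected_llmobs_eval_metric_event(
    metric_type="categorical",
    label="toxicity",
    ml_app="dummy-ml-app",
    tag_key="message_id",
    tag_value="dummy_message_id",
    categorical_value="very",
)
# expected_event["join_on"] == {"tag": {"key": "message_id", "value": "dummy_message_id"}}
```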

tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.send_score_metric.yaml (+8 −7)
@@ -1,27 +1,28 @@
 interactions:
 - request:
-    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
-      "12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment",
-      "score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}'
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type":
+      "score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app",
+      "timestamp_ms": 1732568298743}]}}}'
     headers:
       Content-Type:
       - application/json
       DD-API-KEY:
       - XXXXXX
     method: POST
-    uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
+    uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
   response:
     body:
-      string: '{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
+      string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
     headers:
       content-length:
-      - '316'
+      - '311'
       content-security-policy:
       - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
       content-type:
       - application/vnd.api+json
       date:
-      - Wed, 21 Aug 2024 14:11:41 GMT
+      - Mon, 25 Nov 2024 20:58:19 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       vary:

tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_categorical_metric.yaml (+8 −7)
@@ -1,27 +1,28 @@
 interactions:
 - request:
-    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
-      "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
-      "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}'
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
+      "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
+      "timestamp_ms": 1732568297450}]}}}'
     headers:
       Content-Type:
       - application/json
       DD-API-KEY:
       - XXXXXX
     method: POST
-    uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
+    uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
   response:
     body:
-      string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
+      string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
     headers:
       content-length:
-      - '330'
+      - '325'
       content-security-policy:
       - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
       content-type:
       - application/vnd.api+json
       date:
-      - Wed, 21 Aug 2024 14:11:40 GMT
+      - Mon, 25 Nov 2024 20:58:17 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       vary:

tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_eval_metric_writer.test_send_metric_bad_api_key.yaml (+6 −5)
@@ -1,15 +1,16 @@
 interactions:
 - request:
-    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
-      "12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
-      "very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}'
+    body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
+      {"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
+      "categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
+      "timestamp_ms": 1732568297307}]}}}'
     headers:
       Content-Type:
       - application/json
       DD-API-KEY:
       - XXXXXX
     method: POST
-    uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
+    uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
   response:
     body:
       string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"[email protected]"}'
@@ -21,7 +22,7 @@ interactions:
       content-type:
       - application/json
       date:
-      - Wed, 21 Aug 2024 14:11:40 GMT
+      - Mon, 25 Nov 2024 20:58:17 GMT
       strict-transport-security:
       - max-age=31536000; includeSubDomains; preload
       x-content-type-options: