Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(llmobs): support joining custom evaluations via tags #11535

Merged
merged 27 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 132 additions & 3 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from ddtrace.internal.service import ServiceStatusError
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.internal.utils.formats import asbool
from ddtrace.internal.utils.formats import parse_tags_str
from ddtrace.llmobs import _constants as constants
Expand Down Expand Up @@ -66,6 +67,7 @@
from ddtrace.llmobs.utils import ExportedLLMObsSpan
from ddtrace.llmobs.utils import Messages
from ddtrace.propagation.http import HTTPPropagator
from ddtrace.vendor.debtcollector import deprecate


log = get_logger(__name__)
Expand Down Expand Up @@ -904,6 +906,127 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None:
return
span._set_ctx_item(METRICS, metrics)

@classmethod
def submit_evaluation_for(
    cls,
    label: str,
    metric_type: str,
    value: Union[str, int, float],
    span: Optional[dict] = None,
    span_with_tag_value: Optional[Dict[str, str]] = None,
    tags: Optional[Dict[str, str]] = None,
    ml_app: Optional[str] = None,
    timestamp_ms: Optional[int] = None,
) -> None:
    """
    Submits a custom evaluation metric for a given span.

    :param str label: The name of the evaluation metric.
    :param str metric_type: The type of the evaluation metric. One of "categorical", "score".
    :param value: The value of the evaluation metric.
        Must be a string (categorical), integer (score), or float (score).
    :param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
        the span associated with this evaluation.
    :param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str}
        uniquely identifying the span associated with this evaluation.
    :param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
    :param str ml_app: The name of the ML application
    :param int timestamp_ms: The unix timestamp in milliseconds when the evaluation metric result was generated.
        If not set, the current time will be used.

    :raises ValueError: if neither or both of `span`/`span_with_tag_value` are given, if `label` is
        empty, if `metric_type` is unrecognized, or if `timestamp_ms` is not a non-negative integer.
    :raises TypeError: if `span`/`span_with_tag_value` are malformed, or if `value` does not match
        `metric_type`.
    """
    if cls.enabled is False:
        # BUG FIX: this was log.debug("...enabled. ", "...not be sent.") — the second string
        # was passed as an unused %-format argument and never appeared in the log output.
        log.debug(
            "LLMObs.submit_evaluation_for() called when LLMObs is not enabled. "
            "Evaluation metric data will not be sent."
        )
        return

    # XOR: exactly one of the two join mechanisms must be provided.
    has_exactly_one_joining_key = (span is not None) ^ (span_with_tag_value is not None)

    if not has_exactly_one_joining_key:
        raise ValueError(
            "Exactly one of `span` or `span_with_tag_value` must be specified to submit an evaluation metric."
        )

    join_on = {}
    if span is not None:
        if (
            not isinstance(span, dict)
            or not isinstance(span.get("span_id"), str)
            or not isinstance(span.get("trace_id"), str)
        ):
            raise TypeError(
                "`span` must be a dictionary containing both span_id and trace_id keys. "
                "LLMObs.export_span() can be used to generate this dictionary from a given span."
            )
        join_on["span"] = span
    elif span_with_tag_value is not None:
        if (
            not isinstance(span_with_tag_value, dict)
            or not isinstance(span_with_tag_value.get("tag_key"), str)
            or not isinstance(span_with_tag_value.get("tag_value"), str)
        ):
            raise TypeError(
                "`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values"
            )
        join_on["tag"] = {
            "key": span_with_tag_value.get("tag_key"),
            "value": span_with_tag_value.get("tag_value"),
        }

    timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)

    if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
        raise ValueError("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")

    if not label:
        raise ValueError("label must be the specified name of the evaluation metric.")

    metric_type = metric_type.lower()
    if metric_type not in ("categorical", "score"):
        raise ValueError("metric_type must be one of 'categorical' or 'score'.")

    if metric_type == "categorical" and not isinstance(value, str):
        raise TypeError("value must be a string for a categorical metric.")
    if metric_type == "score" and not isinstance(value, (int, float)):
        raise TypeError("value must be an integer or float for a score metric.")

    if tags is not None and not isinstance(tags, dict):
        log.warning("tags must be a dictionary of string key-value pairs.")
        tags = {}

    # BUG FIX: resolve the ml_app fallback BEFORE building the tag dict. Previously the
    # "ml_app" tag was built from the raw parameter, so omitting ml_app produced an
    # "ml_app:None" tag even though the event's ml_app field used the configured default.
    ml_app = ml_app if ml_app else config._llmobs_ml_app
    if not ml_app:
        log.warning(
            "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
            "Ensure this configuration is set before running your application."
        )
        return

    evaluation_tags = {
        "ddtrace.version": ddtrace.__version__,
        "ml_app": ml_app,
    }

    if tags:
        for k, v in tags.items():
            try:
                evaluation_tags[ensure_text(k)] = ensure_text(v)
            except TypeError:
                log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

    evaluation_metric = {
        "join_on": join_on,
        "label": str(label),
        "metric_type": metric_type,
        "timestamp_ms": timestamp_ms,
        # Exactly one of "categorical_value" / "score_value" depending on metric_type.
        "{}_value".format(metric_type): value,
        "ml_app": ml_app,
        "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
    }

    cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)

@classmethod
def submit_evaluation(
cls,
Expand All @@ -916,6 +1039,13 @@ def submit_evaluation(
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
) -> None:
deprecate(
"Using `LLMObs.submit_evaluation` is deprecated",
message="Please use `LLMObs.submit_evaluation_for` instead.",
removal_version="3.0.0",
category=DDTraceDeprecationWarning,
)

"""
Submits a custom evaluation metric for a given span ID and trace ID.

Expand All @@ -931,7 +1061,7 @@ def submit_evaluation(
evaluation metric.
"""
if cls.enabled is False:
log.warning(
log.debug(
"LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent."
)
return
Expand Down Expand Up @@ -1007,8 +1137,7 @@ def submit_evaluation(
log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

evaluation_metric = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
"label": str(label),
"metric_type": metric_type.lower(),
"timestamp_ms": timestamp_ms,
Expand Down
12 changes: 9 additions & 3 deletions ddtrace/llmobs/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict):


class LLMObsEvaluationMetricEvent(TypedDict, total=False):
span_id: str
trace_id: str
join_on: Dict[str, Dict[str, str]]
metric_type: str
label: str
categorical_value: str
Expand Down Expand Up @@ -107,6 +106,13 @@ def periodic(self) -> None:
events = self._buffer
self._buffer = []

if not self._headers.get("DD-API-KEY"):
logger.warning(
"DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. ",
"Ensure this configuration is set before running your application.",
)
return

data = self._data(events)
enc_llm_events = safe_json(data)
conn = httplib.HTTPSConnection(self._intake, 443, timeout=self._timeout)
Expand Down Expand Up @@ -154,7 +160,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) ->
super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout)
self._event_type = "evaluation_metric"
self._buffer = []
self._endpoint = "/api/intake/llm-obs/v1/eval-metric"
self._endpoint = "/api/intake/llm-obs/v2/eval-metric"
self._intake = "api.%s" % self._site # type: str

def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None:
Expand Down
17 changes: 17 additions & 0 deletions releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
---
features:
- |
LLM Observability: This introduces the `LLMObs.submit_evaluation_for` method, which provides the ability to join a custom evaluation
to a span using a tag key-value pair on the span. The tag key-value pair is expected to uniquely identify a single span.
Tag-based joining is an alternative to the existing method of joining evaluations to spans using trace and span IDs.
Example usage:
- Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"}, label="rating", ...)`.
- Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`.
deprecations:
- |
LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in ddtrace 3.0.0.
As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead.
    To migrate, replace `LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)` with:
    `LLMObs.submit_evaluation_for(span={"span_id": ..., "trace_id": ...}, ...)`.
    You may also join an evaluation to a span using a tag key-value pair like so:
    `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": ..., "tag_value": ...}, ...)`.
16 changes: 10 additions & 6 deletions tests/llmobs/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,11 +210,13 @@ def _get_llmobs_parent_id(span: Span):


def _expected_llmobs_eval_metric_event(
span_id,
trace_id,
metric_type,
label,
ml_app,
tag_key=None,
tag_value=None,
span_id=None,
trace_id=None,
timestamp_ms=None,
categorical_value=None,
score_value=None,
Expand All @@ -223,15 +225,18 @@ def _expected_llmobs_eval_metric_event(
metadata=None,
):
eval_metric_event = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {},
"metric_type": metric_type,
"label": label,
"tags": [
"ddtrace.version:{}".format(ddtrace.__version__),
"ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"),
],
}
if tag_key is not None and tag_value is not None:
eval_metric_event["join_on"]["tag"] = {"key": tag_key, "value": tag_value}
if span_id is not None and trace_id is not None:
eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id}
if categorical_value is not None:
eval_metric_event["categorical_value"] = categorical_value
if score_value is not None:
Expand Down Expand Up @@ -542,8 +547,7 @@ def run_and_submit_evaluation(self, span):

def _dummy_evaluator_eval_metric_event(span_id, trace_id):
return LLMObsEvaluationMetricEvent(
span_id=span_id,
trace_id=trace_id,
join_on={"span": {"span_id": span_id, "trace_id": trace_id}},
score_value=1.0,
ml_app="unnamed-ml-app",
timestamp_ms=mock.ANY,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment",
"score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type":
"score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568298743}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
headers:
content-length:
- '316'
- '311'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
content-type:
- application/vnd.api+json
date:
- Wed, 21 Aug 2024 14:11:41 GMT
- Mon, 25 Nov 2024 20:58:19 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
"very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568297450}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
headers:
content-length:
- '330'
- '325'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
content-type:
- application/vnd.api+json
date:
- Wed, 21 Aug 2024 14:11:40 GMT
- Mon, 25 Nov 2024 20:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
"very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568297307}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"[email protected]"}'
Expand All @@ -21,7 +22,7 @@ interactions:
content-type:
- application/json
date:
- Wed, 21 Aug 2024 14:11:40 GMT
- Mon, 25 Nov 2024 20:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-content-type-options:
Expand Down
Loading
Loading