diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py index 2937eddff3a..1c519d794eb 100644 --- a/ddtrace/llmobs/_experiment.py +++ b/ddtrace/llmobs/_experiment.py @@ -6,6 +6,7 @@ from dataclasses import field import re import sys +import time import traceback from typing import TYPE_CHECKING from typing import Any @@ -37,6 +38,7 @@ from ddtrace.llmobs import LLMObs from ddtrace.llmobs._writer import LLMObsExperimentEvalMetricEvent from ddtrace.llmobs._writer import LLMObsExperimentsClient + from ddtrace.llmobs.types import ExportedLLMObsSpan logger = get_logger(__name__) @@ -743,6 +745,11 @@ class Experiment: _evaluators: Sequence[Union[EvaluatorType, AsyncEvaluatorType]] _summary_evaluators: Sequence[Union[SummaryEvaluatorType, AsyncSummaryEvaluatorType]] + @classmethod + def _NO_OP_TASK(cls, input_data, config): + """No-op task used when initializing distributed experiment objects on remote hosts.""" + return None + def __init__( self, name: str, @@ -756,6 +763,7 @@ def __init__( _llmobs_instance: Optional["LLMObs"] = None, summary_evaluators: Optional[Sequence[Union[SummaryEvaluatorType, AsyncSummaryEvaluatorType]]] = None, runs: Optional[int] = None, + is_distributed: Optional[bool] = False, ) -> None: self.name = name self._task = task @@ -774,6 +782,7 @@ def __init__( self._config["filtered_record_tags"] = cast(JSONType, dataset.filter_tags) self._runs: int = runs or 1 self._llmobs_instance = _llmobs_instance + self._is_distributed = is_distributed if not project_name: raise ValueError( @@ -785,6 +794,7 @@ def __init__( self._project_id: Optional[str] = None self._id: Optional[str] = None self._run_name: Optional[str] = None + self.experiment_span: Optional["ExportedLLMObsSpan"] = None @property def url(self) -> str: @@ -983,7 +993,7 @@ def _prepare_summary_evaluator_data( return inputs, outputs, expected_outputs, metadata_list, eval_results_by_name - def _setup_experiment(self, llmobs_not_enabled_error: str) -> None: + def _setup_experiment(self, llmobs_not_enabled_error: str, ensure_unique: bool = True) -> None: if not self._llmobs_instance or not self._llmobs_instance.enabled: raise ValueError(llmobs_not_enabled_error) @@ -1000,6 +1010,7 @@ def _setup_experiment(self, llmobs_not_enabled_error: str) -> None: convert_tags_dict_to_list(self._tags), self._description, self._runs, + ensure_unique, ) self._id = experiment_id self._tags["experiment_id"] = str(experiment_id) @@ -1070,6 +1081,8 @@ async def _process_record( experiment_name=self.name, ) as span: span_context = self._llmobs_instance.export_span(span=span) + if self._is_distributed: + self.experiment_span = span_context if span_context: span_id = span_context.get("span_id", "") trace_id = span_context.get("trace_id", "") @@ -1336,6 +1349,92 @@ async def _evaluate_summary_single(summary_evaluator: Any) -> tuple[str, dict[st return evaluations + async def _run_task_single_iteration( + self, + jobs: int = 1, + raise_errors: bool = False, + run_iteration: Optional[int] = 0, + ) -> ExperimentResult: + run = _ExperimentRunInfo(run_iteration or 0) + self._tags["run_id"] = str(run._id) + self._tags["run_iteration"] = str(run._run_iteration) + task_results = await self._run_task(jobs, run, raise_errors, None) + evaluations = await self._run_evaluators(task_results, raise_errors=raise_errors, jobs=jobs) + run_result = self._merge_results(run, task_results, evaluations, []) + experiment_evals = self._generate_metrics_from_exp_results(run_result) + self._llmobs_instance._dne_client.experiment_eval_post( # type: ignore[union-attr] + cast(str, self._id), experiment_evals, convert_tags_dict_to_list(self._tags) + ) + return { + "summary_evaluations": {}, + "rows": [], + "runs": [run_result], + } + + def _submit_eval_metric( + self, + eval_name: str, + eval_value: JSONType, + span: Optional["ExportedLLMObsSpan"] = None, + timestamp_ms: Optional[int] = None, + is_summary_eval: Optional[bool] = None, + reasoning: Optional[str] = None, + assessment: Optional[str] = None, + metadata: Optional[dict[str, JSONType]] = None, + tags: Optional[dict[str, str]] = None, + ) -> None: + """Submit an evaluation metric for a distributed experiment. + + :param eval_name: Name of the evaluation metric + :param eval_value: Value of the evaluation metric + :param span: Optional span context dict with span_id and trace_id. If None and not a + summary eval, uses the last span from _run_task_single_iteration. + :param timestamp_ms: Optional timestamp in milliseconds + :param is_summary_eval: Whether this is a summary-level evaluation + :param reasoning: Optional reasoning string + :param assessment: Optional assessment string + :param metadata: Optional metadata dict + :param tags: Optional tags dict + """ + if not self._is_distributed: + raise ValueError("this method is only used for distributed experiments") + + if span is not None and ( + not isinstance(span, dict) + or not isinstance(span.get("span_id"), str) + or not isinstance(span.get("trace_id"), str) + ): + raise TypeError( + "`span` must be a dictionary containing both span_id and trace_id keys. " + "LLMObs.export_span() can be used to generate this dictionary from a given span." + ) + + if span is None and not is_summary_eval and self.experiment_span is None: + raise TypeError("unexpected state, must supply span or must run the experiment first") + + if span is None and not is_summary_eval: + span = self.experiment_span + + timestamp_ns = int(timestamp_ms * 1e6) if timestamp_ms is not None else int(time.time() * 1e9) + + eval_metric = self._generate_metric_from_evaluation( + eval_name, + eval_value, + None, + span.get("span_id", "") if span else "", + span.get("trace_id", "") if span else "", + timestamp_ns, + "summary" if is_summary_eval else "custom", + reasoning, + assessment, + metadata, + tags, + ) + + self._llmobs_instance._dne_client.experiment_eval_post( # type: ignore[union-attr] + cast(str, self._id), [eval_metric], convert_tags_dict_to_list(self._tags) + ) + class SyncExperiment: """Thin synchronous wrapper around the async-native ``Experiment``. diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index e1e3b1f5c28..2ec321bb9f1 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -107,8 +107,10 @@ from ddtrace.llmobs._experiment import Dataset from ddtrace.llmobs._experiment import DatasetRecord from ddtrace.llmobs._experiment import DatasetRecordInputType +from ddtrace.llmobs._experiment import EvaluatorResult from ddtrace.llmobs._experiment import EvaluatorType from ddtrace.llmobs._experiment import Experiment +from ddtrace.llmobs._experiment import ExperimentResult from ddtrace.llmobs._experiment import JSONType from ddtrace.llmobs._experiment import Project from ddtrace.llmobs._experiment import SummaryEvaluatorType @@ -1328,6 +1330,73 @@ def async_experiment( runs=runs, ) + @classmethod + def _distributed_experiment( + cls, + name: str, + dataset: Dataset, + description: str = "", + project_name: Optional[str] = None, + tags: Optional[dict[str, str]] = None, + config: Optional[ConfigType] = None, + runs: Optional[int] = 1, + ) -> Experiment: + experiment = Experiment( + name, + Experiment._NO_OP_TASK, + dataset, + [], + project_name=project_name or cls._project_name, + tags=tags, + description=description, + config=config, + _llmobs_instance=cls._instance, + runs=runs, + is_distributed=True, + ) + experiment._setup_experiment( + "LLMObs is not enabled. Ensure LLM Observability is enabled via `LLMObs.enable(...)`", + ensure_unique=False, + ) + return experiment + + @classmethod + def _run_for_experiment( + cls, + experiment_id: str, + task: Callable[[DatasetRecordInputType, Optional[ConfigType]], JSONType], + dataset_records: list[DatasetRecord], + evaluators: list[ + Union[ + Callable[[DatasetRecordInputType, JSONType, JSONType], Union[JSONType, EvaluatorResult]], + Callable[[], Union[JSONType, EvaluatorResult]], + ] + ], + jobs: int = 1, + raise_errors: bool = False, + run_iteration: Optional[int] = 0, + tags: Optional[dict[str, str]] = None, + ) -> tuple[Experiment, ExperimentResult]: + if not cls._instance or not cls._instance.enabled: + raise ValueError("LLMObs is not enabled. Ensure LLM Observability is enabled via `LLMObs.enable(...)`") + experiment = cls._instance._dne_client.experiment_get(experiment_id) + experiment._llmobs_instance = cls._instance + experiment._dataset._records = dataset_records + experiment._task = task + experiment._evaluators = evaluators # type: ignore[assignment] + + coro = experiment._run_task_single_iteration(jobs, raise_errors, run_iteration) + try: + asyncio.get_running_loop() + except RuntimeError: + results = asyncio.run(coro) + else: + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + results = pool.submit(asyncio.run, coro).result() + return experiment, results + @classmethod def register_processor(cls, processor: Optional[Callable[[LLMObsSpan], Optional[LLMObsSpan]]] = None) -> None: """Register a processor to be called on each LLMObs span. diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py index afc8297fc76..712cc0ae4bf 100644 --- a/ddtrace/llmobs/_writer.py +++ b/ddtrace/llmobs/_writer.py @@ -38,6 +38,7 @@ from ddtrace.llmobs._experiment import Dataset from ddtrace.llmobs._experiment import DatasetRecord from ddtrace.llmobs._experiment import DatasetRecordRaw +from ddtrace.llmobs._experiment import Experiment from ddtrace.llmobs._experiment import JSONType from ddtrace.llmobs._experiment import Project from ddtrace.llmobs._experiment import UpdatableDatasetRecord @@ -662,6 +663,67 @@ def project_create_or_get(self, name: Optional[str] = None) -> Project: return project + def experiment_get(self, id: str, tag_overrides: Optional[dict[str, str]] = None) -> "Experiment": # noqa: A002 + path = f"/api/v2/llm-obs/v1/experiments?filter[id]={id}" + resp = self.request("GET", path) + if resp.status != 200: + raise ValueError(f"Failed to get experiment with ID {id}: {resp.status} {resp.get_json()}") + response_data = resp.get_json() + experiments = response_data.get("data", []) + if len(experiments) < 1: + raise ValueError(f"No experiments found for ID {id}") + experiment = experiments[0]["attributes"] + project_id = experiment["project_id"] + dataset_id = experiment["dataset_id"] + + tags: list[str] = experiment["metadata"].get("tags", []) + tags_dict: dict[str, str] = {} + for tag in tags: + kv = tag.split(":", 1) + if len(kv) == 2: + tags_dict[kv[0]] = kv[1] + + if tag_overrides: + tags_dict.update(tag_overrides) + + # TODO[gh] attempt to find the project & dataset name through tags if possible, + # temporary hack to avoid extra API calls + project_name = tags_dict.get("project_name", project_id) + dataset_name = tags_dict.get("dataset_name", dataset_id) + + project = Project(name=project_name, _id=project_id) + + dataset = Dataset( + name=dataset_name, + project=project, + dataset_id=dataset_id, + records=[], + # TODO[gh] need to fully pull dataset for this to be accurate, not critical for now + description="", + # TODO[gh] this may be incorrect + latest_version=experiment["dataset_version"], + version=experiment["dataset_version"], + _dne_client=self, + ) + + experiment_obj = Experiment( + name=experiment["experiment"], + task=Experiment._NO_OP_TASK, + dataset=dataset, + evaluators=[], + project_name=project_name, + tags=tags_dict, + description=experiment["description"], + config=experiment.get("config", {}), + _llmobs_instance=None, + runs=experiment["run_count"], + is_distributed=True, + ) + experiment_obj._run_name = experiment["name"] + experiment_obj._id = id + experiment_obj._project_id = project_id + return experiment_obj + def experiment_create( self, name: str, @@ -672,6 +734,7 @@ def experiment_create( tags: Optional[list[str]] = None, description: Optional[str] = None, runs: Optional[int] = 1, + ensure_unique: bool = True, ) -> tuple[str, str]: path = "/api/unstable/llm-obs/v1/experiments" resp = self.request( @@ -688,7 +751,7 @@ def experiment_create( "dataset_version": dataset_version, "config": exp_config or {}, "metadata": {"tags": cast(JSONType, tags or [])}, - "ensure_unique": True, + "ensure_unique": ensure_unique, "run_count": runs, }, } diff --git a/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml b/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml index d0d2f41093f..a84f2b55a1f 100644 --- a/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml +++ b/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml @@ -1,6 +1,6 @@ --- features: - | - LLM Observability: Subset of dataset records can now be pulled with tags by using the ``tags`` argument + LLM Observability: Subset of dataset records can now be pulled with tags by using the ``tags`` argument to ``LLMObs.pull_dataset``, provided in a list of strings of key value pairs: ``LLMObs.pull_dataset(dataset_name="my-dataset", tags=["env:prod", "version:1.0"])`` diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_1a7b4baf-d4c5-4053-9afd-d2e656c80757_events_post_e68fdc48.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_1a7b4baf-d4c5-4053-9afd-d2e656c80757_events_post_e68fdc48.yaml new file mode 100644 index 00000000000..fae04bd567a --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_1a7b4baf-d4c5-4053-9afd-d2e656c80757_events_post_e68fdc48.yaml @@ -0,0 +1,50 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments", + "metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456", + "timestamp_ms": 1771602113367, "metric_type": "score", "label": "dummy_evaluator", + "score_value": 0, "error": null, "tags": [], "experiment_id": "1a7b4baf-d4c5-4053-9afd-d2e656c80757"}], + "tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", "dataset_name:test-dataset-123", + "experiment_name:test_run_for_experiment", "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8", + "run_id:12345678-abcd-abcd-abcd-123456789012", "run_iteration:1"]}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '603' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/1a7b4baf-d4c5-4053-9afd-d2e656c80757/events + response: + body: + string: '' + headers: + content-length: + - '0' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:06:47 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 202 + message: Accepted +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_2fa3eedc.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_2fa3eedc.yaml new file mode 100644 index 00000000000..c3ba886c6bd --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_2fa3eedc.yaml @@ -0,0 +1,50 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_raises_on_invalid_span", + "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id": + "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {}, + "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", + "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_raises_on_invalid_span", + "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false, + "run_count": 1}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '547' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments + response: + body: + string: '{"data":{"id":"ff62ff7c-5099-4570-915c-023ae44b56d3","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:08:34.453515088Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_raises_on_invalid_span","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_raises_on_invalid_span","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_raises_on_invalid_span","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:08:34.453515178Z"}}}' + headers: + content-length: + - '752' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:08:34 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_42c62a09.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_42c62a09.yaml new file mode 100644 index 00000000000..9da4c91be5c --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_42c62a09.yaml @@ -0,0 +1,51 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"name": "test_distributed_experiment", + "description": "A distributed experiment", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", + "project_id": "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, + "config": {"models": ["gpt-4.1"]}, "metadata": {"tags": ["ddtrace.version:4.5.0rc4", + "project_name:test-project-clean", "dataset_name:test-dataset-123", "experiment_name:test_distributed_experiment", + "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false, + "run_count": 1}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '554' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments + response: + body: + string: '{"data":{"id":"0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{"models":["gpt-4.1"]},"created_at":"2026-02-20T17:05:49.52469781Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"A + distributed experiment","experiment":"test_distributed_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_distributed_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_distributed_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:49.524697884Z"}}}' + headers: + content-length: + - '738' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:05:49 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_49547366.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_49547366.yaml new file mode 100644 index 00000000000..537e67b3054 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_49547366.yaml @@ -0,0 +1,50 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_raises_when_no_span_available", + "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id": + "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {}, + "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", + "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_raises_when_no_span_available", + "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false, + "run_count": 1}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '561' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments + response: + body: + string: '{"data":{"id":"e000911f-2410-4114-b635-97884a0515ed","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:08:36.940475557Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_raises_when_no_span_available","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_raises_when_no_span_available","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_raises_when_no_span_available","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:08:36.940475631Z"}}}' + headers: + content-length: + - '773' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:08:36 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_7159ccda.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_7159ccda.yaml new file mode 100644 index 00000000000..ccf07d9afb0 --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_7159ccda.yaml @@ -0,0 +1,50 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_summary", + "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id": + "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {}, + "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", + "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_summary", + "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false, + "run_count": 1}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '517' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments + response: + body: + string: '{"data":{"id":"da544dd1-f1ca-4720-b39d-b7ea32ec0681","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:08:07.047455296Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_summary","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_summary","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_summary","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:08:07.047455387Z"}}}' + headers: + content-length: + - '707' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:08:07 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_bd334a87.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_bd334a87.yaml new file mode 100644 index 00000000000..e26c2fb96ad --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_bd334a87.yaml @@ -0,0 +1,50 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"name": "test_run_for_experiment", + "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id": + "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {}, + "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", + "dataset_name:test-dataset-123", "experiment_name:test_run_for_experiment", + "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false, + "run_count": 1}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '501' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments + response: + body: + string: '{"data":{"id":"1a7b4baf-d4c5-4053-9afd-d2e656c80757","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:05:29.86277Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_run_for_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_run_for_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_run_for_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:29.86277Z"}}}' + headers: + content-length: + - '675' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:06:37 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_ee65f763.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_ee65f763.yaml new file mode 100644 index 00000000000..bd2eedbef2c --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_ee65f763.yaml @@ -0,0 +1,50 @@ +interactions: +- request: + body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_with_explicit_span", + "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id": + "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {}, + "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", + "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_with_explicit_span", + "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false, + "run_count": 1}}}' + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + Content-Length: + - '539' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: POST + uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments + response: + body: + string: '{"data":{"id":"8216d32c-28ec-4e57-a6d3-fbb859c249c7","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:07:41.837856806Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_with_explicit_span","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_with_explicit_span","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_with_explicit_span","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:07:41.837856921Z"}}}' + headers: + content-length: + - '740' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:07:41 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca_get_7fa7e32e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca_get_7fa7e32e.yaml new file mode 100644 index 00000000000..8be93f1017e --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca_get_7fa7e32e.yaml @@ -0,0 +1,46 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/v2/llm-obs/v1/experiments?filter%5Bid%5D=0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca + response: + body: + string: '{"data":[{"id":"0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{"models":["gpt-4.1"]},"created_at":"2026-02-20T17:05:49.524697Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"A + distributed experiment","experiment":"test_distributed_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_distributed_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_distributed_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:49.524697Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '755' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:05:59 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__1a7b4baf-d4c5-4053-9afd-d2e656c80757_get_040454bc.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__1a7b4baf-d4c5-4053-9afd-d2e656c80757_get_040454bc.yaml new file mode 100644 index 00000000000..7fcc776c0fd --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__1a7b4baf-d4c5-4053-9afd-d2e656c80757_get_040454bc.yaml @@ -0,0 +1,45 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + ? !!python/object/apply:multidict._multidict.istr + - Accept-Encoding + : - identity + Connection: + - keep-alive + ? !!python/object/apply:multidict._multidict.istr + - Content-Length + : - '0' + ? !!python/object/apply:multidict._multidict.istr + - Content-Type + : - application/json + User-Agent: + - python-requests/2.32.3 + method: GET + uri: https://api.datadoghq.com/api/v2/llm-obs/v1/experiments?filter%5Bid%5D=1a7b4baf-d4c5-4053-9afd-d2e656c80757 + response: + body: + string: '{"data":[{"id":"1a7b4baf-d4c5-4053-9afd-d2e656c80757","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:05:29.86277Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_run_for_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_run_for_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_run_for_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:29.86277Z"}}],"meta":{"after":""}}' + headers: + content-length: + - '697' + content-security-policy: + - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com + content-type: + - application/vnd.api+json + date: + - Fri, 20 Feb 2026 17:06:47 GMT + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - SAMEORIGIN + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py index 54454783b7d..1b563d89353 100644 --- a/tests/llmobs/test_experiments.py +++ b/tests/llmobs/test_experiments.py @@ -27,6 +27,7 @@ from ddtrace.llmobs._experiment import Dataset from ddtrace.llmobs._experiment import DatasetRecord from ddtrace.llmobs._experiment import EvaluatorResult +from ddtrace.llmobs._experiment import Experiment from ddtrace.llmobs._experiment import _ExperimentRunInfo from tests.utils import override_global_config @@ -2409,6 +2410,184 @@ def simple_evaluator(input_data, output_data, expected_output): assert summary_evals_dict["successful_summary_evaluator"]["error"] is None +# ============================================================================= +# Distributed Experiment Tests +# ============================================================================= + + +def test_distributed_experiment(llmobs, test_dataset_one_record): + """Test that _distributed_experiment creates an experiment with is_distributed=True.""" + exp = llmobs._distributed_experiment( + "test_distributed_experiment", + dataset=test_dataset_one_record, + description="A distributed experiment", + config={"models": ["gpt-4.1"]}, + ) + assert exp._is_distributed is True + assert exp._task == Experiment._NO_OP_TASK + assert exp.name == "test_distributed_experiment" + assert exp._description == "A distributed experiment" + assert exp._config == {"models": ["gpt-4.1"]} + assert exp._id is not None + assert exp._run_name is not None + assert exp._project_id is not None + + wait_for_backend(10) + + # Verify the experiment exists on the backend + fetched = llmobs._instance._dne_client.experiment_get(exp._id) + assert fetched._id == exp._id + assert fetched._is_distributed is True + assert fetched.name == "test_distributed_experiment" + assert fetched._project_id == exp._project_id + + # ensure that multiple calls with the same name will not create a new unique experiment + exp_repeated = llmobs._distributed_experiment( + "test_distributed_experiment", + dataset=test_dataset_one_record, + description="A distributed experiment", + config={"models": ["gpt-4.1"]}, + ) + assert exp_repeated._id == exp._id + assert exp_repeated._is_distributed is True + assert exp_repeated._task == Experiment._NO_OP_TASK + assert exp_repeated.name == "test_distributed_experiment" + assert exp_repeated._description == "A distributed experiment" + assert exp_repeated._config == {"models": ["gpt-4.1"]} + assert exp_repeated._id is not None + assert exp_repeated._run_name is not None + assert exp_repeated._project_id is not None + + +def test_run_for_experiment(llmobs, test_dataset_one_record): + """Test _run_for_experiment fetches an experiment by ID, assigns task/evaluators, and runs.""" + exp = llmobs._distributed_experiment( + "test_run_for_experiment", + dataset=test_dataset_one_record, + ) + experiment_id = exp._id + + wait_for_backend(10) + + records = [ + DatasetRecord( + input_data={"prompt": "What is the capital of France?"}, + expected_output={"answer": "Paris"}, + ) + ] + + with mock.patch("ddtrace.llmobs._experiment.Experiment._process_record") as mock_process_record: + mock_process_record.return_value = { + "idx": 0, + "span_id": "123", + "trace_id": "456", + "timestamp": MOCK_TIMESTAMP_NS, + "output": {"prompt": "What is the capital of France?"}, + "metadata": { + "dataset_record_index": 0, + "experiment_name": "test_run_for_experiment", + "dataset_name": "test-dataset-123", + }, + "error": {"message": None, "type": None, "stack": None}, + } + with mock.patch("ddtrace.llmobs._experiment._ExperimentRunInfo") as mock_experiment_run_info: + mock_experiment_run_info.return_value = run_info_with_stable_id(0) + returned_exp, results = llmobs._run_for_experiment( + experiment_id=experiment_id, + task=dummy_task, + dataset_records=records, + evaluators=[dummy_evaluator], + ) + + assert returned_exp._id == experiment_id + assert returned_exp._is_distributed is True + assert returned_exp._task == dummy_task + assert returned_exp._evaluators == [dummy_evaluator] + assert len(results["runs"]) == 1 + assert len(results["runs"][0].rows) == 1 + + +def test_submit_eval_metric_with_explicit_span(llmobs, test_dataset_one_record): + """Test _submit_eval_metric with an explicitly provided span context.""" + exp = llmobs._distributed_experiment( + "test_submit_eval_metric_with_explicit_span", + dataset=test_dataset_one_record, + ) + + wait_for_backend(10) + + with mock.patch.object(llmobs._instance._dne_client, "experiment_eval_post") as mock_eval_post: + exp._submit_eval_metric( + eval_name="accuracy", + eval_value=0.95, + span={"span_id": "abc123", "trace_id": "def456"}, + ) + mock_eval_post.assert_called_once() + eval_metrics = mock_eval_post.call_args[0][1] + assert len(eval_metrics) == 1 + assert eval_metrics[0]["label"] == "accuracy" + assert eval_metrics[0]["score_value"] == 0.95 + assert eval_metrics[0]["span_id"] == "abc123" + assert eval_metrics[0]["trace_id"] == "def456" + assert eval_metrics[0]["metric_source"] == "custom" + + +def test_submit_eval_metric_summary(llmobs, test_dataset_one_record): + """Test _submit_eval_metric for a summary evaluation.""" + exp = llmobs._distributed_experiment( + "test_submit_eval_metric_summary", + dataset=test_dataset_one_record, + ) + + wait_for_backend(10) + + with mock.patch.object(llmobs._instance._dne_client, "experiment_eval_post") as mock_eval_post: + exp._submit_eval_metric( + eval_name="overall_quality", + eval_value=42, + is_summary_eval=True, + ) + eval_metrics = mock_eval_post.call_args[0][1] + assert eval_metrics[0]["metric_source"] == "summary" + assert eval_metrics[0]["span_id"] == "" + assert eval_metrics[0]["trace_id"] == "" + + +def test_submit_eval_metric_raises_when_not_distributed(llmobs, test_dataset_one_record): + """Test _submit_eval_metric raises when experiment is not distributed.""" + exp = llmobs.experiment( + "test_experiment", + dummy_task, + test_dataset_one_record, + [dummy_evaluator], + ) + with pytest.raises(ValueError, match="this method is only used for distributed experiments"): + exp._experiment._submit_eval_metric(eval_name="test", eval_value=1.0) + + +def test_submit_eval_metric_raises_on_invalid_span(llmobs, test_dataset_one_record): + """Test _submit_eval_metric raises on invalid span format.""" + exp = llmobs._distributed_experiment( + "test_submit_eval_metric_raises_on_invalid_span", + dataset=test_dataset_one_record, + ) + with pytest.raises(TypeError, match="`span` must be a dictionary"): + exp._submit_eval_metric(eval_name="test", eval_value=1.0, span="not-a-dict") + + with pytest.raises(TypeError, match="`span` must be a dictionary"): + exp._submit_eval_metric(eval_name="test", eval_value=1.0, span={"span_id": "abc"}) + + +def test_submit_eval_metric_raises_when_no_span_available(llmobs, test_dataset_one_record): + """Test _submit_eval_metric raises when no span is provided and experiment_span is None.""" + exp = llmobs._distributed_experiment( + "test_submit_eval_metric_raises_when_no_span_available", + dataset=test_dataset_one_record, + ) + with pytest.raises(TypeError, match="unexpected state, must supply span or must run the experiment first"): + exp._submit_eval_metric(eval_name="test", eval_value=1.0) + + # ============================================================================= # AsyncExperiment Tests # =============================================================================