diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 2937eddff3a..1c519d794eb 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -6,6 +6,7 @@
 from dataclasses import field
 import re
 import sys
+import time
 import traceback
 from typing import TYPE_CHECKING
 from typing import Any
@@ -37,6 +38,7 @@
     from ddtrace.llmobs import LLMObs
     from ddtrace.llmobs._writer import LLMObsExperimentEvalMetricEvent
     from ddtrace.llmobs._writer import LLMObsExperimentsClient
+    from ddtrace.llmobs.types import ExportedLLMObsSpan
 
 
 logger = get_logger(__name__)
@@ -743,6 +745,11 @@ class Experiment:
     _evaluators: Sequence[Union[EvaluatorType, AsyncEvaluatorType]]
     _summary_evaluators: Sequence[Union[SummaryEvaluatorType, AsyncSummaryEvaluatorType]]
 
+    @classmethod
+    def _NO_OP_TASK(cls, input_data, config):
+        """No-op task (ignores its arguments, returns None) for initializing distributed experiments remotely."""
+        return None
+
     def __init__(
         self,
         name: str,
@@ -756,6 +763,7 @@ def __init__(
         _llmobs_instance: Optional["LLMObs"] = None,
         summary_evaluators: Optional[Sequence[Union[SummaryEvaluatorType, AsyncSummaryEvaluatorType]]] = None,
         runs: Optional[int] = None,
+        is_distributed: Optional[bool] = False,
     ) -> None:
         self.name = name
         self._task = task
@@ -774,6 +782,7 @@ def __init__(
             self._config["filtered_record_tags"] = cast(JSONType, dataset.filter_tags)
         self._runs: int = runs or 1
         self._llmobs_instance = _llmobs_instance
+        self._is_distributed = is_distributed
 
         if not project_name:
             raise ValueError(
@@ -785,6 +794,7 @@ def __init__(
         self._project_id: Optional[str] = None
         self._id: Optional[str] = None
         self._run_name: Optional[str] = None
+        self.experiment_span: Optional["ExportedLLMObsSpan"] = None
 
     @property
     def url(self) -> str:
@@ -983,7 +993,7 @@ def _prepare_summary_evaluator_data(
 
         return inputs, outputs, expected_outputs, metadata_list, eval_results_by_name
 
-    def _setup_experiment(self, llmobs_not_enabled_error: str) -> None:
+    def _setup_experiment(self, llmobs_not_enabled_error: str, ensure_unique: bool = True) -> None:
         if not self._llmobs_instance or not self._llmobs_instance.enabled:
             raise ValueError(llmobs_not_enabled_error)
 
@@ -1000,6 +1010,7 @@ def _setup_experiment(self, llmobs_not_enabled_error: str) -> None:
             convert_tags_dict_to_list(self._tags),
             self._description,
             self._runs,
+            ensure_unique,
         )
         self._id = experiment_id
         self._tags["experiment_id"] = str(experiment_id)
@@ -1070,6 +1081,8 @@ async def _process_record(
                 experiment_name=self.name,
             ) as span:
                 span_context = self._llmobs_instance.export_span(span=span)
+                if self._is_distributed:
+                    self.experiment_span = span_context
                 if span_context:
                     span_id = span_context.get("span_id", "")
                     trace_id = span_context.get("trace_id", "")
@@ -1336,6 +1349,92 @@ async def _evaluate_summary_single(summary_evaluator: Any) -> tuple[str, dict[st
 
         return evaluations
 
+    async def _run_task_single_iteration(
+        self,
+        jobs: int = 1,
+        raise_errors: bool = False,
+        run_iteration: Optional[int] = 0,
+    ) -> ExperimentResult:
+        """Run the task and evaluators once, tag the run, and post the resulting eval metrics.
+
+        Returns a result whose ``runs`` holds only this run; ``rows`` and ``summary_evaluations`` stay empty.
+        """
+        run_info = _ExperimentRunInfo(run_iteration or 0)
+        self._tags["run_id"] = str(run_info._id)
+        self._tags["run_iteration"] = str(run_info._run_iteration)
+        outputs = await self._run_task(jobs, run_info, raise_errors, None)
+        evals = await self._run_evaluators(outputs, raise_errors=raise_errors, jobs=jobs)
+        merged_run = self._merge_results(run_info, outputs, evals, [])
+        metrics = self._generate_metrics_from_exp_results(merged_run)
+        self._llmobs_instance._dne_client.experiment_eval_post(  # type: ignore[union-attr]
+            cast(str, self._id), metrics, convert_tags_dict_to_list(self._tags)
+        )
+        return {"summary_evaluations": {}, "rows": [], "runs": [merged_run]}
+
+    def _submit_eval_metric(
+        self,
+        eval_name: str,
+        eval_value: JSONType,
+        span: Optional["ExportedLLMObsSpan"] = None,
+        timestamp_ms: Optional[int] = None,
+        is_summary_eval: Optional[bool] = None,
+        reasoning: Optional[str] = None,
+        assessment: Optional[str] = None,
+        metadata: Optional[dict[str, JSONType]] = None,
+        tags: Optional[dict[str, str]] = None,
+    ) -> None:
+        """Submit an evaluation metric for a distributed experiment.
+
+        :param eval_name: Name of the evaluation metric
+        :param eval_value: Value of the evaluation metric
+        :param span: Exported span dict (str span_id/trace_id); non-summary evals default to ``self.experiment_span``
+        :param timestamp_ms: Optional timestamp in milliseconds; the current time is used when omitted
+        :param is_summary_eval: Whether this is a summary-level evaluation
+        :param reasoning: Optional reasoning string
+        :param assessment: Optional assessment string
+        :param metadata: Optional metadata dict
+        :param tags: Optional tags dict
+        :raises ValueError: If the experiment was not created as a distributed experiment
+        :raises TypeError: If ``span`` is malformed, or a non-summary eval has no span given or recorded
+        """
+        if not self._is_distributed:
+            raise ValueError("this method is only used for distributed experiments")
+
+        if span is not None and (
+            not isinstance(span, dict)
+            or not isinstance(span.get("span_id"), str)
+            or not isinstance(span.get("trace_id"), str)
+        ):
+            raise TypeError(
+                "`span` must be a dictionary containing both span_id and trace_id keys. "
+                "LLMObs.export_span() can be used to generate this dictionary from a given span."
+            )
+
+        if span is None and not is_summary_eval:
+            if self.experiment_span is None:
+                raise TypeError("unexpected state, must supply span or must run the experiment first")
+            span = self.experiment_span
+
+        # Integer factor keeps the ms -> ns conversion exact; a float 1e6 rounds epoch-scale values (> 2**53).
+        timestamp_ns = int(timestamp_ms * 1_000_000) if timestamp_ms is not None else int(time.time() * 1e9)
+        eval_metric = self._generate_metric_from_evaluation(
+            eval_name,
+            eval_value,
+            None,
+            span.get("span_id", "") if span else "",
+            span.get("trace_id", "") if span else "",
+            timestamp_ns,
+            "summary" if is_summary_eval else "custom",
+            reasoning,
+            assessment,
+            metadata,
+            tags,
+        )
+
+        self._llmobs_instance._dne_client.experiment_eval_post(  # type: ignore[union-attr]
+            cast(str, self._id), [eval_metric], convert_tags_dict_to_list(self._tags)
+        )
+
 
 class SyncExperiment:
     """Thin synchronous wrapper around the async-native ``Experiment``.
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index e1e3b1f5c28..2ec321bb9f1 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -107,8 +107,10 @@
 from ddtrace.llmobs._experiment import Dataset
 from ddtrace.llmobs._experiment import DatasetRecord
 from ddtrace.llmobs._experiment import DatasetRecordInputType
+from ddtrace.llmobs._experiment import EvaluatorResult
 from ddtrace.llmobs._experiment import EvaluatorType
 from ddtrace.llmobs._experiment import Experiment
+from ddtrace.llmobs._experiment import ExperimentResult
 from ddtrace.llmobs._experiment import JSONType
 from ddtrace.llmobs._experiment import Project
 from ddtrace.llmobs._experiment import SummaryEvaluatorType
@@ -1328,6 +1330,73 @@ def async_experiment(
             runs=runs,
         )
 
+    @classmethod
+    def _distributed_experiment(
+        cls,
+        name: str,
+        dataset: Dataset,
+        description: str = "",
+        project_name: Optional[str] = None,
+        tags: Optional[dict[str, str]] = None,
+        config: Optional[ConfigType] = None,
+        runs: Optional[int] = 1,
+    ) -> Experiment:
+        """Create a distributed experiment (no-op task, no evaluators) and run its setup before returning it."""
+        not_enabled_msg = "LLMObs is not enabled. Ensure LLM Observability is enabled via `LLMObs.enable(...)`"
+        distributed = Experiment(
+            name,
+            Experiment._NO_OP_TASK,
+            dataset,
+            [],
+            project_name=project_name or cls._project_name,
+            tags=tags,
+            description=description,
+            config=config,
+            _llmobs_instance=cls._instance,
+            runs=runs,
+            is_distributed=True,
+        )
+        # Setup defaults to ensure_unique=True; distributed experiments explicitly opt out.
+        distributed._setup_experiment(not_enabled_msg, ensure_unique=False)
+        return distributed
+
+    @classmethod
+    def _run_for_experiment(
+        cls,
+        experiment_id: str,
+        task: Callable[[DatasetRecordInputType, Optional[ConfigType]], JSONType],
+        dataset_records: list[DatasetRecord],
+        evaluators: list[
+            Union[
+                Callable[[DatasetRecordInputType, JSONType, JSONType], Union[JSONType, EvaluatorResult]],
+                Callable[[], Union[JSONType, EvaluatorResult]],
+            ]
+        ],
+        jobs: int = 1,
+        raise_errors: bool = False,
+        run_iteration: Optional[int] = 0,
+        tags: Optional[dict[str, str]] = None,
+    ) -> tuple[Experiment, ExperimentResult]:
+        """Fetch the experiment, apply ``tags`` as tag overrides, and run one task/evaluator iteration on it."""
+        if not cls._instance or not cls._instance.enabled:
+            raise ValueError("LLMObs is not enabled. Ensure LLM Observability is enabled via `LLMObs.enable(...)`")
+        experiment = cls._instance._dne_client.experiment_get(experiment_id, tag_overrides=tags)
+        experiment._llmobs_instance = cls._instance
+        experiment._dataset._records = dataset_records
+        experiment._task = task
+        experiment._evaluators = evaluators  # type: ignore[assignment]
+        coro = experiment._run_task_single_iteration(jobs, raise_errors, run_iteration)
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            results = asyncio.run(coro)
+        else:  # a loop is already running in this thread, so asyncio.run() must happen on a worker thread
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+                results = pool.submit(asyncio.run, coro).result()
+        return experiment, results
+
     @classmethod
     def register_processor(cls, processor: Optional[Callable[[LLMObsSpan], Optional[LLMObsSpan]]] = None) -> None:
         """Register a processor to be called on each LLMObs span.
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py
index afc8297fc76..712cc0ae4bf 100644
--- a/ddtrace/llmobs/_writer.py
+++ b/ddtrace/llmobs/_writer.py
@@ -38,6 +38,7 @@
 from ddtrace.llmobs._experiment import Dataset
 from ddtrace.llmobs._experiment import DatasetRecord
 from ddtrace.llmobs._experiment import DatasetRecordRaw
+from ddtrace.llmobs._experiment import Experiment
 from ddtrace.llmobs._experiment import JSONType
 from ddtrace.llmobs._experiment import Project
 from ddtrace.llmobs._experiment import UpdatableDatasetRecord
@@ -662,6 +663,67 @@ def project_create_or_get(self, name: Optional[str] = None) -> Project:
 
         return project
 
+    def experiment_get(self, id: str, tag_overrides: Optional[dict[str, str]] = None) -> "Experiment":  # noqa: A002
+        """Fetch an experiment by ID and rebuild it locally as a distributed ``Experiment``.
+
+        :raises ValueError: If the response status is not 200 or no experiment is returned for ``id``.
+        """
+        from urllib.parse import quote
+
+        # Percent-encode the ID so reserved characters (``&``, ``#``, ...) cannot alter the query string.
+        path = f"/api/v2/llm-obs/v1/experiments?filter[id]={quote(str(id), safe='')}"
+        resp = self.request("GET", path)
+        if resp.status != 200:
+            raise ValueError(f"Failed to get experiment with ID {id}: {resp.status} {resp.get_json()}")
+        experiments = resp.get_json().get("data", [])
+        if not experiments:
+            raise ValueError(f"No experiments found for ID {id}")
+        experiment = experiments[0]["attributes"]
+        project_id = experiment["project_id"]
+        dataset_id = experiment["dataset_id"]
+
+        tags: list[str] = experiment["metadata"].get("tags", [])
+        # Split each "key:value" tag on its first colon only; tags without a colon are skipped.
+        tags_dict: dict[str, str] = {key: value for key, sep, value in (t.partition(":") for t in tags) if sep}
+        tags_dict.update(tag_overrides or {})
+
+        # TODO[gh] attempt to find the project & dataset name through tags if possible,
+        # temporary hack to avoid extra API calls
+        project_name = tags_dict.get("project_name", project_id)
+        dataset_name = tags_dict.get("dataset_name", dataset_id)
+
+        project = Project(name=project_name, _id=project_id)
+        dataset = Dataset(
+            name=dataset_name,
+            project=project,
+            dataset_id=dataset_id,
+            records=[],
+            # TODO[gh] need to fully pull dataset for this to be accurate, not critical for now
+            description="",
+            # TODO[gh] this may be incorrect
+            latest_version=experiment["dataset_version"],
+            version=experiment["dataset_version"],
+            _dne_client=self,
+        )
+
+        experiment_obj = Experiment(
+            name=experiment["experiment"],
+            task=Experiment._NO_OP_TASK,
+            dataset=dataset,
+            evaluators=[],
+            project_name=project_name,
+            tags=tags_dict,
+            description=experiment["description"],
+            config=experiment.get("config", {}),
+            _llmobs_instance=None,  # no live LLMObs instance yet; callers attach one
+            runs=experiment["run_count"],
+            is_distributed=True,
+        )
+        experiment_obj._run_name = experiment["name"]
+        experiment_obj._id = id
+        experiment_obj._project_id = project_id
+        return experiment_obj
+
     def experiment_create(
         self,
         name: str,
@@ -672,6 +734,7 @@ def experiment_create(
         tags: Optional[list[str]] = None,
         description: Optional[str] = None,
         runs: Optional[int] = 1,
+        ensure_unique: bool = True,
     ) -> tuple[str, str]:
         path = "/api/unstable/llm-obs/v1/experiments"
         resp = self.request(
@@ -688,7 +751,7 @@ def experiment_create(
                         "dataset_version": dataset_version,
                         "config": exp_config or {},
                         "metadata": {"tags": cast(JSONType, tags or [])},
-                        "ensure_unique": True,
+                        "ensure_unique": ensure_unique,
                         "run_count": runs,
                     },
                 }
diff --git a/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml b/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml
index d0d2f41093f..a84f2b55a1f 100644
--- a/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml
+++ b/releasenotes/notes/llmobs-dne-experiments-dataset-records-filtering-095c0c445fec2fec.yaml
@@ -1,6 +1,6 @@
 ---
 features:
   - |
-    LLM Observability: Subset of dataset records can now be pulled with tags by using the ``tags`` argument 
+    LLM Observability: Subset of dataset records can now be pulled with tags by using the ``tags``
     argument to ``LLMObs.pull_dataset``, provided in a list of strings of key value pairs: 
     ``LLMObs.pull_dataset(dataset_name="my-dataset", tags=["env:prod", "version:1.0"])``
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_1a7b4baf-d4c5-4053-9afd-d2e656c80757_events_post_e68fdc48.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_1a7b4baf-d4c5-4053-9afd-d2e656c80757_events_post_e68fdc48.yaml
new file mode 100644
index 00000000000..fae04bd567a
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_1a7b4baf-d4c5-4053-9afd-d2e656c80757_events_post_e68fdc48.yaml
@@ -0,0 +1,50 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
+      "metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
+      "timestamp_ms": 1771602113367, "metric_type": "score", "label": "dummy_evaluator",
+      "score_value": 0, "error": null, "tags": [], "experiment_id": "1a7b4baf-d4c5-4053-9afd-d2e656c80757"}],
+      "tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean", "dataset_name:test-dataset-123",
+      "experiment_name:test_run_for_experiment", "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8",
+      "run_id:12345678-abcd-abcd-abcd-123456789012", "run_iteration:1"]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '603'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/1a7b4baf-d4c5-4053-9afd-d2e656c80757/events
+  response:
+    body:
+      string: ''
+    headers:
+      content-length:
+      - '0'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:06:47 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 202
+      message: Accepted
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_2fa3eedc.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_2fa3eedc.yaml
new file mode 100644
index 00000000000..c3ba886c6bd
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_2fa3eedc.yaml
@@ -0,0 +1,50 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_raises_on_invalid_span",
+      "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id":
+      "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {},
+      "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean",
+      "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_raises_on_invalid_span",
+      "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false,
+      "run_count": 1}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '547'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
+  response:
+    body:
+      string: '{"data":{"id":"ff62ff7c-5099-4570-915c-023ae44b56d3","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:08:34.453515088Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_raises_on_invalid_span","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_raises_on_invalid_span","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_raises_on_invalid_span","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:08:34.453515178Z"}}}'
+    headers:
+      content-length:
+      - '752'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:08:34 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_42c62a09.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_42c62a09.yaml
new file mode 100644
index 00000000000..9da4c91be5c
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_42c62a09.yaml
@@ -0,0 +1,51 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"name": "test_distributed_experiment",
+      "description": "A distributed experiment", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6",
+      "project_id": "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1,
+      "config": {"models": ["gpt-4.1"]}, "metadata": {"tags": ["ddtrace.version:4.5.0rc4",
+      "project_name:test-project-clean", "dataset_name:test-dataset-123", "experiment_name:test_distributed_experiment",
+      "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false,
+      "run_count": 1}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '554'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
+  response:
+    body:
+      string: '{"data":{"id":"0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{"models":["gpt-4.1"]},"created_at":"2026-02-20T17:05:49.52469781Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"A
+        distributed experiment","experiment":"test_distributed_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_distributed_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_distributed_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:49.524697884Z"}}}'
+    headers:
+      content-length:
+      - '738'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:05:49 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_49547366.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_49547366.yaml
new file mode 100644
index 00000000000..537e67b3054
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_49547366.yaml
@@ -0,0 +1,50 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_raises_when_no_span_available",
+      "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id":
+      "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {},
+      "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean",
+      "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_raises_when_no_span_available",
+      "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false,
+      "run_count": 1}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '561'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
+  response:
+    body:
+      string: '{"data":{"id":"e000911f-2410-4114-b635-97884a0515ed","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:08:36.940475557Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_raises_when_no_span_available","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_raises_when_no_span_available","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_raises_when_no_span_available","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:08:36.940475631Z"}}}'
+    headers:
+      content-length:
+      - '773'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:08:36 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_7159ccda.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_7159ccda.yaml
new file mode 100644
index 00000000000..ccf07d9afb0
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_7159ccda.yaml
@@ -0,0 +1,50 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_summary",
+      "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id":
+      "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {},
+      "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean",
+      "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_summary",
+      "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false,
+      "run_count": 1}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '517'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
+  response:
+    body:
+      string: '{"data":{"id":"da544dd1-f1ca-4720-b39d-b7ea32ec0681","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:08:07.047455296Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_summary","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_summary","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_summary","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:08:07.047455387Z"}}}'
+    headers:
+      content-length:
+      - '707'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:08:07 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_bd334a87.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_bd334a87.yaml
new file mode 100644
index 00000000000..e26c2fb96ad
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_bd334a87.yaml
@@ -0,0 +1,50 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"name": "test_run_for_experiment",
+      "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id":
+      "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {},
+      "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean",
+      "dataset_name:test-dataset-123", "experiment_name:test_run_for_experiment",
+      "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false,
+      "run_count": 1}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '501'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
+  response:
+    body:
+      string: '{"data":{"id":"1a7b4baf-d4c5-4053-9afd-d2e656c80757","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:05:29.86277Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_run_for_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_run_for_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_run_for_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:29.86277Z"}}}'
+    headers:
+      content-length:
+      - '675'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:06:37 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_ee65f763.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_ee65f763.yaml
new file mode 100644
index 00000000000..bd2eedbef2c
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_experiments_post_ee65f763.yaml
@@ -0,0 +1,50 @@
+interactions:
+- request:
+    body: '{"data": {"type": "experiments", "attributes": {"name": "test_submit_eval_metric_with_explicit_span",
+      "description": "", "dataset_id": "126b1329-c830-4904-94d6-c278ac3d75c6", "project_id":
+      "569aa33c-7acf-4061-abd0-4f2078e61ee8", "dataset_version": 1, "config": {},
+      "metadata": {"tags": ["ddtrace.version:4.5.0rc4", "project_name:test-project-clean",
+      "dataset_name:test-dataset-123", "experiment_name:test_submit_eval_metric_with_explicit_span",
+      "project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]}, "ensure_unique": false,
+      "run_count": 1}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '539'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments
+  response:
+    body:
+      string: '{"data":{"id":"8216d32c-28ec-4e57-a6d3-fbb859c249c7","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:07:41.837856806Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_submit_eval_metric_with_explicit_span","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_submit_eval_metric_with_explicit_span","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_submit_eval_metric_with_explicit_span","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:07:41.837856921Z"}}}'
+    headers:
+      content-length:
+      - '740'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:07:41 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca_get_7fa7e32e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca_get_7fa7e32e.yaml
new file mode 100644
index 00000000000..8be93f1017e
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca_get_7fa7e32e.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/v2/llm-obs/v1/experiments?filter%5Bid%5D=0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca
+  response:
+    body:
+      string: '{"data":[{"id":"0ef1c572-785c-4de7-9a3c-b3a5ca19f1ca","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{"models":["gpt-4.1"]},"created_at":"2026-02-20T17:05:49.524697Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"A
+        distributed experiment","experiment":"test_distributed_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_distributed_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_distributed_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:49.524697Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '755'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:05:59 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__1a7b4baf-d4c5-4053-9afd-d2e656c80757_get_040454bc.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__1a7b4baf-d4c5-4053-9afd-d2e656c80757_get_040454bc.yaml
new file mode 100644
index 00000000000..7fcc776c0fd
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_v2_llm-obs_v1_experiments_filter_id__1a7b4baf-d4c5-4053-9afd-d2e656c80757_get_040454bc.yaml
@@ -0,0 +1,45 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/v2/llm-obs/v1/experiments?filter%5Bid%5D=1a7b4baf-d4c5-4053-9afd-d2e656c80757
+  response:
+    body:
+      string: '{"data":[{"id":"1a7b4baf-d4c5-4053-9afd-d2e656c80757","type":"experiments","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"config":{},"created_at":"2026-02-20T17:05:29.86277Z","dataset_id":"126b1329-c830-4904-94d6-c278ac3d75c6","dataset_version":1,"description":"","experiment":"test_run_for_experiment","metadata":{"tags":["ddtrace.version:4.5.0rc4","project_name:test-project-clean","dataset_name:test-dataset-123","experiment_name:test_run_for_experiment","project_id:569aa33c-7acf-4061-abd0-4f2078e61ee8"]},"name":"test_run_for_experiment","project_id":"569aa33c-7acf-4061-abd0-4f2078e61ee8","run_count":1,"updated_at":"2026-02-20T17:05:29.86277Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '697'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Fri, 20 Feb 2026 17:06:47 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 54454783b7d..1b563d89353 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -27,6 +27,7 @@
 from ddtrace.llmobs._experiment import Dataset
 from ddtrace.llmobs._experiment import DatasetRecord
 from ddtrace.llmobs._experiment import EvaluatorResult
+from ddtrace.llmobs._experiment import Experiment
 from ddtrace.llmobs._experiment import _ExperimentRunInfo
 from tests.utils import override_global_config
 
@@ -2409,6 +2410,184 @@ def simple_evaluator(input_data, output_data, expected_output):
     assert summary_evals_dict["successful_summary_evaluator"]["error"] is None
 
 
+# =============================================================================
+# Distributed Experiment Tests
+# =============================================================================
+
+
+def test_distributed_experiment(llmobs, test_dataset_one_record):
+    """Creating a distributed experiment yields a no-op-task experiment that is reused on repeat calls.
+
+    The created experiment is also fetched back by ID through the experiments client.
+    """
+    experiment_name = "test_distributed_experiment"
+
+    def create_experiment():
+        # Description and config are rebuilt from literals on every call so nothing is shared between calls.
+        return llmobs._distributed_experiment(
+            experiment_name,
+            dataset=test_dataset_one_record,
+            description="A distributed experiment",
+            config={"models": ["gpt-4.1"]},
+        )
+
+    def assert_distributed_defaults(candidate):
+        assert candidate._is_distributed is True
+        assert candidate._task == Experiment._NO_OP_TASK
+        assert candidate.name == experiment_name
+        assert candidate._description == "A distributed experiment"
+        assert candidate._config == {"models": ["gpt-4.1"]}
+        assert candidate._id is not None
+        assert candidate._run_name is not None
+        assert candidate._project_id is not None
+
+    created = create_experiment()
+    assert_distributed_defaults(created)
+
+    wait_for_backend(10)
+
+    # Fetching by ID must return the same experiment from the backend.
+    remote = llmobs._instance._dne_client.experiment_get(created._id)
+    assert remote._id == created._id
+    assert remote._is_distributed is True
+    assert remote.name == experiment_name
+    assert remote._project_id == created._project_id
+
+    # A second call with the same name must resolve to the existing experiment, not create a new one.
+    repeated = create_experiment()
+    assert repeated._id == created._id
+    assert_distributed_defaults(repeated)
+
+
+def test_run_for_experiment(llmobs, test_dataset_one_record):
+    """Run an existing distributed experiment by ID with a caller-supplied task, records and evaluators."""
+    created = llmobs._distributed_experiment(
+        "test_run_for_experiment",
+        dataset=test_dataset_one_record,
+    )
+    expected_id = created._id
+
+    wait_for_backend(10)
+
+    france_record = DatasetRecord(
+        input_data={"prompt": "What is the capital of France?"},
+        expected_output={"answer": "Paris"},
+    )
+    # Canned row returned by the patched Experiment._process_record.
+    canned_row = {
+        "idx": 0,
+        "span_id": "123",
+        "trace_id": "456",
+        "timestamp": MOCK_TIMESTAMP_NS,
+        "output": {"prompt": "What is the capital of France?"},
+        "metadata": {
+            "dataset_record_index": 0,
+            "experiment_name": "test_run_for_experiment",
+            "dataset_name": "test-dataset-123",
+        },
+        "error": {"message": None, "type": None, "stack": None},
+    }
+    process_record_target = "ddtrace.llmobs._experiment.Experiment._process_record"
+    run_info_target = "ddtrace.llmobs._experiment._ExperimentRunInfo"
+
+    with mock.patch(process_record_target, return_value=canned_row), mock.patch(run_info_target) as run_info_cls:
+        run_info_cls.return_value = run_info_with_stable_id(0)
+        ran_experiment, outcome = llmobs._run_for_experiment(
+            experiment_id=expected_id,
+            task=dummy_task,
+            dataset_records=[france_record],
+            evaluators=[dummy_evaluator],
+        )
+
+    assert ran_experiment._id == expected_id
+    assert ran_experiment._is_distributed is True
+    assert ran_experiment._task == dummy_task
+    assert ran_experiment._evaluators == [dummy_evaluator]
+    assert len(outcome["runs"]) == 1
+    assert len(outcome["runs"][0].rows) == 1
+
+
+def test_submit_eval_metric_with_explicit_span(llmobs, test_dataset_one_record):
+    """A caller-supplied span dict is forwarded as the span and trace IDs of a "custom" eval metric."""
+    distributed = llmobs._distributed_experiment(
+        "test_submit_eval_metric_with_explicit_span",
+        dataset=test_dataset_one_record,
+    )
+
+    wait_for_backend(10)
+
+    span_context = {"span_id": "abc123", "trace_id": "def456"}
+    with mock.patch.object(llmobs._instance._dne_client, "experiment_eval_post") as eval_post:
+        distributed._submit_eval_metric(eval_name="accuracy", eval_value=0.95, span=span_context)
+
+    eval_post.assert_called_once()
+    # The metrics list is the second positional argument passed to experiment_eval_post.
+    submitted = eval_post.call_args[0][1]
+    assert len(submitted) == 1
+    metric = submitted[0]
+    assert metric["label"] == "accuracy"
+    assert metric["score_value"] == 0.95
+    assert metric["span_id"] == "abc123"
+    assert metric["trace_id"] == "def456"
+    assert metric["metric_source"] == "custom"
+
+
+def test_submit_eval_metric_summary(llmobs, test_dataset_one_record):
+    """A summary eval is posted once with metric_source "summary" and empty span and trace IDs."""
+    exp = llmobs._distributed_experiment(
+        "test_submit_eval_metric_summary",
+        dataset=test_dataset_one_record,
+    )
+
+    wait_for_backend(10)
+
+    with mock.patch.object(llmobs._instance._dne_client, "experiment_eval_post") as mock_eval_post:
+        exp._submit_eval_metric(eval_name="overall_quality", eval_value=42, is_summary_eval=True)
+        # Check the call happened before reading call_args: on an uncalled mock call_args is None,
+        # which would surface as an opaque TypeError instead of a clear assertion failure.
+        mock_eval_post.assert_called_once()
+        eval_metrics = mock_eval_post.call_args[0][1]
+        assert len(eval_metrics) == 1
+        assert eval_metrics[0]["metric_source"] == "summary"
+        assert eval_metrics[0]["span_id"] == ""
+        assert eval_metrics[0]["trace_id"] == ""
+
+
+def test_submit_eval_metric_raises_when_not_distributed(llmobs, test_dataset_one_record):
+    """_submit_eval_metric raises ValueError for an experiment created via llmobs.experiment().
+
+    The method is reached through the `_experiment` attribute of the returned object.
+    """
+    regular = llmobs.experiment("test_experiment", dummy_task, test_dataset_one_record, [dummy_evaluator])
+    expected_error = "this method is only used for distributed experiments"
+
+    with pytest.raises(ValueError, match=expected_error):
+        regular._experiment._submit_eval_metric(eval_name="test", eval_value=1.0)
+
+
+def test_submit_eval_metric_raises_on_invalid_span(llmobs, test_dataset_one_record):
+    """_submit_eval_metric raises TypeError when `span` is not a dict or lacks one of the expected keys."""
+    exp = llmobs._distributed_experiment(
+        "test_submit_eval_metric_raises_on_invalid_span",
+        dataset=test_dataset_one_record,
+    )
+
+    # First a non-dict value, then a dict that carries span_id but no trace_id.
+    for bad_span in ("not-a-dict", {"span_id": "abc"}):
+        with pytest.raises(TypeError, match="`span` must be a dictionary"):
+            exp._submit_eval_metric(eval_name="test", eval_value=1.0, span=bad_span)
+
+
+def test_submit_eval_metric_raises_when_no_span_available(llmobs, test_dataset_one_record):
+    """_submit_eval_metric raises TypeError when no span is passed and the experiment has not been run."""
+    unrun_experiment = llmobs._distributed_experiment(
+        "test_submit_eval_metric_raises_when_no_span_available", dataset=test_dataset_one_record
+    )
+    expected_error = "unexpected state, must supply span or must run the experiment first"
+    with pytest.raises(TypeError, match=expected_error):
+        unrun_experiment._submit_eval_metric(eval_name="test", eval_value=1.0)
+
+
 # =============================================================================
 # AsyncExperiment Tests
 # =============================================================================