
Commit 3b94e5f

[RLlib; Offline RL] BC performance improvements and adjustments to new Learner.update logic. (ray-project#51425)
1 parent 8773682 commit 3b94e5f

File tree

13 files changed: +395 −127 lines changed

rllib/algorithms/bc/bc.py

Lines changed: 8 additions & 0 deletions
@@ -97,6 +97,14 @@ def build_learner_connector(
         pipeline.remove("AddOneTsToEpisodesAndTruncate")
         pipeline.remove("GeneralAdvantageEstimation")
 
+        # In case we run multiple updates per RLlib training step in the `Learner` or
+        # when training on GPU conversion to tensors is managed in batch prefetching.
+        if self.num_gpus_per_learner > 0 or (
+            self.dataset_num_iters_per_learner
+            and self.dataset_num_iters_per_learner > 1
+        ):
+            pipeline.remove("NumpyToTensor")
+
         return pipeline
 
     @override(MARWILConfig)
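For context, the branch added above fires not only for GPU learners but also when a learner runs more than one update per RLlib training step. A minimal, hedged config sketch of a setup that would take this path (new API stack; the dataset path and exact argument spellings are assumptions, not taken from this commit):

```python
from ray.rllib.algorithms.bc import BCConfig

config = (
    BCConfig()
    .environment("CartPole-v1")
    .offline_data(
        input_="/tmp/cartpole-offline",        # hypothetical local dataset path
        dataset_num_iters_per_learner=5,       # > 1: several updates per training step
    )
    .learners(num_gpus_per_learner=1)          # or: train the learner on a GPU
)
# Under either setting, the learner connector built from this config drops
# `NumpyToTensor`, so batches stay NumPy until the prefetching iterator
# converts and moves them.
```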

rllib/algorithms/cql/cql.py

Lines changed: 6 additions & 2 deletions
@@ -212,8 +212,12 @@ def build_learner_connector(
             AddNextObservationsFromEpisodesToTrainBatch(),
         )
 
-        # If training on GPU, do not convert batches to tensors.
-        if self.num_gpus_per_learner > 0:
+        # In case we run multiple updates per RLlib training step in the `Learner` or
+        # when training on GPU conversion to tensors is managed in batch prefetching.
+        if self.num_gpus_per_learner > 0 or (
+            self.dataset_num_iters_per_learner
+            and self.dataset_num_iters_per_learner > 1
+        ):
             pipeline.remove("NumpyToTensor")
 
         return pipeline

rllib/algorithms/marwil/marwil.py

Lines changed: 6 additions & 1 deletion
@@ -377,7 +377,12 @@ def build_learner_connector(
 
         # If training on GPU, convert batches to `numpy` arrays to load them
         # on GPU in the `Learner`.
-        if self.num_gpus_per_learner > 0:
+        # In case we run multiple updates per RLlib training step in the `Learner` or
+        # when training on GPU conversion to tensors is managed in batch prefetching.
+        if self.num_gpus_per_learner > 0 or (
+            self.dataset_num_iters_per_learner
+            and self.dataset_num_iters_per_learner > 1
+        ):
             pipeline.insert_after(GeneralAdvantageEstimation, TensorToNumpy())
 
         return pipeline

rllib/algorithms/marwil/tests/test_marwil.py

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 from ray.rllib.core.learner.learner import POLICY_LOSS_KEY, VF_LOSS_KEY
 from ray.rllib.env import INPUT_ENV_SPACES
 from ray.rllib.offline.offline_prelearner import OfflinePreLearner
+from ray.rllib.utils import unflatten_dict
 from ray.rllib.utils.framework import try_import_torch
 from ray.rllib.utils.test_utils import check
 
@@ -172,7 +173,7 @@ def test_marwil_loss_function(self):
         )
         # Note, for `ray.data`'s pipeline everything has to be a dictionary
         # therefore the batch is embedded into another dictionary.
-        batch = offline_prelearner(batch)["batch"][0]
+        batch = unflatten_dict(offline_prelearner(batch))
         if Columns.LOSS_MASK in batch[DEFAULT_MODULE_ID]:
             loss_mask = (
                 batch[DEFAULT_MODULE_ID][Columns.LOSS_MASK].detach().cpu().numpy()

rllib/core/learner/learner.py

Lines changed: 52 additions & 30 deletions
@@ -19,7 +19,6 @@
 )
 
 import ray
-from ray.data.iterator import DataIterator
 from ray.rllib.connectors.learner.learner_connector_pipeline import (
     LearnerConnectorPipeline,
 )
@@ -37,6 +36,7 @@
     MultiRLModuleSpec,
 )
 from ray.rllib.core.rl_module.rl_module import RLModule, RLModuleSpec
+from ray.rllib.utils import unflatten_dict
 from ray.rllib.policy.policy import PolicySpec
 from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
 from ray.rllib.utils.annotations import (
@@ -269,7 +269,7 @@ def __init__(
 
         # In case of offline learning and multiple learners, each learner receives a
         # repeatable iterator that iterates over a split of the streamed data.
-        self.iterator: DataIterator = None
+        self.iterator: MiniBatchRayDataIterator = None
 
         # TODO (sven): Do we really need this API? It seems like LearnerGroup constructs
         # all Learner workers and then immediately builds them any ways? Unless there is
@@ -727,7 +727,13 @@ def get_parameters(self, module: RLModule) -> Sequence[Param]:
         """
 
     @abc.abstractmethod
-    def _convert_batch_type(self, batch: MultiAgentBatch) -> MultiAgentBatch:
+    def _convert_batch_type(
+        self,
+        batch: MultiAgentBatch,
+        to_device: bool = False,
+        pin_memory: bool = False,
+        use_stream: bool = False,
+    ) -> MultiAgentBatch:
         """Converts the elements of a MultiAgentBatch to Tensors on the correct device.
 
         Args:
@@ -1041,33 +1047,36 @@ def update(
                 "Learner.update(data_iterators=..) requires `num_iters` kwarg!"
             )
 
+        def _collate_fn(_batch: Dict[str, numpy.ndarray]) -> MultiAgentBatch:
+            _batch = unflatten_dict(_batch)
+            _batch = MultiAgentBatch(
+                {
+                    module_id: SampleBatch(module_data)
+                    for module_id, module_data in _batch.items()
+                },
+                env_steps=sum(
+                    len(next(iter(module_data.values())))
+                    for module_data in _batch.values()
+                ),
+            )
+            _batch = self._convert_batch_type(_batch, to_device=False)
+            return self._set_slicing_by_batch_id(_batch, value=True)
+
+        def _finalize_fn(batch: MultiAgentBatch) -> MultiAgentBatch:
+            return self._convert_batch_type(batch, to_device=True, use_stream=True)
+
         if not self.iterator:
-            self.iterator = training_data.data_iterators[0]
-
-            def _finalize_fn(_batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]:
-                # Note, the incoming batch is a dictionary with a numpy array
-                # holding the `MultiAgentBatch`.
-                _batch = self._convert_batch_type(_batch["batch"][0])
-                return {"batch": self._set_slicing_by_batch_id(_batch, value=True)}
-
-            batch_iter = MiniBatchRayDataIterator(
-                iterator=self.iterator,
-                finalize_fn=_finalize_fn,
-                num_iters=num_iters,
-                **kwargs,
-            )
-            # Record the number of batches pulled from the dataset.
-            self.metrics.log_value(
-                (ALL_MODULES, DATASET_NUM_ITERS_TRAINED),
-                num_iters,
-                reduce="sum",
-                clear_on_reduce=True,
-            )
-            self.metrics.log_value(
-                (ALL_MODULES, DATASET_NUM_ITERS_TRAINED_LIFETIME),
-                num_iters,
-                reduce="sum",
-            )
+            # This iterator holds a `ray.data.DataIterator` and manages it state.
+            self.iterator = MiniBatchRayDataIterator(
+                iterator=training_data.data_iterators[0],
+                collate_fn=_collate_fn,
+                finalize_fn=_finalize_fn,
+                minibatch_size=minibatch_size,
+                num_iters=num_iters,
+                **kwargs,
+            )
+
+            batch_iter = self.iterator
         else:
             batch = self._make_batch_if_necessary(training_data=training_data)
             assert batch is not None
@@ -1104,7 +1113,7 @@ def _finalize_fn(_batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]:
         )
 
         # Perform the actual looping through the minibatches or the given data iterator.
-        for tensor_minibatch in batch_iter:
+        for iteration, tensor_minibatch in enumerate(batch_iter):
             # Check the MultiAgentBatch, whether our RLModule contains all ModuleIDs
             # found in this batch. If not, throw an error.
             unknown_module_ids = set(tensor_minibatch.policy_batches.keys()) - set(
@@ -1133,6 +1142,19 @@ def _finalize_fn(_batch: Dict[str, numpy.ndarray]) -> Dict[str, Any]:
 
             self._set_slicing_by_batch_id(tensor_minibatch, value=False)
 
+            if self.iterator:
+                # Record the number of batches pulled from the dataset.
+                self.metrics.log_value(
+                    (ALL_MODULES, DATASET_NUM_ITERS_TRAINED),
+                    iteration + 1,
+                    reduce="sum",
+                    clear_on_reduce=True,
+                )
+                self.metrics.log_value(
+                    (ALL_MODULES, DATASET_NUM_ITERS_TRAINED_LIFETIME),
+                    iteration + 1,
+                    reduce="sum",
+                )
             # Log all individual RLModules' loss terms and its registered optimizers'
             # current learning rates.
             # Note: We do this only once for the last of the minibatch updates, b/c the
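The refactored `update` path splits batch preparation into a `_collate_fn` (rebuild the `MultiAgentBatch` and enable slicing, still on CPU) and a `_finalize_fn` (convert and move to the device, optionally on a copy stream), and hands both to the prefetching iterator. A generic, hedged sketch of that collate/finalize pattern follows; this is not the `MiniBatchRayDataIterator` implementation, just the idea behind splitting the two callbacks:

```python
from typing import Any, Callable, Iterable, Iterator


def prefetching_iter(
    raw_batches: Iterable[Any],
    collate_fn: Callable[[Any], Any],   # CPU-side restructuring of the raw batch
    finalize_fn: Callable[[Any], Any],  # device placement, e.g. async H2D copy
) -> Iterator[Any]:
    """One-step lookahead: prepare batch N+1 before batch N is handed out.

    If `finalize_fn` only launches an asynchronous host-to-device copy,
    that copy can overlap with the caller's compute on the previous batch.
    """
    pending = None
    for raw in raw_batches:
        nxt = finalize_fn(collate_fn(raw))
        if pending is not None:
            yield pending
        pending = nxt
    if pending is not None:
        yield pending
```

Note also that `DATASET_NUM_ITERS_TRAINED` is now logged inside the loop with `iteration + 1`, so the metric reflects the number of batches actually pulled from the dataset rather than the requested `num_iters`.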

rllib/core/learner/torch/torch_learner.py

Lines changed: 13 additions & 2 deletions
@@ -366,8 +366,19 @@ def get_parameters(self, module: RLModule) -> Sequence[Param]:
         return list(module.parameters())
 
     @override(Learner)
-    def _convert_batch_type(self, batch: MultiAgentBatch) -> MultiAgentBatch:
-        batch = convert_to_torch_tensor(batch.policy_batches, device=self._device)
+    def _convert_batch_type(
+        self,
+        batch: MultiAgentBatch,
+        to_device: bool = True,
+        pin_memory: bool = False,
+        use_stream: bool = False,
+    ) -> MultiAgentBatch:
+        batch = convert_to_torch_tensor(
+            batch.policy_batches,
+            device=self._device if to_device else None,
+            pin_memory=pin_memory,
+            use_stream=use_stream,
+        )
         # TODO (sven): This computation of `env_steps` is not accurate!
         length = max(len(b) for b in batch.values())
         batch = MultiAgentBatch(batch, env_steps=length)
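The new `pin_memory` and `use_stream` flags are forwarded to `convert_to_torch_tensor`. The underlying PyTorch mechanism is the standard one: pin the host tensor, then issue the host-to-device copy with `non_blocking=True` on a dedicated stream so it can overlap with compute on the default stream. A hedged, generic sketch of that mechanism (plain PyTorch, not RLlib's utility):

```python
import torch


def async_to_device(
    host_tensor: torch.Tensor,
    device: torch.device,
    copy_stream: torch.cuda.Stream,
) -> torch.Tensor:
    # Pinned (page-locked) host memory is what makes the copy truly asynchronous.
    pinned = host_tensor.pin_memory()
    with torch.cuda.stream(copy_stream):
        # Launch the copy on the side stream; it returns without waiting.
        return pinned.to(device, non_blocking=True)


if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    batch = torch.randn(1024, 32)
    on_gpu = async_to_device(batch, torch.device("cuda:0"), stream)
    # The compute stream must wait for the copy before reading `on_gpu`.
    torch.cuda.current_stream().wait_stream(stream)
```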

rllib/offline/offline_data.py

Lines changed: 23 additions & 5 deletions
@@ -1,14 +1,19 @@
 import logging
 from pathlib import Path
 import pyarrow.fs
+import numpy as np
 import ray
 import time
 import types
 
+from typing import Dict
+
 from ray.rllib.algorithms.algorithm_config import AlgorithmConfig
 from ray.rllib.core import COMPONENT_RL_MODULE
 from ray.rllib.env import INPUT_ENV_SPACES
 from ray.rllib.offline.offline_prelearner import OfflinePreLearner
+from ray.rllib.utils import unflatten_dict
+from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
 from ray.rllib.utils import force_list
 from ray.rllib.utils.annotations import (
     OverrideToImplementCustomLogic,
@@ -223,13 +228,26 @@ def sample(
             self.batch_iterators = self.data.iterator()
         # Otherwise, the user wants batches returned.
         else:
+            # Define a collate (last-mile) transformation that maps batches
+            # to RLlib's `MultiAgentBatch`.
+            def _collate_fn(_batch: Dict[str, np.ndarray]) -> MultiAgentBatch:
+                _batch = unflatten_dict(_batch)
+                return MultiAgentBatch(
+                    {
+                        module_id: SampleBatch(module_data)
+                        for module_id, module_data in _batch.items()
+                    },
+                    env_steps=sum(
+                        len(next(iter(module_data.values())))
+                        for module_data in _batch.values()
+                    ),
+                )
+
             # If no iterator should be returned, or if we want to return a single
             # batch iterator, we instantiate the batch iterator once, here.
             self.batch_iterators = self.data.iter_batches(
-                # This is important. The batch size is now 1, because the data
-                # is already run through the `OfflinePreLearner` and a single
-                # instance is a single `MultiAgentBatch` of size `num_samples`.
-                batch_size=1,
+                batch_size=num_samples,
+                _collate_fn=_collate_fn,
                 **self.iter_batches_kwargs,
            )
             self.batch_iterators = iter(self.batch_iterators)
@@ -240,7 +258,7 @@ def sample(
         else:
             # Return a single batch from the iterator.
             try:
-                return next(self.batch_iterators)["batch"][0]
+                return next(self.batch_iterators)
             except StopIteration:
                 # If the batch iterator is exhausted, reinitiate a new one.
                 logger.debug(
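With `OfflinePreLearner` now emitting flat column dictionaries, the collate step is what turns a batch of NumPy columns back into a `MultiAgentBatch`, including the `env_steps` count taken from the length of the first column per module. A toy illustration of that mapping on an already un-flattened dict (module and column names are made up for the example):

```python
import numpy as np
from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch

# Per-module column data, as it would look after un-flattening.
nested = {
    "default_policy": {
        "obs": np.zeros((32, 4), dtype=np.float32),
        "actions": np.zeros((32,), dtype=np.int64),
    },
}

ma_batch = MultiAgentBatch(
    {mid: SampleBatch(cols) for mid, cols in nested.items()},
    # env_steps: length of the first column per module, summed over modules,
    # mirroring the `_collate_fn` in the diff above.
    env_steps=sum(len(next(iter(cols.values()))) for cols in nested.values()),
)
assert ma_batch.env_steps() == 32
```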

rllib/offline/offline_prelearner.py

Lines changed: 8 additions & 22 deletions
@@ -8,7 +8,7 @@
 from ray.rllib.core.columns import Columns
 from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec, MultiRLModule
 from ray.rllib.env.single_agent_episode import SingleAgentEpisode
-from ray.rllib.policy.sample_batch import MultiAgentBatch, SampleBatch
+from ray.rllib.utils import flatten_dict
 from ray.rllib.utils.annotations import (
     OverrideToImplementCustomLogic,
     OverrideToImplementCustomLogic_CallToSuperRecommended,
@@ -137,7 +137,7 @@ def __init__(
         )
 
     @OverrideToImplementCustomLogic
-    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, List[EpisodeType]]:
+    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
        """Prepares plain data batches for training with `Learner`'s.
 
         Args:
@@ -212,7 +212,7 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, List[EpisodeType]]
             self._is_multi_agent,
             batch,
             schema=SCHEMA | self.config.input_read_schema,
-            to_numpy=True,
+            to_numpy=False,
             input_compress_columns=self.config.input_compress_columns,
             observation_space=self.observation_space,
             action_space=self.action_space,
@@ -255,28 +255,14 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, List[EpisodeType]]
             # LearnerConnector pipeline.
             metrics=None,
         )
-        # Convert to `MultiAgentBatch`.
-        batch = MultiAgentBatch(
-            {
-                module_id: SampleBatch(module_data)
-                for module_id, module_data in batch.items()
-            },
-            # TODO (simon): This can be run once for the batch and the
-            # metrics, but we run it twice: here and later in the learner.
-            env_steps=sum(e.env_steps() for e in episodes),
-        )
         # Remove all data from modules that should not be trained. We do
-        # not want to pass around more data than necessaty.
-        for module_id in list(batch.policy_batches.keys()):
+        # not want to pass around more data than necessary.
+        for module_id in batch:
             if not self._should_module_be_updated(module_id, batch):
-                del batch.policy_batches[module_id]
-
-        # TODO (simon): Log steps trained for metrics (how?). At best in learner
-        # and not here. But we could precompute metrics here and pass it to the learner
-        # for logging. Like this we do not have to pass around episode lists.
+                del batch[module_id]
 
-        # TODO (simon): episodes are only needed for logging here.
-        return {"batch": [batch]}
+        # Flatten the dictionary to increase serialization performance.
+        return flatten_dict(batch)
 
     @property
     def default_prelearner_buffer_class(self) -> ReplayBuffer:
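The prelearner now returns a flat column dictionary because flat dicts of NumPy arrays serialize and move through `ray.data` more cheaply than nested ones; the learner's collate step un-flattens them again. A self-contained sketch of what such a flatten/unflatten pair does, using hypothetical stand-in helpers rather than the `ray.rllib.utils` versions (whose exact key separator is not shown in this commit):

```python
from typing import Any, Dict

SEP = "/"  # assumed separator for the illustration


def flatten_dict(nested: Dict[str, Any], prefix: str = "") -> Dict[str, Any]:
    """Turn {"module": {"obs": arr}} into {"module/obs": arr}."""
    flat: Dict[str, Any] = {}
    for key, value in nested.items():
        full_key = f"{prefix}{SEP}{key}" if prefix else key
        if isinstance(value, dict):
            flat.update(flatten_dict(value, prefix=full_key))
        else:
            flat[full_key] = value
    return flat


def unflatten_dict(flat: Dict[str, Any]) -> Dict[str, Any]:
    """Invert `flatten_dict`: {"module/obs": arr} -> {"module": {"obs": arr}}."""
    nested: Dict[str, Any] = {}
    for full_key, value in flat.items():
        *path, leaf = full_key.split(SEP)
        node = nested
        for part in path:
            node = node.setdefault(part, {})
        node[leaf] = value
    return nested


batch = {"default_policy": {"obs": [1, 2, 3], "actions": [0, 1, 0]}}
assert unflatten_dict(flatten_dict(batch)) == batch
```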
