Commit e94a60b

[RLLib] Pass large AlgorithmConfig by reference to RolloutWorker (ray-project#50688)
Signed-off-by: Jiajun Yao <[email protected]>
1 parent 245ddfc commit e94a60b

File tree

2 files changed: +35 -20 lines changed

rllib/env/env_runner_group.py

Lines changed: 5 additions & 1 deletion
@@ -130,6 +130,7 @@ def __init__(
         self._env_creator = env_creator
         self._policy_class = default_policy_class
         self._remote_config = config
+        self._remote_config_obj_ref = ray.put(self._remote_config)
         self._remote_args = {
             "num_cpus": self._remote_config.num_cpus_per_env_runner,
             "num_gpus": self._remote_config.num_gpus_per_env_runner,
@@ -665,7 +666,10 @@ def add_workers(self, num_workers: int, validate: bool = False) -> None:
                     validate_env=None,
                     worker_index=old_num_workers + i + 1,
                     num_workers=old_num_workers + num_workers,
-                    config=self._remote_config,
+                    # self._remote_config can be large, and it's best practice
+                    # to pass it by reference instead of by value
+                    # (https://docs.ray.io/en/latest/ray-core/patterns/pass-large-arg-by-value.html).
+                    config=self._remote_config_obj_ref,
                 )
                 for i in range(num_workers)
             ]
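
The change above is Ray's documented fix for the "passing large arguments by value" anti-pattern: instead of re-serializing the full AlgorithmConfig into every worker's creation task, the config is placed in the object store once with ray.put() and only the small ObjectRef is shipped; Ray resolves top-level ObjectRef arguments back into their values before the actor's constructor runs. A minimal, self-contained sketch of the same pattern follows; the Worker actor and the dict standing in for the config are hypothetical, not RLlib code.

import ray

ray.init()


@ray.remote
class Worker:
    def __init__(self, config):
        # Ray de-references top-level ObjectRef arguments before calling
        # __init__, so `config` arrives here as the plain dict.
        self.config = config

    def get(self, key):
        return self.config[key]


# Hypothetical stand-in for a large AlgorithmConfig.
large_config = {"train_batch_size": 4000, "payload": [0.0] * 1_000_000}

# Put the large object in the object store once...
config_ref = ray.put(large_config)

# ...and hand the small ObjectRef to every worker instead of serializing
# the full object into each creation task.
workers = [Worker.remote(config_ref) for _ in range(4)]
print(ray.get(workers[0].get.remote("train_batch_size")))  # -> 4000

With N workers this serializes the config once rather than N times, and workers on the same node fetch it from the local object store.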

rllib/evaluation/tests/test_env_runner_v2.py

Lines changed: 30 additions & 19 deletions
@@ -25,26 +25,32 @@
 register_env("basic_multiagent", lambda _: BasicMultiAgent(2))


+def _get_mapper():
+    # Note(Artur): This was originally part of the unittest.TestCase.setUpClass
+    # method but caused trouble when serializing the config because we ended up
+    # serializing `self`, which is an instance of unittest.TestCase.
+
+    # When dealing with two policies in these tests, simply alternate between the 2
+    # policies to make sure we have data for inference for both policies for each
+    # step.
+    class AlternatePolicyMapper:
+        def __init__(self):
+            self.policies = ["one", "two"]
+            self.next = 0
+
+        def map(self):
+            p = self.policies[self.next]
+            self.next = 1 - self.next
+            return p
+
+    return AlternatePolicyMapper()
+
+
 class TestEnvRunnerV2(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         ray.init()

-        # When dealing with two policies in these tests, simply alternate between the 2
-        # policies to make sure we have data for inference for both policies for each
-        # step.
-        class AlternatePolicyMapper:
-            def __init__(self):
-                self.policies = ["one", "two"]
-                self.next = 0
-
-            def map(self):
-                p = self.policies[self.next]
-                self.next = 1 - self.next
-                return p
-
-        cls.mapper = AlternatePolicyMapper()
-
     @classmethod
     def tearDownClass(cls):
         ray.shutdown()
@@ -215,6 +221,8 @@ def __init__(self, *args, **kwargs):
                 self.view_requirements["rewards"].used_for_compute_actions = False
                 self.view_requirements["terminateds"].used_for_compute_actions = False

+        mapper = _get_mapper()
+
         config = (
             PPOConfig()
             .api_stack(
@@ -240,7 +248,7 @@ def __init__(self, *args, **kwargs):
                     policy_class=RandomPolicyTwo,
                 ),
             },
-            policy_mapping_fn=lambda *args, **kwargs: self.mapper.map(),
+            policy_mapping_fn=lambda *args, **kwargs: mapper.map(),
             policies_to_train=["one"],
             count_steps_by="agent_steps",
         )
@@ -316,6 +324,7 @@ def on_create_policy(self, *, policy_id, policy) -> None:
         _ = rollout_worker.sample()

     def test_start_episode(self):
+        mapper = _get_mapper()
         config = (
             PPOConfig()
             .api_stack(
@@ -341,7 +350,7 @@ def test_start_episode(self):
                     policy_class=RandomPolicy,
                 ),
             },
-            policy_mapping_fn=lambda *args, **kwargs: self.mapper.map(),
+            policy_mapping_fn=lambda *args, **kwargs: mapper.map(),
             policies_to_train=["one"],
             count_steps_by="agent_steps",
         )
@@ -373,6 +382,7 @@ def test_start_episode(self):
         self.assertEqual(env_runner._active_episodes[0].total_agent_steps, 2)

     def test_env_runner_output(self):
+        mapper = _get_mapper()
         # Test if we can produce RolloutMetrics just by stepping
         config = (
             PPOConfig()
@@ -399,7 +409,7 @@ def test_env_runner_output(self):
                     policy_class=RandomPolicy,
                 ),
             },
-            policy_mapping_fn=lambda *args, **kwargs: self.mapper.map(),
+            policy_mapping_fn=lambda *args, **kwargs: mapper.map(),
             policies_to_train=["one"],
             count_steps_by="agent_steps",
         )
@@ -434,6 +444,7 @@ def on_episode_end(
             # We should see an error episode.
             assert isinstance(episode, Exception)

+        mapper = _get_mapper()
         # Test if we can produce RolloutMetrics just by stepping
         config = (
             PPOConfig()
@@ -460,7 +471,7 @@ def on_episode_end(
                     policy_class=RandomPolicy,
                 ),
             },
-            policy_mapping_fn=lambda *args, **kwargs: self.mapper.map(),
+            policy_mapping_fn=lambda *args, **kwargs: mapper.map(),
             policies_to_train=["one"],
             count_steps_by="agent_steps",
         )
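
The test refactor above exists because a policy_mapping_fn written as lambda *args, **kwargs: self.mapper.map() closes over `self`, so serializing the config for remote workers drags the entire unittest.TestCase instance into the pickle; the module-level _get_mapper() factory lets the lambda close over only the small mapper object. Below is a minimal sketch of the pitfall, assuming cloudpickle (which Ray bundles for serialization); the Holder and Mapper classes are hypothetical stand-ins, not the test code.

import cloudpickle


class Mapper:
    def map(self):
        return "one"


class Holder:
    def __init__(self):
        self.mapper = Mapper()
        # Stand-in for heavy (or outright unpicklable) test-fixture state.
        self.heavy = [0.0] * 1_000_000

    def make_fn(self):
        # Pitfall: the lambda closes over `self`, so pickling the lambda
        # also pickles the whole Holder, including `heavy`.
        return lambda *args, **kwargs: self.mapper.map()


def make_fn():
    mapper = Mapper()
    # Fix: the lambda closes over only the small, picklable `mapper`.
    return lambda *args, **kwargs: mapper.map()


bad = Holder().make_fn()
good = make_fn()
print(len(cloudpickle.dumps(bad)))   # megabytes: drags in Holder.heavy
print(len(cloudpickle.dumps(good)))  # small: just Mapper and the closure

If Holder held genuinely unpicklable state, as unittest.TestCase instances often do, cloudpickle.dumps(bad) would fail outright; that is the failure mode the Note(Artur) comment describes.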
