Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/single_controller/test_auto_padding_on_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
# limitations under the License.

import numpy as np
import ray
import torch

from verl import DataProto
from verl.protocol import DataProtoConfig
from verl.single_controller.base import Worker
from verl.single_controller.base.decorator import Dispatch, register
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
import ray

# or set env var VERL_AUTO_PADDING = "1" / "true"
DataProtoConfig.auto_padding = True
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_colocated_workers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import ray

from verl import DataProto
from verl.single_controller.base import Worker
Expand All @@ -24,6 +23,7 @@
create_colocated_worker_cls,
)
from verl.utils.device import get_device_name
import ray


@ray.remote
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_colocated_workers_fused.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import ray

from verl import DataProto
from verl.single_controller.base import Worker
Expand All @@ -24,6 +23,7 @@
create_colocated_worker_cls_fused,
)
from verl.utils.device import get_device_name
import ray


@ray.remote
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_data_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
In this test, we instantiate a data parallel worker with 8 GPUs
"""

import ray
import tensordict
import torch
from codetiming import Timer
Expand All @@ -28,6 +27,7 @@
from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
from verl.utils.device import get_device_name
from verl.utils.ray_utils import parallel_put
import ray


@ray.remote
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_decorator_on_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import time

import pytest
import ray
import torch
from tensordict import TensorDict

Expand All @@ -25,6 +24,7 @@
from verl.single_controller.base.worker import Worker
from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
from verl.utils import tensordict_utils as tu
import ray


# Pytest fixture for Ray setup/teardown
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_driverfunc_to_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import os

import ray
import torch
from tensordict import TensorDict

Expand All @@ -23,6 +22,7 @@
from verl.single_controller.ray import RayWorkerGroup
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool
from verl.utils.device import get_device_name
import ray

os.environ["RAY_DEDUP_LOGS"] = "0"
os.environ["NCCL_DEBUG"] = "WARN"
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_fused_workers_on_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import ray

from verl.single_controller.base import Worker
from verl.single_controller.base.decorator import Dispatch, register
Expand All @@ -22,6 +21,7 @@
RayWorkerGroup,
create_colocated_worker_raw_cls,
)
import ray


@ray.remote
Expand Down
3 changes: 1 addition & 2 deletions tests/single_controller/test_high_level_scheduling_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,10 @@
import gc
import time

import ray

from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup, merge_resource_pool
from verl.utils.device import get_device_name
import ray


@ray.remote
Expand Down
3 changes: 1 addition & 2 deletions tests/single_controller/test_nested_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@
# limitations under the License.


import ray

from verl.single_controller.base.decorator import Dispatch, register
from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
from verl.utils.device import get_device_name
import ray


class TestActor(Worker):
Expand Down
4 changes: 2 additions & 2 deletions tests/single_controller/test_ray_collectives.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
Then, we initiate 4 p2p comms from actor to rollout
"""

import ray
import ray.util.collective as collective
import torch

from verl.single_controller.base import Worker
from verl.single_controller.base.decorator import Dispatch, register
from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
import ray
import ray.util.collective as collective


@ray.remote
Expand Down
3 changes: 1 addition & 2 deletions tests/single_controller/test_ray_local_envs_on_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@

import os

import ray

from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
import ray


@ray.remote
Expand Down
3 changes: 1 addition & 2 deletions tests/single_controller/test_ray_utils_on_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
# limitations under the License.

import pytest
import ray

from verl.utils.ray_utils import parallel_put
import ray


# Initialize Ray for testing if not already done globally
Expand Down
1 change: 1 addition & 0 deletions tests/single_controller/test_rvdz.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import verl
import ray


Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_split_resource_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import os

import ray
import torch

from verl import DataProto
Expand All @@ -27,6 +26,7 @@
split_resource_pool,
)
from verl.utils.device import get_device_name, get_nccl_backend
import ray


@ray.remote
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_worker_group_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
e2e test verl.single_controller.ray
"""

import ray
import torch

from verl.single_controller.base.decorator import Dispatch, Execute, collect_all_to_all, register
from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
from verl.utils.device import get_device_name
import ray


def two_to_all_dispatch_fn(worker_group, *args, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion tests/single_controller/test_worker_group_torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@
os.environ["RAY_DEDUP_LOGS"] = "0"
os.environ["NCCL_DEBUG"] = "WARN"

import ray
import torch
import torch.distributed

from verl.single_controller.base.worker import Worker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
from verl.utils.device import get_device_name
import ray


@ray.remote
Expand Down
2 changes: 2 additions & 0 deletions verl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import logging
import os

import verl.utils.distributed_backend

from packaging.version import parse as parse_version

from .protocol import DataProto
Expand Down
32 changes: 32 additions & 0 deletions verl/utils/distributed_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Distributed backend selection module.

This module allows users to choose between Ray and YuanRong (ray_adapter) backends
via the DISTRIBUTED_BACKEND environment variable.

Usage:
Set DISTRIBUTED_BACKEND=yr or DISTRIBUTED_BACKEND=yuanrong to use ray_adapter
Set DISTRIBUTED_BACKEND=ray or leave unset to use ray (default)

Import this module at the very beginning of entry points:
import verl.utils.distributed_backend # Must be before any other import ray
"""
import os
import sys

_BACKEND = os.getenv("DISTRIBUTED_BACKEND", "ray").lower()

if _BACKEND in ("yr", "yuanrong"):
try:
import ray_adapter as _ray_module
except ImportError:
raise ImportError(
f"DISTRIBUTED_BACKEND is set to '{_BACKEND}' but ray_adapter is not installed. "
"Please install ray_adapter or set DISTRIBUTED_BACKEND=ray to use the default Ray backend."
)
else:
import ray as _ray_module

# Inject the selected module into sys.modules so that all subsequent
# 'import ray' statements will use the selected backend
sys.modules['ray'] = _ray_module