Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/CUDA: Advertise MNNVL inter-node capability with shm device type #10141

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/ucp/core/ucp_worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -2961,7 +2961,10 @@ static ucs_status_t ucp_worker_address_pack(ucp_worker_h worker,
if (address_flags & UCP_WORKER_ADDRESS_FLAG_NET_ONLY) {
UCS_STATIC_BITMAP_RESET_ALL(&tl_bitmap);
UCS_STATIC_BITMAP_FOR_EACH_BIT(tl_id, &worker->context->tl_bitmap) {
if (context->tl_rscs[tl_id].tl_rsc.dev_type == UCT_DEVICE_TYPE_NET) {
if ((context->tl_rscs[tl_id].tl_rsc.dev_type ==
UCT_DEVICE_TYPE_NET) ||
(context->tl_rscs[tl_id].tl_rsc.flags &
UCT_TL_RESOURCE_FLAG_INTER_NODE)) {
UCS_STATIC_BITMAP_SET(&tl_bitmap, tl_id);
}
}
Expand Down
3 changes: 3 additions & 0 deletions src/uct/api/uct.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,11 @@ typedef struct uct_tl_resource_desc {
(e.g. UCT_DEVICE_TYPE_NET for a network interface) */
ucs_sys_device_t sys_device; /**< The identifier associated with the device
bus_id as captured in ucs_sys_bus_id_t struct */
uint8_t flags; /**< Associated flags to the resource */
} uct_tl_resource_desc_t;

#define UCT_TL_RESOURCE_FLAG_INTER_NODE UCS_BIT(0) /**< Inter-node capability */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to declare a named enum and reference that name in the documentation of the flags field?
Otherwise it's hard to tell later which enum values are supposed to be set.


#define UCT_TL_RESOURCE_DESC_FMT "%s/%s"
#define UCT_TL_RESOURCE_DESC_ARG(_resource) (_resource)->tl_name, (_resource)->dev_name

Expand Down
1 change: 1 addition & 0 deletions src/uct/base/uct_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ typedef struct uct_tl_device_resource {
(e.g. UCT_DEVICE_TYPE_NET for a network interface) */
ucs_sys_device_t sys_device; /**< The identifier associated with the device
bus_id as captured in ucs_sys_bus_id_t struct */
uint8_t flags; /**< Associated flags to the resource */
} uct_tl_device_resource_t;


Expand Down
1 change: 1 addition & 0 deletions src/uct/base/uct_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ ucs_status_t uct_md_query_tl_resources(uct_md_h md,
sizeof(tmp[num_resources + i].dev_name));
tmp[num_resources + i].dev_type = tl_devices[i].type;
tmp[num_resources + i].sys_device = tl_devices[i].sys_device;
tmp[num_resources + i].flags = tl_devices[i].flags;
}

resources = tmp;
Expand Down
15 changes: 12 additions & 3 deletions src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -611,16 +611,25 @@ uct_cuda_ipc_query_devices(
uct_md_h uct_md, uct_tl_device_resource_t **tl_devices_p,
unsigned *num_tl_devices_p)
{
uint8_t flags = 0;
uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM;
ucs_status_t status;

#if HAVE_CUDA_FABRIC
uct_cuda_ipc_md_t *md = ucs_derived_of(uct_md, uct_cuda_ipc_md_t);

if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) {
dev_type = UCT_DEVICE_TYPE_NET;
flags = UCT_TL_RESOURCE_FLAG_INTER_NODE;
}
#endif
return uct_cuda_base_query_devices_common(uct_md, dev_type,
tl_devices_p, num_tl_devices_p);
status = uct_cuda_base_query_devices_common(uct_md, dev_type, tl_devices_p,
num_tl_devices_p);
if (status == UCS_OK) {
ucs_assert(*num_tl_devices_p == 1);
(*tl_devices_p)->flags = flags;
}

return status;
}

UCS_CLASS_DEFINE(uct_cuda_ipc_iface_t, uct_cuda_iface_t);
Expand Down
Loading