diff --git a/src/tools/info/tl_info.c b/src/tools/info/tl_info.c index 95d20312ca3..12f22b31e32 100644 --- a/src/tools/info/tl_info.c +++ b/src/tools/info/tl_info.c @@ -122,7 +122,7 @@ static const char *size_limit_to_str(size_t min_size, size_t max_size) } static void print_iface_info(uct_worker_h worker, uct_md_h md, - uct_tl_resource_desc_t *resource) + const uct_tl_resource_desc_v2_t *resource) { char buf[256] = {0}; uct_iface_params_t iface_params = { @@ -131,8 +131,8 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, UCT_IFACE_PARAM_FIELD_STATS_ROOT | UCT_IFACE_PARAM_FIELD_RX_HEADROOM, .open_mode = UCT_IFACE_OPEN_MODE_DEVICE, - .mode.device.tl_name = resource->tl_name, - .mode.device.dev_name = resource->dev_name, + .mode.device.tl_name = resource->desc.tl_name, + .mode.device.dev_name = resource->desc.dev_name, .stats_root = ucs_stats_get_root(), .rx_headroom = 0 }; @@ -142,18 +142,19 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, ucs_status_t status; uct_iface_h iface; - status = uct_md_iface_config_read(md, resource->tl_name, NULL, NULL, &iface_config); + status = uct_md_iface_config_read(md, resource->desc.tl_name, NULL, NULL, &iface_config); if (status != UCS_OK) { return; } - printf("# Transport: %s\n", resource->tl_name); - printf("# Device: %s\n", resource->dev_name); - printf("# Type: %s\n", uct_device_type_names[resource->dev_type]); + printf("# Transport: %s\n", resource->desc.tl_name); + printf("# Device: %s\n", resource->desc.dev_name); + printf("# Type: %s\n", uct_device_type_names[resource->desc.dev_type]); + printf("# Flags: 0x%llx\n", (unsigned long long)resource->flags); printf("# System device: %s", - ucs_topo_sys_device_get_name(resource->sys_device)); - if (resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) { - printf(" (%d)", resource->sys_device); + ucs_topo_sys_device_get_name(resource->desc.sys_device)); + if (resource->desc.sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) { + printf(" (%d)", resource->desc.sys_device); } 
printf("\n"); @@ -348,7 +349,7 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, } static ucs_status_t print_tl_info(uct_md_h md, const char *tl_name, - uct_tl_resource_desc_t *resources, + const uct_tl_resource_desc_v2_t *resources, unsigned num_resources, int print_opts, ucs_config_print_flags_t print_flags) @@ -375,7 +376,7 @@ static ucs_status_t print_tl_info(uct_md_h md, const char *tl_name, printf("# (No supported devices found)\n"); } for (i = 0; i < num_resources; ++i) { - ucs_assert(!strcmp(tl_name, resources[i].tl_name)); + ucs_assert(!strcmp(tl_name, resources[i].desc.tl_name)); print_iface_info(worker, md, &resources[i]); } @@ -392,7 +393,7 @@ static void print_md_info(uct_component_h component, const char *req_tl_name) { UCS_STRING_BUFFER_ONSTACK(strb, 256); - uct_tl_resource_desc_t *resources, tmp; + uct_tl_resource_desc_v2_t *resources, tmp; unsigned resource_index, j, num_resources, count; ucs_memory_type_t mem_type; ucs_status_t status; @@ -400,6 +401,7 @@ static void print_md_info(uct_component_h component, uct_md_config_t *md_config; uct_md_attr_v2_t md_attr; uct_md_h md; + uct_md_query_tl_resources_params_t params; status = uct_md_config_read(component, NULL, NULL, &md_config); if (status != UCS_OK) { @@ -413,7 +415,9 @@ static void print_md_info(uct_component_h component, goto out; } - status = uct_md_query_tl_resources(md, &resources, &num_resources); + params.field_mask = 0; + status = uct_md_query_tl_resources_v2(md, &resources, &num_resources, + &params); if (status != UCS_OK) { printf("# < failed to query memory domain resources >\n"); goto out_close_md; @@ -426,7 +430,7 @@ static void print_md_info(uct_component_h component, if (req_tl_name != NULL) { resource_index = 0; while (resource_index < num_resources) { - if (!strcmp(resources[resource_index].tl_name, req_tl_name)) { + if (!strcmp(resources[resource_index].desc.tl_name, req_tl_name)) { break; } ++resource_index; @@ -519,10 +523,10 @@ static void 
print_md_info(uct_component_h component, resource_index = 0; while (resource_index < num_resources) { /* Gather all resources for this transport */ - tl_name = resources[resource_index].tl_name; + tl_name = resources[resource_index].desc.tl_name; count = 1; for (j = resource_index + 1; j < num_resources; ++j) { - if (!strcmp(tl_name, resources[j].tl_name)) { + if (!strcmp(tl_name, resources[j].desc.tl_name)) { tmp = resources[count + resource_index]; resources[count + resource_index] = resources[j]; resources[j] = tmp; @@ -539,7 +543,7 @@ static void print_md_info(uct_component_h component, } out_free_list: - uct_release_tl_resource_list(resources); + uct_release_tl_resource_list_v2(resources); out_close_md: uct_md_close(md); out: diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 7007499cc3a..dd19cade6aa 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -1112,9 +1112,10 @@ static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_t *resource static void ucp_add_tl_resource_if_enabled( ucp_context_h context, ucp_md_index_t md_index, const ucp_config_t *config, const ucs_string_set_t *aux_tls, - const uct_tl_resource_desc_t *resource, unsigned *num_resources_p, + const uct_tl_resource_desc_v2_t *tl_resource, unsigned *num_resources_p, uint64_t dev_cfg_masks[], uint64_t *tl_cfg_mask) { + const uct_tl_resource_desc_t *resource = &tl_resource->desc; uint8_t rsc_flags; ucp_rsc_index_t dev_index, i; @@ -1133,10 +1134,15 @@ static void ucp_add_tl_resource_if_enabled( context->tl_rscs[context->num_tls].tl_name_csum = ucs_crc16_string(resource->tl_name); context->tl_rscs[context->num_tls].flags = rsc_flags; + if (tl_resource->flags & UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE) { + context->tl_rscs[context->num_tls].flags |= + UCP_TL_RSC_FLAG_INTER_NODE; + } dev_index = 0; for (i = 0; i < context->num_tls; ++i) { - if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) { + if 
(ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, + resource)) { dev_index = context->tl_rscs[i].dev_index; break; } else { @@ -1159,16 +1165,19 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, uint64_t *tl_cfg_mask) { ucp_tl_md_t *md = &context->tl_mds[md_index]; - uct_tl_resource_desc_t *tl_resources; + uct_tl_resource_desc_v2_t *tl_resources; ucp_tl_resource_desc_t *tmp; unsigned num_tl_resources; ucs_status_t status; ucp_rsc_index_t i; + uct_md_query_tl_resources_params_t params; - *num_resources_p = 0; + *num_resources_p = 0; + params.field_mask = 0; /* check what are the available uct resources */ - status = uct_md_query_tl_resources(md->md, &tl_resources, &num_tl_resources); + status = uct_md_query_tl_resources_v2(md->md, &tl_resources, + &num_tl_resources, &params); if (status != UCS_OK) { ucs_error("Failed to query resources: %s", ucs_status_string(status)); goto out; @@ -1197,10 +1206,10 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, /* copy only the resources enabled by user configuration */ context->tl_rscs = tmp; for (i = 0; i < num_tl_resources; ++i) { - ucs_string_set_addf(&avail_devices[tl_resources[i].dev_type], - "'%s'(%s)", tl_resources[i].dev_name, + ucs_string_set_addf(&avail_devices[tl_resources[i].desc.dev_type], + "'%s'(%s)", tl_resources[i].desc.dev_name, context->tl_cmpts[md->cmpt_index].attr.name); - ucs_string_set_add(avail_tls, tl_resources[i].tl_name); + ucs_string_set_add(avail_tls, tl_resources[i].desc.tl_name); ucp_add_tl_resource_if_enabled(context, md_index, config, aux_tls, &tl_resources[i], num_resources_p, dev_cfg_masks, tl_cfg_mask); @@ -1208,7 +1217,7 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, status = UCS_OK; free_resources: - uct_release_tl_resource_list(tl_resources); + uct_release_tl_resource_list_v2(tl_resources); out: return status; } diff --git a/src/ucp/core/ucp_context.h b/src/ucp/core/ucp_context.h index 1395b9704a3..ccc58d02763 
100644 --- a/src/ucp/core/ucp_context.h +++ b/src/ucp/core/ucp_context.h @@ -39,7 +39,10 @@ KHASH_IMPL(ucp_context_imported_mem_hash, uint64_t, ucs_rcache_t*, 1, enum { /* The flag indicates that the resource may be used for auxiliary * wireup communications only */ - UCP_TL_RSC_FLAG_AUX = UCS_BIT(0) + UCP_TL_RSC_FLAG_AUX = UCS_BIT(0), + /* The flag indicates that the resource may be used for inter-node + * communication */ + UCP_TL_RSC_FLAG_INTER_NODE = UCS_BIT(1) }; diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index 7ac0e2fc64e..0b7a7839b03 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -2961,7 +2961,9 @@ static ucs_status_t ucp_worker_address_pack(ucp_worker_h worker, if (address_flags & UCP_WORKER_ADDRESS_FLAG_NET_ONLY) { UCS_STATIC_BITMAP_RESET_ALL(&tl_bitmap); UCS_STATIC_BITMAP_FOR_EACH_BIT(tl_id, &worker->context->tl_bitmap) { - if (context->tl_rscs[tl_id].tl_rsc.dev_type == UCT_DEVICE_TYPE_NET) { + if ((context->tl_rscs[tl_id].tl_rsc.dev_type == + UCT_DEVICE_TYPE_NET) || + (context->tl_rscs[tl_id].flags & UCP_TL_RSC_FLAG_INTER_NODE)) { UCS_STATIC_BITMAP_SET(&tl_bitmap, tl_id); } } diff --git a/src/uct/api/v2/uct_v2.h b/src/uct/api/v2/uct_v2.h index a6649903483..8a3b934fcc7 100644 --- a/src/uct/api/v2/uct_v2.h +++ b/src/uct/api/v2/uct_v2.h @@ -1050,6 +1050,91 @@ int uct_iface_is_reachable_v2(uct_iface_h iface, const uct_iface_is_reachable_params_t *params); +/** + * @ingroup UCT_RESOURCE + * @brief Capability flags of @ref uct_tl_resource_desc_v2_t. + * + * The enumeration defines bit mask of capabilities in @ref + * uct_tl_resource_desc_v2_t::flags, set by @ref uct_md_query_tl_resources_v2. + */ +typedef enum { + /** + * If set, the resource supports inter-node communications. + */ + UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE = UCS_BIT(0) +} uct_md_query_tl_resources_flags_t; + + +/** + * @ingroup UCT_RESOURCE + * @brief Parameters passed to @ref uct_md_query_tl_resources_v2. 
+ */ +typedef struct { + /** + * Mask of valid fields which must currently be set to zero. + * Future fields not specified in this mask will be ignored. + * Provides ABI compatibility with respect to adding new fields. + */ + uint64_t field_mask; +} uct_md_query_tl_resources_params_t; + + +/** + * @ingroup UCT_RESOURCE + * @brief Communication resource descriptor. + * + * Resource descriptor of a standalone communication resource with additional + * flags. + */ +typedef struct uct_tl_resource_desc_v2 { + /** + * Main resource descriptor + */ + uct_tl_resource_desc_t desc; + + /** + * Associated resource flags using bits from @ref + * uct_md_query_tl_resources_flags_t. + */ + uint64_t flags; +} uct_tl_resource_desc_v2_t; + + +/** + * @ingroup UCT_RESOURCE + * @brief Query for transport resources. + * + * This routine queries the @ref uct_md_h "memory domain" for communication + * resources that are available for it. + * + * @param [in] md Handle to memory domain. + * @param [out] resources_p Filled with a pointer to an array of resource + * descriptors. + * @param [out] num_resources_p Filled with the number of resources in the array. + * @param [in] params Parameters as defined in @ref + * uct_md_query_tl_resources_params_t. + * + * @return Error code. + */ +ucs_status_t +uct_md_query_tl_resources_v2(uct_md_h md, + uct_tl_resource_desc_v2_t **resources_p, + unsigned *num_resources_p, + uct_md_query_tl_resources_params_t *params); + + +/** + * @ingroup UCT_RESOURCE + * @brief Release the list of resources returned from @ref uct_md_query_tl_resources_v2. + * + * This routine releases the memory associated with the list of resources + * allocated by @ref uct_md_query_tl_resources_v2. + * + * @param [in] resources Array of resource descriptors to release. + */ +void uct_release_tl_resource_list_v2(uct_tl_resource_desc_v2_t *resources); + + /** * @ingroup UCT_RESOURCE * @brief Connect endpoint to a remote endpoint. 
diff --git a/src/uct/base/uct_iface.h b/src/uct/base/uct_iface.h index b44d4306b8e..31d1ed9e9d2 100644 --- a/src/uct/base/uct_iface.h +++ b/src/uct/base/uct_iface.h @@ -385,6 +385,7 @@ typedef struct uct_tl_device_resource { (e.g. UCT_DEVICE_TYPE_NET for a network interface) */ ucs_sys_device_t sys_device; /**< The identifier associated with the device bus_id as captured in ucs_sys_bus_id_t struct */ + uint64_t flags; /**< Associated flags to the resource */ } uct_tl_device_resource_t; diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c index 61c0dfff10e..b7357267d7c 100644 --- a/src/uct/base/uct_md.c +++ b/src/uct/base/uct_md.c @@ -76,17 +76,25 @@ void uct_md_close(uct_md_h md) md->ops->close(md); } -ucs_status_t uct_md_query_tl_resources(uct_md_h md, - uct_tl_resource_desc_t **resources_p, - unsigned *num_resources_p) +ucs_status_t +uct_md_query_tl_resources_v2(uct_md_h md, + uct_tl_resource_desc_v2_t **resources_p, + unsigned *num_resources_p, + uct_md_query_tl_resources_params_t *params) { uct_component_t *component = md->component; - uct_tl_resource_desc_t *resources, *tmp; + uct_tl_resource_desc_v2_t *resources, *tmp; uct_tl_device_resource_t *tl_devices; unsigned i, num_resources, num_tl_devices; ucs_status_t status; uct_tl_t *tl; + if (params->field_mask != 0) { + ucs_error("invalid field_mask 0x%" PRIx64 " passed", + params->field_mask); + return UCS_ERR_INVALID_PARAM; + } + resources = NULL; num_resources = 0; @@ -114,12 +122,14 @@ ucs_status_t uct_md_query_tl_resources(uct_md_h md, /* add tl devices to overall list of resources */ for (i = 0; i < num_tl_devices; ++i) { - ucs_strncpy_zero(tmp[num_resources + i].tl_name, tl->name, - sizeof(tmp[num_resources + i].tl_name)); - ucs_strncpy_zero(tmp[num_resources + i].dev_name, tl_devices[i].name, - sizeof(tmp[num_resources + i].dev_name)); - tmp[num_resources + i].dev_type = tl_devices[i].type; - tmp[num_resources + i].sys_device = tl_devices[i].sys_device; + ucs_strncpy_zero(tmp[num_resources + 
i].desc.tl_name, tl->name, + sizeof(tmp[num_resources + i].desc.tl_name)); + ucs_strncpy_zero(tmp[num_resources + i].desc.dev_name, + tl_devices[i].name, + sizeof(tmp[num_resources + i].desc.dev_name)); + tmp[num_resources + i].desc.dev_type = tl_devices[i].type; + tmp[num_resources + i].desc.sys_device = tl_devices[i].sys_device; + tmp[num_resources + i].flags = tl_devices[i].flags; } resources = tmp; @@ -136,6 +146,43 @@ ucs_status_t uct_md_query_tl_resources(uct_md_h md, return status; } +void uct_release_tl_resource_list_v2(uct_tl_resource_desc_v2_t *resources) +{ + ucs_free(resources); +} + +ucs_status_t uct_md_query_tl_resources(uct_md_h md, + uct_tl_resource_desc_t **resources_p, + unsigned *num_resources_p) +{ + uct_md_query_tl_resources_params_t params; + uct_tl_resource_desc_v2_t *resources_v2; + ucs_status_t status; + unsigned i; + + params.field_mask = 0; + status = uct_md_query_tl_resources_v2(md, &resources_v2, + num_resources_p, &params); + if (status != UCS_OK) { + return status; + } + + *resources_p = ucs_malloc(*num_resources_p * sizeof(**resources_p), + "tl resource"); + if ((*num_resources_p > 0) && (*resources_p == NULL)) { + ucs_error("failed to allocate tl resources descriptor"); + uct_release_tl_resource_list_v2(resources_v2); + return UCS_ERR_NO_MEMORY; + } + + for (i = 0; i < *num_resources_p; ++i) { + (*resources_p)[i] = resources_v2[i].desc; + } + + uct_release_tl_resource_list_v2(resources_v2); + return UCS_OK; +} + void uct_release_tl_resource_list(uct_tl_resource_desc_t *resources) { ucs_free(resources); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index 893d987267e..5f9c8c0ae9c 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -611,16 +611,23 @@ uct_cuda_ipc_query_devices( uct_md_h uct_md, uct_tl_device_resource_t **tl_devices_p, unsigned *num_tl_devices_p) { - uct_device_type_t dev_type = UCT_DEVICE_TYPE_SHM; + uint64_t flags = 0; + ucs_status_t status; +#if 
HAVE_CUDA_FABRIC uct_cuda_ipc_md_t *md = ucs_derived_of(uct_md, uct_cuda_ipc_md_t); if (uct_cuda_ipc_iface_is_mnnvl_supported(md)) { - dev_type = UCT_DEVICE_TYPE_NET; + flags = UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE; } #endif - return uct_cuda_base_query_devices_common(uct_md, dev_type, - tl_devices_p, num_tl_devices_p); + status = uct_cuda_base_query_devices_common(uct_md, UCT_DEVICE_TYPE_SHM, + tl_devices_p, num_tl_devices_p); + if (status == UCS_OK) { + ucs_assert(*num_tl_devices_p == 1); + (*tl_devices_p)->flags = flags; + } + + return status; } UCS_CLASS_DEFINE(uct_cuda_ipc_iface_t, uct_cuda_iface_t); diff --git a/src/uct/tcp/tcp_iface.c b/src/uct/tcp/tcp_iface.c index 38c31bfac81..200e42edda9 100644 --- a/src/uct/tcp/tcp_iface.c +++ b/src/uct/tcp/tcp_iface.c @@ -964,6 +964,7 @@ ucs_status_t uct_tcp_query_devices(uct_md_h md, (*entry)->d_name); devices[num_devices].type = UCT_DEVICE_TYPE_NET; devices[num_devices].sys_device = sys_dev; + devices[num_devices].flags = 0; ++num_devices; } diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc index 06f98293ef0..7e2f8f2c2ff 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -558,6 +558,42 @@ UCS_TEST_P(test_md, mem_query) { } } +UCS_TEST_P(test_md, tl_resource_desc_v1_v2) { + uct_md_query_tl_resources_params_t params = {}; + uct_tl_resource_desc_t *v1; + uct_tl_resource_desc_v2_t *v2; + unsigned num_v1, num_v2; + ucs_status_t status_v1, status_v2; + + status_v1 = uct_md_query_tl_resources(md(), &v1, &num_v1); + status_v2 = uct_md_query_tl_resources_v2(md(), &v2, &num_v2, &params); + + EXPECT_UCS_OK(status_v1); + EXPECT_UCS_OK(status_v2); + EXPECT_GT(num_v2, 0u); + EXPECT_EQ(num_v2, num_v1); + + if ((status_v1 == UCS_OK) && (status_v2 == UCS_OK) && + (num_v1 == num_v2)) { + for (unsigned i = 0; i < num_v1; ++i) { + EXPECT_TRUE(!strcmp(v1[i].tl_name, v2[i].desc.tl_name)); + EXPECT_TRUE(!strcmp(v1[i].dev_name, v2[i].desc.dev_name)); + EXPECT_EQ(v1[i].dev_type, v2[i].desc.dev_type); + 
EXPECT_EQ(v1[i].sys_device, v2[i].desc.sys_device); + EXPECT_TRUE((v2[i].flags == 0) || + (v2[i].flags == UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE)); + } + } + + if (status_v1 == UCS_OK) { + uct_release_tl_resource_list(v1); + } + + if (status_v2 == UCS_OK) { + uct_release_tl_resource_list_v2(v2); + } +} + UCS_TEST_P(test_md, sys_device) { uct_tl_resource_desc_t *tl_resources; unsigned num_tl_resources;