-
Notifications
You must be signed in to change notification settings - Fork 420
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCT/API: Add new MD resource query API implementation #10161
base: master
Are you sure you want to change the base?
Conversation
test/gtest/uct/test_md.cc
Outdated
|
||
ASSERT_UCS_OK(uct_md_query_tl_resources(md(), &v1, &num_v1)); | ||
ASSERT_UCS_OK(uct_md_query_tl_resources_v2(md(), &v2, &num_v2, ¶ms)); | ||
ASSERT_TRUE(num_v2 > 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
EXPECT_GT
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
test/gtest/uct/test_md.cc
Outdated
ASSERT_UCS_OK(uct_md_query_tl_resources(md(), &v1, &num_v1)); | ||
ASSERT_UCS_OK(uct_md_query_tl_resources_v2(md(), &v2, &num_v2, ¶ms)); | ||
ASSERT_TRUE(num_v2 > 0); | ||
ASSERT_EQ(num_v2, num_v1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe EXPECT_EQ, to avoid resource leak on failure
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
src/uct/api/v2/uct_v2.h
Outdated
* @ingroup UCT_RESOURCE | ||
* @brief Parameters passed to @ref uct_md_query_tl_resources_v2. | ||
*/ | ||
typedef struct uct_md_query_tl_resources_params { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
struct name seems redundant
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
src/uct/api/v2/uct_v2.h
Outdated
* The enumeration defines bit mask of capabilities in @ref | ||
* uct_tl_resource_desc_v2_t::flags, set by @ref uct_md_query_tl_resources_v2. | ||
*/ | ||
enum { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe give it a name and @reference in uct_tl_resource_desc_t
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
uct_md_query_tl_resources_v2(uct_md_h md, | ||
uct_tl_resource_desc_v2_t **resources_p, | ||
unsigned *num_resources_p, | ||
uct_md_query_tl_resources_params_t *params); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can params be const?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think no
src/uct/base/uct_md.c
Outdated
return status; | ||
} | ||
|
||
*resources_p = malloc(*num_resources_p * sizeof(**resources_p)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ucs_malloc
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
src/tools/info/tl_info.c
Outdated
@@ -348,7 +349,7 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md, | |||
} | |||
|
|||
static ucs_status_t print_tl_info(uct_md_h md, const char *tl_name, | |||
uct_tl_resource_desc_t *resources, | |||
uct_tl_resource_desc_v2_t *resources, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pass by const?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
src/tools/info/tl_info.c
Outdated
@@ -122,7 +122,7 @@ static const char *size_limit_to_str(size_t min_size, size_t max_size) | |||
} | |||
|
|||
static void print_iface_info(uct_worker_h worker, uct_md_h md, | |||
uct_tl_resource_desc_t *resource) | |||
uct_tl_resource_desc_v2_t *resource) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
const?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
@@ -392,14 +393,15 @@ static void print_md_info(uct_component_h component, | |||
const char *req_tl_name) | |||
{ | |||
UCS_STRING_BUFFER_ONSTACK(strb, 256); | |||
uct_tl_resource_desc_t *resources, tmp; | |||
uct_tl_resource_desc_v2_t *resources, tmp; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems we can use old API here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
using the new one allows to dump the added flags
content which can be helpful, so I'd keep it.
src/ucp/core/ucp_context.c
Outdated
@@ -1112,31 +1112,36 @@ static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_t *resource | |||
static void ucp_add_tl_resource_if_enabled( | |||
ucp_context_h context, ucp_md_index_t md_index, | |||
const ucp_config_t *config, const ucs_string_set_t *aux_tls, | |||
const uct_tl_resource_desc_t *resource, unsigned *num_resources_p, | |||
const uct_tl_resource_desc_v2_t *resource, unsigned *num_resources_p, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe you can name it tl_resource
and define uct_tl_resource_desc_t *resource = &tl_resource->desc
. Then only a couple of lines need to be changed in this func
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
@@ -1159,16 +1164,19 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, | |||
uint64_t *tl_cfg_mask) | |||
{ | |||
ucp_tl_md_t *md = &context->tl_mds[md_index]; | |||
uct_tl_resource_desc_t *tl_resources; | |||
uct_tl_resource_desc_v2_t *tl_resources; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems we can use old API here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we need v2 because we call ucp_add_tl_resource_if_enabled()
on the end of the funcction.
} | ||
#endif | ||
return uct_cuda_base_query_devices_common(uct_md, dev_type, | ||
tl_devices_p, num_tl_devices_p); | ||
status = uct_cuda_base_query_devices_common(uct_md, dev_type, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
status = uct_cuda_base_query_devices_common(uct_md, dev_type, | |
status = uct_cuda_base_query_devices_common(uct_md, UCT_DEVICE_TYPE_SHM, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed
test/gtest/uct/test_md.cc
Outdated
EXPECT_GT(num_v2, 0); | ||
EXPECT_EQ(num_v2, num_v1); | ||
|
||
for (auto i = 0; (i < num_v1) && (num_v1 == num_v2) ; ++i) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
imo, no need to check num_v1 == num_v2
again
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we need it because otherwise we might use out of bound memory.
* Resource descriptor of a standalone communication resource with extraneous | ||
* flags. | ||
*/ | ||
typedef struct uct_tl_resource_desc_v2 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
redundant struct name
What
Implement UCT MD transport resource descriptor query v2.
Relates to #10141 and #10154.
Why ?
We need to introduce a new flag to allow sending address of some device of type SHM, even with net address only restriction.
How ?
Add to
ucx_info -d
and add UT. All users initialize the uct resource descriptor to zero except TCP.