Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/API: Add new MD resource query API implementation #10161

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions src/tools/info/tl_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ static const char *size_limit_to_str(size_t min_size, size_t max_size)
}

static void print_iface_info(uct_worker_h worker, uct_md_h md,
uct_tl_resource_desc_t *resource)
const uct_tl_resource_desc_v2_t *resource)
{
char buf[256] = {0};
uct_iface_params_t iface_params = {
Expand Down Expand Up @@ -150,6 +150,7 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md,
printf("# Transport: %s\n", resource->tl_name);
printf("# Device: %s\n", resource->dev_name);
printf("# Type: %s\n", uct_device_type_names[resource->dev_type]);
printf("# Flags: 0x%zx\n", resource->flags);
printf("# System device: %s",
ucs_topo_sys_device_get_name(resource->sys_device));
if (resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) {
Expand Down Expand Up @@ -348,9 +349,8 @@ static void print_iface_info(uct_worker_h worker, uct_md_h md,
}

static ucs_status_t print_tl_info(uct_md_h md, const char *tl_name,
uct_tl_resource_desc_t *resources,
unsigned num_resources,
int print_opts,
const uct_tl_resource_desc_v2_t *resources,
unsigned num_resources, int print_opts,
ucs_config_print_flags_t print_flags)
{
ucs_async_context_t async;
Expand Down Expand Up @@ -392,14 +392,15 @@ static void print_md_info(uct_component_h component,
const char *req_tl_name)
{
UCS_STRING_BUFFER_ONSTACK(strb, 256);
uct_tl_resource_desc_t *resources, tmp;
uct_tl_resource_desc_v2_t *resources, tmp;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems we can use old API here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using the new API lets us dump the newly added flags, which can be helpful, so I'd keep it.

unsigned resource_index, j, num_resources, count;
ucs_memory_type_t mem_type;
ucs_status_t status;
const char *tl_name;
uct_md_config_t *md_config;
uct_md_attr_v2_t md_attr;
uct_md_h md;
uct_md_query_tl_resources_params_t params;

status = uct_md_config_read(component, NULL, NULL, &md_config);
if (status != UCS_OK) {
Expand All @@ -413,7 +414,9 @@ static void print_md_info(uct_component_h component,
goto out;
}

status = uct_md_query_tl_resources(md, &resources, &num_resources);
params.field_mask = 0;
status = uct_md_query_tl_resources_v2(md, &params, &resources,
&num_resources);
if (status != UCS_OK) {
printf("# < failed to query memory domain resources >\n");
goto out_close_md;
Expand Down Expand Up @@ -539,7 +542,7 @@ static void print_md_info(uct_component_h component,
}

out_free_list:
uct_release_tl_resource_list(resources);
uct_release_tl_resource_list_v2(resources);
out_close_md:
uct_md_close(md);
out:
Expand Down
24 changes: 14 additions & 10 deletions src/ucp/core/ucp_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -918,7 +918,7 @@ ucp_config_is_tl_name_present(const ucs_config_names_array_t *tl_array,
tl_cfg_mask));
}

static int ucp_is_resource_in_device_list(const uct_tl_resource_desc_t *resource,
static int ucp_is_resource_in_device_list(const uct_tl_resource_desc_v2_t *resource,
const ucs_config_names_array_t *devices,
uint64_t *dev_cfg_mask,
uct_device_type_t dev_type)
Expand Down Expand Up @@ -1075,7 +1075,7 @@ ucp_is_resource_in_transports_list(const char *tl_name,
return 1;
}

static int ucp_is_resource_enabled(const uct_tl_resource_desc_t *resource,
static int ucp_is_resource_enabled(const uct_tl_resource_desc_v2_t *resource,
const ucp_config_t *config,
const ucs_string_set_t *aux_tls,
uint8_t *rsc_flags, uint64_t dev_cfg_masks[],
Expand All @@ -1101,8 +1101,8 @@ static int ucp_is_resource_enabled(const uct_tl_resource_desc_t *resource,
return device_enabled && tl_enabled;
}

static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_t *resource1,
const uct_tl_resource_desc_t *resource2)
static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_v2_t *resource1,
const uct_tl_resource_desc_v2_t *resource2)
{
return !strcmp(resource1->dev_name, resource2->dev_name) ||
((resource1->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) &&
Expand All @@ -1112,7 +1112,7 @@ static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_t *resource
static void ucp_add_tl_resource_if_enabled(
ucp_context_h context, ucp_md_index_t md_index,
const ucp_config_t *config, const ucs_string_set_t *aux_tls,
const uct_tl_resource_desc_t *resource, unsigned *num_resources_p,
const uct_tl_resource_desc_v2_t *resource, unsigned *num_resources_p,
brminich marked this conversation as resolved.
Show resolved Hide resolved
uint64_t dev_cfg_masks[], uint64_t *tl_cfg_mask)
{
uint8_t rsc_flags;
Expand All @@ -1136,7 +1136,8 @@ static void ucp_add_tl_resource_if_enabled(

dev_index = 0;
for (i = 0; i < context->num_tls; ++i) {
if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) {
if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc,
resource)) {
dev_index = context->tl_rscs[i].dev_index;
break;
} else {
Expand All @@ -1159,16 +1160,19 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index,
uint64_t *tl_cfg_mask)
{
ucp_tl_md_t *md = &context->tl_mds[md_index];
uct_tl_resource_desc_t *tl_resources;
uct_tl_resource_desc_v2_t *tl_resources;
brminich marked this conversation as resolved.
Show resolved Hide resolved
ucp_tl_resource_desc_t *tmp;
unsigned num_tl_resources;
ucs_status_t status;
ucp_rsc_index_t i;
uct_md_query_tl_resources_params_t params;

*num_resources_p = 0;
*num_resources_p = 0;
params.field_mask = 0;

/* check what are the available uct resources */
status = uct_md_query_tl_resources(md->md, &tl_resources, &num_tl_resources);
status = uct_md_query_tl_resources_v2(md->md, &params, &tl_resources,
&num_tl_resources);
if (status != UCS_OK) {
ucs_error("Failed to query resources: %s", ucs_status_string(status));
goto out;
Expand Down Expand Up @@ -1208,7 +1212,7 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index,

status = UCS_OK;
free_resources:
uct_release_tl_resource_list(tl_resources);
uct_release_tl_resource_list_v2(tl_resources);
out:
return status;
}
Expand Down
2 changes: 1 addition & 1 deletion src/ucp/core/ucp_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ struct ucp_config {
* UCP communication resource descriptor
*/
typedef struct ucp_tl_resource_desc {
uct_tl_resource_desc_t tl_rsc; /* UCT resource descriptor */
uct_tl_resource_desc_v2_t tl_rsc; /* UCT resource descriptor */
uint16_t tl_name_csum; /* Checksum of transport name */
ucp_md_index_t md_index; /* Memory domain index (within the context) */
ucp_rsc_index_t dev_index; /* Arbitrary device index. Resources
Expand Down
6 changes: 3 additions & 3 deletions src/ucp/core/ucp_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -3144,7 +3144,7 @@ void ucp_ep_config_lane_info_str(ucp_worker_h worker,
ucs_string_buffer_t *strbuf)
{
ucp_context_h context = worker->context;
uct_tl_resource_desc_t *rsc;
uct_tl_resource_desc_v2_t *rsc;
ucp_rsc_index_t rsc_index;
ucp_md_index_t dst_md_index;
ucp_md_index_t md_index;
Expand Down Expand Up @@ -3485,7 +3485,7 @@ void ucp_ep_get_lane_info_str(ucp_ep_h ucp_ep, ucp_lane_index_t lane,
ucs_string_buffer_t *lane_info_strb)
{
ucp_rsc_index_t rsc_index;
uct_tl_resource_desc_t *tl_rsc;
uct_tl_resource_desc_v2_t *tl_rsc;

if (lane == UCP_NULL_LANE) {
ucs_string_buffer_appendf(lane_info_strb, "NULL lane");
Expand Down Expand Up @@ -3691,7 +3691,7 @@ static ucs_status_t ucp_ep_query_transport(ucp_ep_h ep, ucp_ep_attr_t *attr)
{
ucp_worker_h worker = ep->worker;
ucp_ep_config_t *config = ucp_ep_config(ep);
const uct_tl_resource_desc_t *rsc;
const uct_tl_resource_desc_v2_t *rsc;
ucp_transport_entry_t *transport_entry;
size_t device_limit;
size_t transport_limit;
Expand Down
2 changes: 1 addition & 1 deletion src/ucp/core/ucp_ep.inl
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ static inline ucp_rsc_index_t ucp_ep_get_rsc_index(ucp_ep_h ep, ucp_lane_index_t
return ucp_ep_config(ep)->key.lanes[lane].rsc_index;
}

static inline const uct_tl_resource_desc_t *
static inline const uct_tl_resource_desc_v2_t *
ucp_ep_get_tl_rsc(ucp_ep_h ep, ucp_lane_index_t lane)
{
return &ep->worker->context->tl_rscs[ucp_ep_get_rsc_index(ep, lane)].tl_rsc;
Expand Down
5 changes: 4 additions & 1 deletion src/ucp/core/ucp_worker.c
Original file line number Diff line number Diff line change
Expand Up @@ -2961,7 +2961,10 @@ static ucs_status_t ucp_worker_address_pack(ucp_worker_h worker,
if (address_flags & UCP_WORKER_ADDRESS_FLAG_NET_ONLY) {
UCS_STATIC_BITMAP_RESET_ALL(&tl_bitmap);
UCS_STATIC_BITMAP_FOR_EACH_BIT(tl_id, &worker->context->tl_bitmap) {
if (context->tl_rscs[tl_id].tl_rsc.dev_type == UCT_DEVICE_TYPE_NET) {
if ((context->tl_rscs[tl_id].tl_rsc.dev_type ==
UCT_DEVICE_TYPE_NET) ||
(context->tl_rscs[tl_id].tl_rsc.flags &
UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE)) {
UCS_STATIC_BITMAP_SET(&tl_bitmap, tl_id);
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/ucp/proto/proto_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,8 @@ void ucp_proto_common_lane_perf_node(ucp_context_h context,
const uct_perf_attr_t *perf_attr,
ucp_proto_perf_node_t **perf_node_p)
{
const uct_tl_resource_desc_t *tl_rsc = &context->tl_rscs[rsc_index].tl_rsc;
const uct_tl_resource_desc_v2_t *tl_rsc =
&context->tl_rscs[rsc_index].tl_rsc;
ucp_proto_perf_node_t *perf_node;

if (perf_attr->operation == UCT_EP_OP_LAST) {
Expand Down
17 changes: 9 additions & 8 deletions src/ucp/wireup/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ ucp_wireup_test_select_flags(const ucp_wireup_select_flags_t *select_flags,
}

static int
ucp_wireup_check_select_flags(const uct_tl_resource_desc_t *resource,
ucp_wireup_check_select_flags(const uct_tl_resource_desc_v2_t *resource,
uint64_t flags,
const ucp_wireup_select_flags_t *select_flags,
const char *title, const char **flag_descs,
Expand All @@ -263,7 +263,7 @@ ucp_wireup_check_select_flags(const uct_tl_resource_desc_t *resource,
return 1;
}

static int ucp_wireup_check_flags(const uct_tl_resource_desc_t *resource,
static int ucp_wireup_check_flags(const uct_tl_resource_desc_v2_t *resource,
uint64_t flags, uint64_t select_flags,
const char *title, const char **flag_descs,
char *reason, size_t max)
Expand All @@ -275,7 +275,7 @@ static int ucp_wireup_check_flags(const uct_tl_resource_desc_t *resource,
flag_descs, reason, max);
}

static int ucp_wireup_check_amo_flags(const uct_tl_resource_desc_t *resource,
static int ucp_wireup_check_amo_flags(const uct_tl_resource_desc_v2_t *resource,
uint64_t flags, uint64_t required_flags,
int op_size, int fetch,
const char *title, char *reason,
Expand Down Expand Up @@ -306,9 +306,10 @@ ucp_wireup_check_keepalive(const ucp_wireup_select_params_t *select_params,
const char *title, int is_keepalive,
const char **flag_descs, char *reason, size_t max)
{
ucp_worker_h worker = select_params->ep->worker;
ucp_context_h context = worker->context;
const uct_tl_resource_desc_t *resource = &context->tl_rscs[rsc_index].tl_rsc;
ucp_worker_h worker = select_params->ep->worker;
ucp_context_h context = worker->context;
const uct_tl_resource_desc_v2_t *resource =
&context->tl_rscs[rsc_index].tl_rsc;
char title_keepalive[128];
char title_ep_check[128];
char title_am_based[128];
Expand Down Expand Up @@ -404,7 +405,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
ucp_tl_addr_bitmap_t addr_index_map, rsc_addr_index_map;
const ucp_wireup_lane_desc_t *lane_desc;
unsigned addr_index;
uct_tl_resource_desc_t *resource;
uct_tl_resource_desc_v2_t *resource;
const ucp_address_entry_t *ae;
ucp_worker_iface_t *wiface;
ucp_rsc_index_t rsc_index;
Expand Down Expand Up @@ -2072,7 +2073,7 @@ ucp_wireup_select_wireup_msg_lane(ucp_worker_h worker,
ucp_context_h context = worker->context;
ucp_lane_index_t p2p_lane = UCP_NULL_LANE;
ucp_wireup_criteria_t criteria = {0};
uct_tl_resource_desc_t *resource;
uct_tl_resource_desc_v2_t *resource;
ucp_rsc_index_t rsc_index;
uct_iface_attr_t *attrs;
ucp_lane_index_t lane;
Expand Down
106 changes: 106 additions & 0 deletions src/uct/api/v2/uct_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,112 @@ int uct_iface_is_reachable_v2(uct_iface_h iface,
const uct_iface_is_reachable_params_t *params);


/**
* @ingroup UCT_RESOURCE
* @brief Capability flags of @ref uct_tl_resource_desc_v2_t.
*
* The enumeration defines the bit mask of capabilities reported in @ref
* uct_tl_resource_desc_v2_t::flags, set by @ref uct_md_query_tl_resources_v2.
*/
typedef enum {
/**
* If set, the resource supports inter-node communications.
*/
UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE = UCS_BIT(0)
} uct_md_query_tl_resources_flags_t;


/**
* @ingroup UCT_RESOURCE
* @brief Parameters passed to @ref uct_md_query_tl_resources_v2.
*/
typedef struct {
/**
* Mask of valid fields in this structure. It must currently be set to
* zero.
* Future fields not specified in this mask will be ignored.
* Provides ABI compatibility with respect to adding new fields.
*/
uint64_t field_mask;
} uct_md_query_tl_resources_params_t;


/**
* @ingroup UCT_RESOURCE
* @brief Communication resource descriptor.
*
* A resource descriptor is an object representing a network resource.
* Resource descriptor could represent a stand-alone communication resource
* such as an HCA port, network interface, or multiple resources such as
* multiple network interfaces or communication ports. It could also represent
* virtual communication resources that are defined over a single physical
* network interface.
*/
typedef struct uct_tl_resource_desc_v2 {
tvegas1 marked this conversation as resolved.
Show resolved Hide resolved
/**
* Transport name
*/
char tl_name[UCT_TL_NAME_MAX];

/**
* Hardware device name
*/
char dev_name[UCT_DEVICE_NAME_MAX];

/**
* Device represented by this resource
* (e.g. UCT_DEVICE_TYPE_NET for a network interface)
*/
uct_device_type_t dev_type;

/**
* The identifier associated with the device bus_id as captured in
* @ref ucs_sys_bus_id_t
*/
ucs_sys_device_t sys_device;

/**
* Associated resource flags using bits from @ref
* uct_md_query_tl_resources_flags_t.
*/
uint64_t flags;
} uct_tl_resource_desc_v2_t;


/**
* @ingroup UCT_RESOURCE
* @brief Query for transport resources.
*
* This routine queries the @ref uct_md_h "memory domain" for communication
* resources that are available for it. The returned array should be released
* with @ref uct_release_tl_resource_list_v2.
*
* @param [in] md Handle to memory domain.
* @param [inout] params Parameters as defined in @ref
* uct_md_query_tl_resources_params_t.
* @param [out] resources_p Filled with a pointer to an array of resource
* descriptors.
* @param [out] num_resources_p Filled with the number of resources in the array.
*
* @return UCS_OK on success, otherwise an error code.
*/
ucs_status_t
uct_md_query_tl_resources_v2(uct_md_h md,
uct_md_query_tl_resources_params_t *params,
uct_tl_resource_desc_v2_t **resources_p,
unsigned *num_resources_p);


/**
* @ingroup UCT_RESOURCE
* @brief Release the list of resources returned from
* @ref uct_md_query_tl_resources_v2.
*
* This routine releases the memory associated with the list of resources
* allocated by @ref uct_md_query_tl_resources_v2. The array must not be
* accessed after it has been released.
*
* @param [in] resources Array of resource descriptors to release.
*/
void uct_release_tl_resource_list_v2(uct_tl_resource_desc_v2_t *resources);


/**
* @ingroup UCT_RESOURCE
* @brief Connect endpoint to a remote endpoint.
Expand Down
1 change: 1 addition & 0 deletions src/uct/base/uct_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ typedef struct uct_tl_device_resource {
(e.g. UCT_DEVICE_TYPE_NET for a network interface) */
ucs_sys_device_t sys_device; /**< The identifier associated with the device
bus_id as captured in ucs_sys_bus_id_t struct */
uint64_t flags; /**< Associated flags to the resource */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need it to carry the UCT_TL_RESOURCE_DESC_FLAG_INTER_NODE flag, which UCP uses when packing net-only addresses.

Copy link
Contributor Author

@tvegas1 tvegas1 Sep 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Double-checked again: this is an internal structure populated from ->query_devices() and is not public, so we need to keep it.

} uct_tl_device_resource_t;


Expand Down
Loading
Loading