Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCP: Enable ppln protos with cuda buffers by default #10104

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions src/ucp/am/eager_multi.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ ucp_am_eager_multi_bcopy_proto_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING |
UCP_PROTO_COMMON_INIT_FLAG_RESUME,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.max_lanes = context->config.ext.max_eager_lanes,
.initial_reg_md_map = 0,
.first.lane_type = UCP_LANE_TYPE_AM,
Expand Down Expand Up @@ -197,6 +198,7 @@ ucp_am_eager_multi_zcopy_proto_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = init_params->select_param->mem_type,
.max_lanes = context->config.ext.max_eager_lanes,
.initial_reg_md_map = 0,
.opt_align_offs = UCP_PROTO_COMMON_OFFSET_INVALID,
Expand Down
3 changes: 3 additions & 0 deletions src/ucp/am/eager_single.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ ucp_am_eager_short_probe_common(const ucp_proto_init_params_t *init_params,
UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.lane_type = UCP_LANE_TYPE_AM,
.tl_cap_flags = UCT_IFACE_FLAG_AM_SHORT
};
Expand Down Expand Up @@ -240,6 +241,7 @@ static void ucp_am_eager_single_bcopy_probe_common(
UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.lane_type = UCP_LANE_TYPE_AM,
.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY
};
Expand Down Expand Up @@ -330,6 +332,7 @@ static void ucp_am_eager_single_zcopy_probe_common(
UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = init_params->select_param->mem_type,
.lane_type = UCP_LANE_TYPE_AM,
.tl_cap_flags = UCT_IFACE_FLAG_AM_ZCOPY
};
Expand Down
14 changes: 9 additions & 5 deletions src/ucp/core/ucp_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -340,11 +340,15 @@ static ucs_config_field_t ucp_context_config_table[] = {
"and the resulting performance.",
ucs_offsetof(ucp_context_config_t, estimated_num_ppn), UCS_CONFIG_TYPE_ULUNITS},

{"RNDV_FRAG_MEM_TYPE", "host",
"Memory type of fragments used for RNDV pipeline protocol.\n"
"Allowed memory types is one of: host, cuda, rocm, ze-host, ze-device",
ucs_offsetof(ucp_context_config_t, rndv_frag_mem_type),
UCS_CONFIG_TYPE_ENUM(ucs_memory_type_names)},
{"RNDV_FRAG_MEM_TYPE", NULL, "",
ucs_offsetof(ucp_context_config_t, rndv_frag_mem_types),
UCS_CONFIG_TYPE_BITMAP(ucs_memory_type_names)},

{"RNDV_FRAG_MEM_TYPES", "host,cuda",
"Memory types of fragments used for RNDV pipeline protocol.\n"
"Allowed memory types are: host, cuda, rocm, ze-host, ze-device",
ucs_offsetof(ucp_context_config_t, rndv_frag_mem_types),
UCS_CONFIG_TYPE_BITMAP(ucs_memory_type_names)},

{"RNDV_PIPELINE_SEND_THRESH", "inf",
"RNDV size threshold to enable sender side pipeline for mem type",
Expand Down
4 changes: 2 additions & 2 deletions src/ucp/core/ucp_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ typedef struct ucp_context_config {
size_t rndv_frag_size[UCS_MEMORY_TYPE_LAST];
/** Number of RNDV pipeline fragments per allocation */
size_t rndv_num_frags[UCS_MEMORY_TYPE_LAST];
/** Memory type of fragments used for RNDV pipeline protocol */
ucs_memory_type_t rndv_frag_mem_type;
/** Memory types of fragments used for RNDV pipeline protocol */
uint64_t rndv_frag_mem_types;
/** RNDV pipeline send threshold */
size_t rndv_pipeline_send_thresh;
/** Enabling 2-stage pipeline rndv protocol */
Expand Down
19 changes: 10 additions & 9 deletions src/ucp/core/ucp_mm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1700,27 +1700,28 @@ ucs_status_t
ucp_mm_get_alloc_md_index(ucp_context_h context, ucp_md_index_t *md_idx,
ucs_memory_type_t alloc_mem_type)
{
ucs_status_t status;
ucs_status_t status = UCS_OK;
uct_allocated_memory_t mem;

if (!context->alloc_md[alloc_mem_type].initialized) {
context->alloc_md[alloc_mem_type].initialized = 1;

status = ucp_mem_do_alloc(context, NULL, 1,
UCT_MD_MEM_ACCESS_RMA |
UCT_MD_MEM_FLAG_HIDE_ERRORS,
alloc_mem_type, "get_alloc_md_id",
&mem);
if (status != UCS_OK) {
return status;
if (status == UCS_OK) {
context->alloc_md[alloc_mem_type].md_index =
ucp_mem_get_md_index(context, mem.md, mem.method);
uct_mem_free(&mem);
} else {
context->alloc_md[alloc_mem_type].md_index = UCP_NULL_RESOURCE;
}

context->alloc_md[alloc_mem_type].initialized = 1;
context->alloc_md[alloc_mem_type].md_index =
ucp_mem_get_md_index(context, mem.md, mem.method);
uct_mem_free(&mem);
}

*md_idx = context->alloc_md[alloc_mem_type].md_index;
return UCS_OK;
return status;
}

static ucs_status_t
Expand Down
27 changes: 13 additions & 14 deletions src/ucp/proto/proto_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,9 @@ ucp_lane_index_t
ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params,
uct_ep_operation_t memtype_op, unsigned flags,
ptrdiff_t max_iov_offs, size_t min_iov,
ucp_lane_type_t lane_type, uint64_t tl_cap_flags,
ucp_lane_index_t max_lanes,
ucp_lane_type_t lane_type,
ucs_memory_type_t reg_mem_type,
uint64_t tl_cap_flags, ucp_lane_index_t max_lanes,
ucp_lane_map_t exclude_map, ucp_lane_index_t *lanes)
{
UCS_STRING_BUFFER_ONSTACK(sel_param_strb, UCP_PROTO_SELECT_PARAM_STR_MAX);
Expand Down Expand Up @@ -494,25 +495,23 @@ ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params,
}

/* Check memory registration capabilities when a registration memory type is requested (e.g. the zero-copy case) */
if (flags & UCP_PROTO_COMMON_INIT_FLAG_SEND_ZCOPY) {
if (reg_mem_type != UCS_MEMORY_TYPE_UNKNOWN) {
if (md_attr->flags & UCT_MD_FLAG_NEED_MEMH) {
/* Memory domain must support registration on the relevant
* memory type */
if (!(context->reg_md_map[select_param->mem_type] &
UCS_BIT(md_index))) {
if (!(context->reg_md_map[reg_mem_type] & UCS_BIT(md_index))) {
ucs_trace("%s: md %s cannot register %s memory", lane_desc,
context->tl_mds[md_index].rsc.md_name,
ucs_memory_type_names[select_param->mem_type]);
ucs_memory_type_names[reg_mem_type]);
continue;
}
} else if (!(md_attr->access_mem_types &
UCS_BIT(select_param->mem_type))) {
} else if (!(md_attr->access_mem_types & UCS_BIT(reg_mem_type))) {
/*
* Memory domain which does not require a registration for zero
* copy operation must be able to access the relevant memory type
*/
ucs_trace("%s: no access to mem type %s", lane_desc,
ucs_memory_type_names[select_param->mem_type]);
ucs_memory_type_names[reg_mem_type]);
continue;
}
}
Expand Down Expand Up @@ -605,11 +604,11 @@ ucp_lane_index_t ucp_proto_common_find_lanes_with_min_frag(
const uct_iface_attr_t *iface_attr;
size_t tl_min_frag, tl_max_frag;

num_lanes = ucp_proto_common_find_lanes(&params->super, params->memtype_op,
params->flags, params->max_iov_offs,
params->min_iov, lane_type,
tl_cap_flags, max_lanes,
exclude_map, lanes);
num_lanes = ucp_proto_common_find_lanes(
&params->super, params->memtype_op, params->flags,
params->max_iov_offs, params->min_iov, lane_type,
params->reg_mem_type, tl_cap_flags, max_lanes, exclude_map,
lanes);

num_valid_lanes = 0;
for (lane_index = 0; lane_index < num_lanes; ++lane_index) {
Expand Down
9 changes: 7 additions & 2 deletions src/ucp/proto/proto_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ typedef enum {

/* Supports starting the request when its datatype iterator offset is > 0 */
UCP_PROTO_COMMON_INIT_FLAG_RESUME = UCS_BIT(10),
UCP_PROTO_COMMON_KEEP_MD_MAP = UCS_BIT(11)
} ucp_proto_common_init_flags_t;


Expand Down Expand Up @@ -120,6 +121,9 @@ typedef struct {

/* Map of unsuitable lanes */
ucp_lane_map_t exclude_map;

/* Memory type that the transport should be capable of registering. UCS_MEMORY_TYPE_UNKNOWN skips the registration/access check during lane selection. */
ucs_memory_type_t reg_mem_type;
} ucp_proto_common_init_params_t;


Expand Down Expand Up @@ -255,8 +259,9 @@ ucp_lane_index_t
ucp_proto_common_find_lanes(const ucp_proto_init_params_t *params,
uct_ep_operation_t memtype_op, unsigned flags,
ptrdiff_t max_iov_offs, size_t min_iov,
ucp_lane_type_t lane_type, uint64_t tl_cap_flags,
ucp_lane_index_t max_lanes,
ucp_lane_type_t lane_type,
ucs_memory_type_t reg_mem_type,
uint64_t tl_cap_flags, ucp_lane_index_t max_lanes,
ucp_lane_map_t exclude_map,
ucp_lane_index_t *lanes);

Expand Down
15 changes: 12 additions & 3 deletions src/ucp/proto/proto_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ ucp_proto_common_init_send_perf(const ucp_proto_common_init_params_t *params,
{
ucp_proto_perf_node_t *child_perf_node;
ucs_linear_func_t send_overhead;
ucs_memory_type_t remote_mem_type;
ucs_status_t status;

send_perf->node = ucp_proto_perf_node_new_data("send-ovrh", "");
Expand All @@ -494,10 +495,18 @@ ucp_proto_common_init_send_perf(const ucp_proto_common_init_params_t *params,
send_overhead = UCS_LINEAR_FUNC_ZERO;
} else {
ucs_assert(reg_md_map == 0);

if ((params->flags & UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS) &&
(params->reg_mem_type != UCS_MEMORY_TYPE_UNKNOWN)) {
remote_mem_type = params->reg_mem_type;
} else {
remote_mem_type = UCS_MEMORY_TYPE_HOST;
}

status = ucp_proto_init_buffer_copy_time(
params->super.worker, "send copy", UCS_MEMORY_TYPE_HOST,
params->super.select_param->mem_type, params->memtype_op,
&send_overhead, &child_perf_node);
params->super.worker, "send copy",
params->super.select_param->mem_type, remote_mem_type,
params->memtype_op, &send_overhead, &child_perf_node);
if (status != UCS_OK) {
ucp_proto_perf_node_deref(&send_perf->node);
return status;
Expand Down
1 change: 1 addition & 0 deletions src/ucp/rma/amo_offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ static void ucp_proto_amo_probe(const ucp_proto_init_params_t *init_params,
UCP_PROTO_COMMON_INIT_FLAG_RECV_ZCOPY |
UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.lane_type = UCP_LANE_TYPE_AMO,
.tl_cap_flags = 0
};
Expand Down
1 change: 1 addition & 0 deletions src/ucp/rma/amo_sw.c
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,7 @@ static void ucp_proto_amo_sw_probe(const ucp_proto_init_params_t *init_params,
.super.flags = flags | UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG |
UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.lane_type = UCP_LANE_TYPE_AM,
.tl_cap_flags = 0
};
Expand Down
1 change: 1 addition & 0 deletions src/ucp/rma/get_am.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ ucp_proto_get_am_bcopy_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_CAP_SEG_SIZE |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.lane_type = UCP_LANE_TYPE_AM,
.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY
};
Expand Down
2 changes: 2 additions & 0 deletions src/ucp/rma/get_offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ ucp_proto_get_offload_bcopy_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS |
UCP_PROTO_COMMON_INIT_FLAG_RESPONSE,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.max_lanes = UCP_PROTO_RMA_MAX_BCOPY_LANES,
.initial_reg_md_map = 0,
.first.tl_cap_flags = UCT_IFACE_FLAG_GET_BCOPY,
Expand Down Expand Up @@ -202,6 +203,7 @@ ucp_proto_get_offload_zcopy_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_RESPONSE |
UCP_PROTO_COMMON_INIT_FLAG_MIN_FRAG,
.super.exclude_map = 0,
.super.reg_mem_type = init_params->select_param->mem_type,
.max_lanes = context->config.ext.max_rma_lanes,
.initial_reg_md_map = 0,
.first.tl_cap_flags = UCT_IFACE_FLAG_GET_ZCOPY,
Expand Down
1 change: 1 addition & 0 deletions src/ucp/rma/put_am.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ ucp_proto_put_am_bcopy_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING |
UCP_PROTO_COMMON_INIT_FLAG_RESUME,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.max_lanes = 1,
.initial_reg_md_map = 0,
.first.tl_cap_flags = UCT_IFACE_FLAG_AM_BCOPY,
Expand Down
3 changes: 3 additions & 0 deletions src/ucp/rma/put_offload.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ ucp_proto_put_offload_short_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_SINGLE_FRAG |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.lane_type = UCP_LANE_TYPE_RMA,
.tl_cap_flags = UCT_IFACE_FLAG_PUT_SHORT
};
Expand Down Expand Up @@ -166,6 +167,7 @@ ucp_proto_put_offload_bcopy_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.max_lanes = UCP_PROTO_RMA_MAX_BCOPY_LANES,
.initial_reg_md_map = 0,
.first.tl_cap_flags = UCT_IFACE_FLAG_PUT_BCOPY,
Expand Down Expand Up @@ -254,6 +256,7 @@ ucp_proto_put_offload_zcopy_probe(const ucp_proto_init_params_t *init_params)
UCP_PROTO_COMMON_INIT_FLAG_REMOTE_ACCESS |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = init_params->select_param->mem_type,
.max_lanes = context->config.ext.max_rma_lanes,
.initial_reg_md_map = 0,
.first.tl_cap_flags = UCT_IFACE_FLAG_PUT_ZCOPY,
Expand Down
7 changes: 6 additions & 1 deletion src/ucp/rndv/proto_rndv.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ ucp_proto_rndv_ctrl_perf(const ucp_proto_init_params_t *params,
return UCS_OK;
}

ucs_assert(lane < UCP_MAX_LANES);

perf_attr.field_mask = UCT_PERF_ATTR_FIELD_OPERATION |
UCT_PERF_ATTR_FIELD_SEND_PRE_OVERHEAD |
UCT_PERF_ATTR_FIELD_SEND_POST_OVERHEAD |
Expand Down Expand Up @@ -268,7 +270,8 @@ ucp_proto_rndv_ctrl_init_priv(const ucp_proto_rndv_ctrl_init_params_t *params,

/* Use only memory domains for which the unpacking of the remote key was
* successful, unless UCP_PROTO_COMMON_KEEP_MD_MAP is set in the protocol flags */
if (init_params->rkey_config_key != NULL) {
if ((init_params->rkey_config_key != NULL) &&
!(params->super.flags & UCP_PROTO_COMMON_KEEP_MD_MAP)) {
rpriv->md_map &= ~init_params->rkey_config_key->unreachable_md_map;
}

Expand Down Expand Up @@ -522,6 +525,7 @@ ucp_proto_rndv_find_ctrl_lane(const ucp_proto_init_params_t *params)
UCP_PROTO_COMMON_INIT_FLAG_HDR_ONLY,
UCP_PROTO_COMMON_OFFSET_INVALID, 1,
UCP_LANE_TYPE_AM,
UCS_MEMORY_TYPE_UNKNOWN,
UCT_IFACE_FLAG_AM_BCOPY, 1, 0,
&lane);
if (num_lanes == 0) {
Expand Down Expand Up @@ -556,6 +560,7 @@ void ucp_proto_rndv_rts_probe(const ucp_proto_init_params_t *init_params)
.super.flags = UCP_PROTO_COMMON_INIT_FLAG_RESPONSE |
UCP_PROTO_COMMON_INIT_FLAG_ERR_HANDLING,
.super.exclude_map = 0,
.super.reg_mem_type = UCS_MEMORY_TYPE_UNKNOWN,
.remote_op_id = UCP_OP_ID_RNDV_RECV,
.lane = ucp_proto_rndv_find_ctrl_lane(init_params),
.unpack_time = UCS_LINEAR_FUNC_ZERO,
Expand Down
1 change: 1 addition & 0 deletions src/ucp/rndv/proto_rndv.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ typedef struct {
*/
typedef struct {
ucp_proto_rndv_ack_priv_t super;
ucs_memory_type_t frag_mem_type;

/* Multi-lane common part. Must be the last field, see
@ref ucp_proto_multi_priv_t */
Expand Down
Loading
Loading