Skip to content

Commit

Permalink
Merge pull request #9986 from michal-shalev/suggest-pf-log-bar-size
Browse files Browse the repository at this point in the history
UCT/IB/MLX5: suggest user to increase PF_LOG_BAR_SIZE when UAR allocation fails
  • Loading branch information
michal-shalev committed Aug 11, 2024
2 parents bb5d8bf + 684f818 commit 600b468
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 43 deletions.
2 changes: 1 addition & 1 deletion src/uct/ib/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ AS_IF([test "x$with_ib" = "xyes"],
MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE,
MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE,
MLX5DV_UAR_ALLOC_TYPE_BF,
MLX5DV_UAR_ALLOC_TYPE_NC,
MLX5DV_UAR_ALLOC_TYPE_NC_DEDICATED,
mlx5dv_devx_umem_reg_ex],
[], [], [[#include <infiniband/mlx5dv.h>]])
AC_CHECK_MEMBERS([struct mlx5dv_cq.cq_uar],
Expand Down
16 changes: 1 addition & 15 deletions src/uct/ib/mlx5/dv/ib_mlx5dv_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -1883,20 +1883,6 @@ static void uct_ib_mlx5_md_port_counter_set_id_init(uct_ib_mlx5_md_t *md)
}
}

/*
 * Check that a DevX UAR can be set up for the given memory domain.
 *
 * A temporary UAR is initialized with mmio mode 0 and released right away;
 * only the outcome of the initialization matters.
 *
 * Returns UCS_OK when the initialization succeeded, and UCS_ERR_UNSUPPORTED
 * for any failure status reported by uct_ib_mlx5_devx_uar_init().
 */
static int uct_ib_mlx5_check_uar(uct_ib_mlx5_md_t *md)
{
    uct_ib_mlx5_devx_uar_t probe_uar;

    if (uct_ib_mlx5_devx_uar_init(&probe_uar, md, 0) != UCS_OK) {
        /* Every init failure is collapsed into "unsupported" */
        return UCS_ERR_UNSUPPORTED;
    }

    /* The UAR was needed only as a probe */
    uct_ib_mlx5_devx_uar_cleanup(&probe_uar);
    return UCS_OK;
}

ucs_status_t
uct_ib_mlx5_devx_device_mem_alloc(uct_md_h uct_md, size_t *length_p,
void **address_p, ucs_memory_type_t mem_type,
Expand Down Expand Up @@ -2127,7 +2113,7 @@ ucs_status_t uct_ib_mlx5_devx_md_open(struct ibv_device *ibv_device,
goto err_free_context;
}

status = uct_ib_mlx5_check_uar(md);
status = uct_ib_mlx5_devx_check_uar(md);
if (status != UCS_OK) {
goto err_free_md;
}
Expand Down
106 changes: 83 additions & 23 deletions src/uct/ib/mlx5/ib_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,48 +543,108 @@ int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
}

#if HAVE_DEVX
/*
 * Allocate a DevX UAR by calling mlx5dv_devx_alloc_uar() on the device
 * context of md with the given allocation flags. On success the UAR is
 * stored in *uar_p and UCS_OK is returned.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so lines of
 * the previous version (log_level/title/fallback parameters, sprintf-based
 * message) and of the new version (errno-based status mapping) appear
 * together. Confirm against the repository which lines are live.
 */
static ucs_status_t
uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md, unsigned flags, int log_level,
char *title, char *fallback,
struct mlx5dv_devx_uar **uar_p)
static ucs_status_t uct_ib_mlx5_devx_alloc_uar(uct_ib_mlx5_md_t *md,
uint32_t flags,
struct mlx5dv_devx_uar **uar_p)
{
/* Any flags value other than the WC type is labelled "NC_DEDICATED" */
const char *uar_type_str = (flags == UCT_IB_MLX5_UAR_ALLOC_TYPE_WC) ?
"WC" : "NC_DEDICATED";
/* Failures are logged at diag level unless raised below */
ucs_log_level_t err_log_level = UCS_LOG_LEVEL_DIAG;
UCS_STRING_BUFFER_ONSTACK(strb, 256);
struct mlx5dv_devx_uar *uar;
char buf[512];
ucs_status_t status;
int err;

uar = mlx5dv_devx_alloc_uar(md->super.dev.ibv_context, flags);
if (uar == NULL) {
sprintf(buf, "mlx5dv_devx_alloc_uar(device=%s, flags=0x%x(%s)) "
"failed: %m", uct_ib_device_name(&md->super.dev), flags, title);
if (fallback == NULL) {
ucs_log(log_level, "%s", buf);
} else {
ucs_log(log_level, "%s, fallback to %s", buf, fallback);
}
if (uar != NULL) {
*uar_p = uar;
return UCS_OK;
}

return UCS_ERR_NO_MEMORY;
/* Save errno right away, before later calls can overwrite it */
err = errno;
ucs_string_buffer_appendf(&strb,
"mlx5dv_devx_alloc_uar(device=%s, flags=0x%x)"
"type=%s failed: %s. ",
uct_ib_device_name(&md->super.dev),
flags,
uar_type_str,
strerror(err));
/*
 * Translate the saved errno into a UCS status. Only ENOMEM raises the log
 * level to error and appends the PF_LOG_BAR_SIZE hint to the message.
 */
switch (err) {
case ENOMEM:
ucs_string_buffer_appendf(&strb,
"Consider increasing PF_LOG_BAR_SIZE "
"using mlxconfig tool (requires reboot)");
err_log_level = UCS_LOG_LEVEL_ERROR;
status = UCS_ERR_NO_MEMORY;
break;
case EOPNOTSUPP:
status = UCS_ERR_UNSUPPORTED;
break;
case EINVAL:
status = UCS_ERR_INVALID_PARAM;
break;
default:
/* Any other errno value is reported as UCS_ERR_NO_MEMORY */
status = UCS_ERR_NO_MEMORY;
break;
}

*uar_p = uar;
return UCS_OK;
ucs_log(err_log_level, "%s", ucs_string_buffer_cstr(&strb));
return status;
}
#endif

/**
 * Probe which UAR allocation type can be used on the device of @a md.
 *
 * A WC allocation is attempted first. Only when it fails with
 * UCS_ERR_UNSUPPORTED is an NC_DEDICATED allocation attempted as a fallback;
 * any other WC failure is returned as is, without fallback. The UAR that was
 * allocated for the probe is released before returning.
 *
 * @param [in,out] md  Memory domain to probe. UCT_IB_MLX5_MD_FLAG_UAR_USE_WC
 *                     is set in md->flags when the WC allocation succeeds and
 *                     is left untouched otherwise.
 *
 * @return UCS_OK if one of the two allocation types succeeded, otherwise the
 *         status of the allocation that failed. When built without DevX
 *         support, UCS_ERR_UNSUPPORTED is always returned.
 */
ucs_status_t uct_ib_mlx5_devx_check_uar(uct_ib_mlx5_md_t *md)
{
#if HAVE_DEVX
    uct_ib_mlx5_devx_uar_t uar;
    ucs_status_t status;

    status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_WC,
                                        &uar.uar);
    if (status == UCS_OK) {
        /* WC is supported - remember it for subsequent UAR allocations */
        md->flags |= UCT_IB_MLX5_MD_FLAG_UAR_USE_WC;
    } else if (status == UCS_ERR_UNSUPPORTED) {
        status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_NC,
                                            &uar.uar);
        if (status == UCS_ERR_UNSUPPORTED) {
            /* The literals are concatenated: keep exactly one space between
             * "types" and "are" */
            ucs_error("%s: both WC and NC_DEDICATED UAR allocation types "
                      "are not supported",
                      uct_ib_device_name(&md->super.dev));
        }

        if (status != UCS_OK) {
            return status;
        }

        /* NC_DEDICATED is supported - the WC flag is simply not set here */
    } else {
        /* The error is unrelated to the UAR allocation type, no fallback */
        return status;
    }

    /* The UAR was allocated only as a probe */
    uct_ib_mlx5_devx_uar_cleanup(&uar);
    return UCS_OK;
#else
    return UCS_ERR_UNSUPPORTED;
#endif
}

ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
uct_ib_mlx5_md_t *md,
uct_ib_mlx5_mmio_mode_t mmio_mode)
{
#if HAVE_DEVX
ucs_status_t status;
uint32_t flags;

status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_WC,
UCS_LOG_LEVEL_DEBUG, "WC", "NC",
&uar->uar);
if (status != UCS_OK) {
status = uct_ib_mlx5_devx_alloc_uar(md, UCT_IB_MLX5_UAR_ALLOC_TYPE_NC,
UCS_LOG_LEVEL_ERROR, "NC", NULL,
&uar->uar);
/* Use UCT_IB_MLX5_MD_FLAG_UAR_USE_WC to determine the supported UAR allocation type */
if (md->flags & UCT_IB_MLX5_MD_FLAG_UAR_USE_WC) {
flags = UCT_IB_MLX5_UAR_ALLOC_TYPE_WC;
} else {
flags = UCT_IB_MLX5_UAR_ALLOC_TYPE_NC;
}

status = uct_ib_mlx5_devx_alloc_uar(md, flags, &uar->uar);
if (status != UCS_OK) {
return status;
}
Expand Down
12 changes: 8 additions & 4 deletions src/uct/ib/mlx5/ib_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@
# define UCT_IB_MLX5_UAR_ALLOC_TYPE_WC 0x0
#endif

#if HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC
# define UCT_IB_MLX5_UAR_ALLOC_TYPE_NC MLX5DV_UAR_ALLOC_TYPE_NC
#if HAVE_DECL_MLX5DV_UAR_ALLOC_TYPE_NC_DEDICATED
# define UCT_IB_MLX5_UAR_ALLOC_TYPE_NC MLX5DV_UAR_ALLOC_TYPE_NC_DEDICATED
#else
# define UCT_IB_MLX5_UAR_ALLOC_TYPE_NC 0x1
# define UCT_IB_MLX5_UAR_ALLOC_TYPE_NC (1U << 31)
#endif

#define UCT_IB_MLX5_OPMOD_EXT_ATOMIC(_log_arg_size) \
Expand Down Expand Up @@ -197,9 +197,11 @@ enum {
UCT_IB_MLX5_MD_FLAG_MMO_DMA = UCS_BIT(15),
/* Device supports XGVMI UMR workflow */
UCT_IB_MLX5_MD_FLAG_XGVMI_UMR = UCS_BIT(16),
/* Device supports UAR WC allocation type */
UCT_IB_MLX5_MD_FLAG_UAR_USE_WC = UCS_BIT(17),

/* Object to be created by DevX */
UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 17,
UCT_IB_MLX5_MD_FLAG_DEVX_OBJS_SHIFT = 18,
UCT_IB_MLX5_MD_FLAG_DEVX_RC_QP = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCQP),
UCT_IB_MLX5_MD_FLAG_DEVX_RC_SRQ = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(RCSRQ),
UCT_IB_MLX5_MD_FLAG_DEVX_DCT = UCT_IB_MLX5_MD_FLAG_DEVX_OBJS(DCT),
Expand Down Expand Up @@ -858,6 +860,8 @@ int uct_ib_mlx5_devx_uar_cmp(uct_ib_mlx5_devx_uar_t *uar,
uct_ib_mlx5_md_t *md,
uct_ib_mlx5_mmio_mode_t mmio_mode);

ucs_status_t uct_ib_mlx5_devx_check_uar(uct_ib_mlx5_md_t *md);

ucs_status_t uct_ib_mlx5_devx_uar_init(uct_ib_mlx5_devx_uar_t *uar,
uct_ib_mlx5_md_t *md,
uct_ib_mlx5_mmio_mode_t mmio_mode);
Expand Down

0 comments on commit 600b468

Please sign in to comment.