From bdbc8fb340c9c10b917cd1bc92268d37eed936fb Mon Sep 17 00:00:00 2001
From: Jacob Moroni
Date: Tue, 3 Sep 2024 17:33:04 +0000
Subject: [PATCH] prov/verbs: add param for setting RDMA CM ToS

By default, if the RDMA CM application does not explicitly set a ToS,
then the system default is used. This system default can be changed
via the default_roce_tos configfs param, but this is global to the
system and is somewhat cumbersome to deal with.

This change introduces a new environment param: FI_VERBS_TOS

This param can be used to explicitly set the ToS via the
rdma_set_option API. If the ToS is set, then the system default is
ignored and the new value is used instead. This allows for multiple
concurrent workloads to use different ToS values.

Valid range is -1 through 255. If unset or set to -1, then the call
to rdma_set_option is omitted, thus preserving existing behavior.

Not supported/tested on Windows.

Signed-off-by: Jacob Moroni
---
 man/fi_verbs.7.md                      |  5 +++++
 prov/verbs/src/verbs_domain_xrc.c      |  7 +++++++
 prov/verbs/src/verbs_ep.c              |  3 +++
 prov/verbs/src/verbs_eq.c              |  3 +++
 prov/verbs/src/verbs_init.c            |  9 +++++++++
 prov/verbs/src/verbs_ofi.h             | 13 +++++++++++++
 prov/verbs/src/windows/verbs_nd_rdma.c |  7 +++++++
 7 files changed, 47 insertions(+)

diff --git a/man/fi_verbs.7.md b/man/fi_verbs.7.md
index 0d42c06e63b..cdecaf99bf0 100644
--- a/man/fi_verbs.7.md
+++ b/man/fi_verbs.7.md
@@ -157,6 +157,11 @@ The verbs provider checks for the following environment variables.
 
 ### Common variables:
 
+*FI_VERBS_TOS*
+: RDMA CM ToS value. If unset or set to -1, then the ToS will not be
+  explicitly set and the system default will be used. Valid range is -1
+  through 255.
+
 *FI_VERBS_TX_SIZE*
 : Default maximum tx context size (default: 384)
 
diff --git a/prov/verbs/src/verbs_domain_xrc.c b/prov/verbs/src/verbs_domain_xrc.c
index 2c95c948a52..6e706e9111b 100644
--- a/prov/verbs/src/verbs_domain_xrc.c
+++ b/prov/verbs/src/verbs_domain_xrc.c
@@ -67,6 +67,10 @@ static int vrb_create_ini_qp(struct vrb_xrc_ep *ep)
 			 "XRC INI QP rdma_create_qp_ex failed %d\n", -ret);
 		return ret;
 	}
+
+	if (vrb_rdma_set_tos(ep->base_ep.id))
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "vrb_rdma_set_tos");
+
 	return FI_SUCCESS;
 #else /* VERBS_HAVE_XRC */
 	return -FI_ENOSYS;
@@ -400,6 +404,9 @@ int vrb_ep_create_tgt_qp(struct vrb_xrc_ep *ep, uint32_t tgt_qpn)
 	}
 	ep->tgt_ibv_qp = ep->tgt_id->qp;
 
+	if (vrb_rdma_set_tos(ep->tgt_id))
+		VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "vrb_rdma_set_tos");
+
 	return FI_SUCCESS;
 #else /* VERBS_HAVE_XRC */
 	return -FI_ENOSYS;
diff --git a/prov/verbs/src/verbs_ep.c b/prov/verbs/src/verbs_ep.c
index 63aea82778d..b4c2e4b8c21 100644
--- a/prov/verbs/src/verbs_ep.c
+++ b/prov/verbs/src/verbs_ep.c
@@ -1063,6 +1063,9 @@ static int vrb_ep_enable(struct fid_ep *ep_fid)
 			/* Allow shared XRC INI QP not controlled by RDMA CM
 			 * to share same post functions as RC QP. */
 			ep->ibv_qp = ep->id->qp;
+
+			if (vrb_rdma_set_tos(ep->id))
+				VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "vrb_rdma_set_tos");
 		}
 		break;
 	case FI_EP_DGRAM:
diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c
index f9bc78a828f..590dc6d2107 100644
--- a/prov/verbs/src/verbs_eq.c
+++ b/prov/verbs/src/verbs_eq.c
@@ -893,6 +893,9 @@ vrb_eq_addr_resolved_event(struct vrb_ep *ep)
 		/* Allow shared XRC INI QP not controlled by RDMA CM
 		 * to share same post functions as RC QP. */
 		ep->ibv_qp = ep->id->qp;
+
+		if (vrb_rdma_set_tos(ep->id))
+			VRB_WARN_ERRNO(FI_LOG_EP_CTRL, "vrb_rdma_set_tos");
 	}
 
 	assert(ep->ibv_qp);
diff --git a/prov/verbs/src/verbs_init.c b/prov/verbs/src/verbs_init.c
index 67b999d2a61..279ee91e14b 100644
--- a/prov/verbs/src/verbs_init.c
+++ b/prov/verbs/src/verbs_init.c
@@ -45,6 +45,7 @@ static const char *local_node = "localhost";
 #define VERBS_DEFAULT_MIN_RNR_TIMER 12
 
 struct vrb_gl_data vrb_gl_data = {
+	.tos = VERBS_TOS_UNSET,
 	.def_tx_size = 384,
 	.def_rx_size = 384,
 	.def_tx_iov_limit = 4,
@@ -637,6 +638,14 @@ static int vrb_get_param_str(const char *param_name,
 int vrb_read_params(void)
 {
 	/* Common parameters */
+	if (vrb_get_param_int("tos", "RDMA CM ToS value. If unset or set to -1, then "
+			      "the ToS will not be explicitly set and the system "
+			      "default will be used. Valid range is -1 through 255.",
+			      &vrb_gl_data.tos) ||
+	    (vrb_gl_data.tos < -1 || vrb_gl_data.tos > 255)) {
+		VRB_WARN(FI_LOG_CORE, "Invalid value of ToS\n");
+		return -FI_EINVAL;
+	}
 	if (vrb_get_param_int("tx_size", "Default maximum tx context size",
 			      &vrb_gl_data.def_tx_size) ||
 	    (vrb_gl_data.def_tx_size < 0)) {
diff --git a/prov/verbs/src/verbs_ofi.h b/prov/verbs/src/verbs_ofi.h
index 0c54f879c45..efaa6e9c130 100644
--- a/prov/verbs/src/verbs_ofi.h
+++ b/prov/verbs/src/verbs_ofi.h
@@ -161,6 +161,8 @@
 #define VERBS_ANY_DOMAIN "verbs_any_domain"
 #define VERBS_ANY_FABRIC "verbs_any_fabric"
 
+#define VERBS_TOS_UNSET (-1)
+
 #ifdef HAVE_FABRIC_PROFILE
 struct vrb_profile;
 typedef struct vrb_profile vrb_profile_t;
@@ -176,6 +178,7 @@ extern ofi_mutex_t vrb_init_mutex;
 extern struct dlist_entry verbs_devs;
 
 extern struct vrb_gl_data {
+	int tos;
 	int def_tx_size;
 	int def_rx_size;
 	int def_tx_iov_limit;
@@ -1050,6 +1053,16 @@ vrb_free_recv_wr(struct vrb_progress *progress, struct vrb_recv_wr *wr)
 	ofi_buf_free(wr);
 }
 
+static inline int vrb_rdma_set_tos(struct rdma_cm_id *id)
+{
+	if (vrb_gl_data.tos == VERBS_TOS_UNSET)
+		return 0;
+
+	uint8_t tos = vrb_gl_data.tos;
+	return rdma_set_option(id, RDMA_OPTION_ID, RDMA_OPTION_ID_TOS, &tos,
+			       sizeof(tos));
+}
+
 int vrb_ep_ops_open(struct fid *fid, const char *name, uint64_t flags,
 		    void **ops, void *context);
 
diff --git a/prov/verbs/src/windows/verbs_nd_rdma.c b/prov/verbs/src/windows/verbs_nd_rdma.c
index 3235a21d03e..b44e36f01aa 100644
--- a/prov/verbs/src/windows/verbs_nd_rdma.c
+++ b/prov/verbs/src/windows/verbs_nd_rdma.c
@@ -240,6 +240,13 @@ int rdma_destroy_id(struct rdma_cm_id *id)
 	return 0;
 }
 
+int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
+		    void *optval, size_t optlen)
+{
+	errno = ENOSYS;
+	return -1;
+}
+
 int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
 {
 	struct nd_cm_id *id_nd;