From 6aa6708f99234ddef233182d656ec25eb1c5159b Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Fri, 24 May 2024 16:21:54 -0700 Subject: [PATCH] prov/efa: Refactor dmabuf reg Introduce a boolean dmabuf_supported in efa_hmem_info, check if dmabuf is supported for different hmem ifaces. When dmabuf is supported, retrieve the dmabuf fd and use ibv_reg_dmabuf_mr to register memory. Otherwise, fall back to ibv_reg_mr. Always use ibv_reg_dmabuf_mr when FI_MR_DMABUF is set. Remove macros in efa_mr_reg_ibv_mr and combine duplicate logic of different hmem ifaces. Signed-off-by: Jessie Yang --- prov/efa/src/efa_hmem.c | 43 +++++++++-------- prov/efa/src/efa_hmem.h | 1 + prov/efa/src/efa_mr.c | 100 +++++++++++----------------------------- prov/efa/src/efa_mr.h | 28 +++++++++++ 4 files changed, 81 insertions(+), 91 deletions(-) diff --git a/prov/efa/src/efa_hmem.c b/prov/efa/src/efa_hmem.c index 666ed6f0305..96868c42faf 100644 --- a/prov/efa/src/efa_hmem.c +++ b/prov/efa/src/efa_hmem.c @@ -97,6 +97,8 @@ static int efa_domain_hmem_info_init_system(struct efa_domain *efa_domain) info->p2p_disabled_by_user = false; info->p2p_required_by_impl = false; info->p2p_supported_by_device = true; + info->dmabuf_supported = false; + efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYSTEM); return 0; } @@ -137,6 +139,7 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain) info->initialized = true; info->p2p_disabled_by_user = false; + info->dmabuf_supported = false; /* If user is using libfabric API 1.18 or later, by default EFA provider is permitted to * use CUDA library to support CUDA memory, therefore p2p is not required. 
@@ -146,26 +149,24 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain) else info->p2p_required_by_impl = true; -#if HAVE_EFA_DMABUF_MR - ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset); + ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_CUDA, ptr, len, &dmabuf_fd, &dmabuf_offset); if (ret == FI_SUCCESS) { - ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset, + ibv_mr = efa_mr_reg_ibv_dmabuf_mr(efa_domain->ibv_pd, dmabuf_offset, len, (uint64_t)ptr, dmabuf_fd, ibv_access); if (!ibv_mr) { EFA_INFO(FI_LOG_DOMAIN, "Unable to register CUDA device buffer via dmabuf: %s. " "Fall back to ibv_reg_mr\n", fi_strerror(-errno)); - ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); + ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access); + } else { + info->dmabuf_supported = true; } } else { EFA_INFO(FI_LOG_DOMAIN, "Unable to retrieve dmabuf fd of CUDA device buffer: %d. " "Fall back to ibv_reg_mr\n", ret); - ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); + ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access); } -#else - ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); -#endif if (!ibv_mr) { info->p2p_supported_by_device = false; @@ -247,22 +248,27 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain) info->p2p_disabled_by_user = false; /* Neuron currently requires P2P */ info->p2p_required_by_impl = true; + info->dmabuf_supported = false; -#if HAVE_EFA_DMABUF_MR - ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset); + ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_NEURON, ptr, (uint64_t)len, &dmabuf_fd, &offset); if (ret == FI_SUCCESS) { - ibv_mr = ibv_reg_dmabuf_mr( - g_device_list[0].ibv_pd, offset, + ibv_mr = efa_mr_reg_ibv_dmabuf_mr( + efa_domain->ibv_pd, offset, len, (uint64_t)ptr, dmabuf_fd, ibv_access); - } else if (ret == -FI_ENOPROTOOPT) { - EFA_INFO(FI_LOG_MR, + if (!ibv_mr) { + EFA_INFO(FI_LOG_DOMAIN, + "Unable to register 
Neuron device buffer via dmabuf: %s. " + "Fall back to ibv_reg_mr\n", fi_strerror(-errno)); + ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access); + } else { + info->dmabuf_supported = true; + } + } else { + EFA_INFO(FI_LOG_DOMAIN, "Unable to retrieve dmabuf fd of Neuron device buffer, " "Fall back to ibv_reg_mr\n"); - ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); + ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access); } -#else - ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access); -#endif if (!ibv_mr) { info->p2p_supported_by_device = false; @@ -325,6 +331,7 @@ static int efa_domain_hmem_info_init_synapseai(struct efa_domain *efa_domain) /* SynapseAI currently requires P2P */ info->p2p_required_by_impl = true; info->p2p_supported_by_device = true; + info->dmabuf_supported = true; efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYNAPSEAI); /* Only the long read protocol is supported */ diff --git a/prov/efa/src/efa_hmem.h b/prov/efa/src/efa_hmem.h index db376c1a2b4..0804ea5ddac 100644 --- a/prov/efa/src/efa_hmem.h +++ b/prov/efa/src/efa_hmem.h @@ -26,6 +26,7 @@ struct efa_hmem_info { bool p2p_disabled_by_user; /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */ bool p2p_required_by_impl; /* Is p2p required for this interface? 
*/ bool p2p_supported_by_device; /* do we support p2p with this device */ + bool dmabuf_supported; size_t max_intra_eager_size; /* Maximum message size to use eager protocol for intra-node */ size_t max_medium_msg_size; diff --git a/prov/efa/src/efa_mr.c b/prov/efa/src/efa_mr.c index 161707a7dab..e8a247aa449 100644 --- a/prov/efa/src/efa_mr.c +++ b/prov/efa/src/efa_mr.c @@ -475,30 +475,6 @@ struct fi_ops efa_mr_ops = { .ops_open = fi_no_ops_open, }; -#if HAVE_EFA_DMABUF_MR - -static inline -struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, - size_t len, uint64_t iova, int fd, int access) -{ - return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access); -} - -#else - -static inline -struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, - size_t len, uint64_t iova, int fd, int access) -{ - EFA_WARN(FI_LOG_MR, - "ibv_reg_dmabuf_mr is required for memory" - " registration with FI_MR_DMABUF flags, but " - " not available in the current rdma-core library." - " please build libfabric with rdma-core >= 34.0\n"); - return NULL; -} - -#endif /** * @brief Register a memory buffer with rdma-core api. * @@ -511,7 +487,20 @@ struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr, int access, const uint64_t flags) { - if (flags & FI_MR_DMABUF) + int dmabuf_fd; + uint64_t offset; + int ret; + + assert(efa_mr->domain->hmem_info[mr_attr->iface].p2p_supported_by_device); + + if (flags & FI_MR_DMABUF) { + if (OFI_UNLIKELY(!efa_mr->domain->hmem_info[mr_attr->iface].dmabuf_supported)) { + EFA_WARN(FI_LOG_MR, "Requested FI_MR_DMABUF, but dmabuf is not supported.\n"); + return NULL; + } + + EFA_INFO(FI_LOG_MR, "FI_MR_DMABUF is set. 
Registering dmabuf mr with fd: %d, offset: %lu, len: %zu\n", + mr_attr->dmabuf->fd, mr_attr->dmabuf->offset, mr_attr->dmabuf->len); return efa_mr_reg_ibv_dmabuf_mr( efa_mr->domain->ibv_pd, mr_attr->dmabuf->offset, @@ -520,64 +509,29 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr mr_attr->dmabuf->fd, access ); + } - /* - * TODO: remove the synapseai and neuron blocks by onboarding the - * ofi_hmem_get_dmabuf_fd API. - */ -#if HAVE_SYNAPSEAI - if (efa_mr_is_synapseai(efa_mr)) { - int dmabuf_fd; - uint64_t offset; - int ret; - - ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base, - (uint64_t) mr_attr->mr_iov->iov_len, - &dmabuf_fd, &offset); + if (efa_mr->domain->hmem_info[mr_attr->iface].dmabuf_supported) { + ret = ofi_hmem_get_dmabuf_fd( + mr_attr->iface, + mr_attr->mr_iov->iov_base, + (uint64_t) mr_attr->mr_iov->iov_len, + &dmabuf_fd, &offset); if (ret != FI_SUCCESS) { - EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n"); + EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for device buffer. 
errno: %d, err_msg: %s\n", + ret, fi_strerror(-ret)); return NULL; } + EFA_INFO(FI_LOG_MR, "Registering dmabuf mr with fd: %d, offset: %lu, len: %zu\n", + dmabuf_fd, offset, mr_attr->mr_iov->iov_len); return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset, mr_attr->mr_iov->iov_len, (uint64_t)mr_attr->mr_iov->iov_base, dmabuf_fd, access); } -#endif - -#if HAVE_NEURON - if (efa_mr_is_neuron(efa_mr)) { - int dmabuf_fd; - uint64_t offset; - int ret; - - ret = neuron_get_dmabuf_fd( - mr_attr->mr_iov->iov_base, - mr_attr->mr_iov->iov_len, - &dmabuf_fd, - &offset); - - if (ret == FI_SUCCESS) { - /* Success => invoke ibv_reg_dmabuf_mr */ - return efa_mr_reg_ibv_dmabuf_mr( - efa_mr->domain->ibv_pd, 0, - mr_attr->mr_iov->iov_len, - (uint64_t)mr_attr->mr_iov->iov_base, - dmabuf_fd, access); - } else if (ret == -FI_ENOPROTOOPT) { - /* Protocol not availabe => fallback */ - EFA_INFO(FI_LOG_MR, - "Unable to get dmabuf fd for Neuron device buffer, " - "Fall back to ibv_reg_mr\n"); - return ibv_reg_mr( - efa_mr->domain->ibv_pd, - (void *)mr_attr->mr_iov->iov_base, - mr_attr->mr_iov->iov_len, access); - } - return NULL; - } -#endif + EFA_INFO(FI_LOG_MR, "Dmabuf is not supported. 
Registering memory via ibv_reg_mr with addr: %lu, len: %zu\n", + (uint64_t)mr_attr->mr_iov->iov_base, mr_attr->mr_iov->iov_len); return ibv_reg_mr(efa_mr->domain->ibv_pd, (void *)mr_attr->mr_iov->iov_base, mr_attr->mr_iov->iov_len, access); diff --git a/prov/efa/src/efa_mr.h b/prov/efa/src/efa_mr.h index e4c0e2ca143..72c65aff620 100644 --- a/prov/efa/src/efa_mr.h +++ b/prov/efa/src/efa_mr.h @@ -6,6 +6,9 @@ #include #include +#include + +#include "efa_prov.h" /* * Descriptor returned for FI_HMEM peer memory registrations */ @@ -35,6 +38,31 @@ struct efa_mr { bool needs_sync; }; +#if HAVE_EFA_DMABUF_MR + +static inline +struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, + size_t len, uint64_t iova, int fd, int access) +{ + return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access); +} + +#else + +static inline +struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, + size_t len, uint64_t iova, int fd, int access) +{ + EFA_WARN(FI_LOG_MR, + "ibv_reg_dmabuf_mr is required for memory" + " registration with FI_MR_DMABUF flags, but " + "not available in the current rdma-core library." + " Please build libfabric with rdma-core >= 34.0\n"); + return NULL; +} + +#endif + extern int efa_mr_cache_enable; extern size_t efa_mr_max_cached_count; extern size_t efa_mr_max_cached_size;