Skip to content

Commit 07042d4

Browse files
committed
prov/efa: Refactor dmabuf reg
Introduce a boolean is_dmabuf_supported in efa_hmem_info and check whether dmabuf is supported for each hmem iface. When dmabuf is supported, retrieve the dmabuf fd and offset and use ibv_reg_dmabuf_mr to register memory; otherwise, fall back to ibv_reg_mr. Signed-off-by: Jessie Yang <[email protected]>
1 parent f58d5f9 commit 07042d4

File tree

3 files changed

+29
-49
lines changed

3 files changed

+29
-49
lines changed

prov/efa/src/efa_hmem.c

+8
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ static int efa_domain_hmem_info_init_system(struct efa_domain *efa_domain)
9797
info->p2p_disabled_by_user = false;
9898
info->p2p_required_by_impl = false;
9999
info->p2p_supported_by_device = true;
100+
info->is_dmabuf_supported = false;
101+
100102
efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYSTEM);
101103
return 0;
102104
}
@@ -137,6 +139,7 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)
137139

138140
info->initialized = true;
139141
info->p2p_disabled_by_user = false;
142+
info->is_dmabuf_supported = false;
140143

141144
/* If user is using libfabric API 1.18 or later, by default EFA provider is permitted to
142145
* use CUDA library to support CUDA memory, therefore p2p is not required.
@@ -156,6 +159,8 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)
156159
"Unable to register CUDA device buffer via dmabuf: %s. "
157160
"Fall back to ibv_reg_mr\n", fi_strerror(-errno));
158161
ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
162+
} else {
163+
info->is_dmabuf_supported = true;
159164
}
160165
} else {
161166
EFA_INFO(FI_LOG_DOMAIN,
@@ -247,13 +252,15 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
247252
info->p2p_disabled_by_user = false;
248253
/* Neuron currently requires P2P */
249254
info->p2p_required_by_impl = true;
255+
info->is_dmabuf_supported = false;
250256

251257
#if HAVE_EFA_DMABUF_MR
252258
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
253259
if (ret == FI_SUCCESS) {
254260
ibv_mr = ibv_reg_dmabuf_mr(
255261
g_device_list[0].ibv_pd, offset,
256262
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
263+
info->is_dmabuf_supported = true;
257264
} else if (ret == -FI_ENOPROTOOPT) {
258265
EFA_INFO(FI_LOG_MR,
259266
"Unable to retrieve dmabuf fd of Neuron device buffer, "
@@ -325,6 +332,7 @@ static int efa_domain_hmem_info_init_synapseai(struct efa_domain *efa_domain)
325332
/* SynapseAI currently requires P2P */
326333
info->p2p_required_by_impl = true;
327334
info->p2p_supported_by_device = true;
335+
info->is_dmabuf_supported = true;
328336
efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYNAPSEAI);
329337

330338
/* Only the long read protocol is supported */

prov/efa/src/efa_hmem.h

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct efa_hmem_info {
2626
bool p2p_disabled_by_user; /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */
2727
bool p2p_required_by_impl; /* Is p2p required for this interface? */
2828
bool p2p_supported_by_device; /* do we support p2p with this device */
29+
bool is_dmabuf_supported;
2930

3031
size_t max_intra_eager_size; /* Maximum message size to use eager protocol for intra-node */
3132
size_t max_medium_msg_size;

prov/efa/src/efa_mr.c

+20-49
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,10 @@ struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
511511
static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr,
512512
int access, const uint64_t flags)
513513
{
514-
if (flags & FI_MR_DMABUF)
514+
if (flags & FI_MR_DMABUF) {
515+
assert(efa_mr->domain->hmem_info[mr_attr->iface].is_dmabuf_supported);
516+
EFA_INFO(FI_LOG_MR, "FI_MR_DMABUF is set. Registering dmabuf mr with fd %d, offset %lu, len: %zu\n",
517+
mr_attr->dmabuf->fd, mr_attr->dmabuf->offset, mr_attr->dmabuf->len);
515518
return efa_mr_reg_ibv_dmabuf_mr(
516519
efa_mr->domain->ibv_pd,
517520
mr_attr->dmabuf->offset,
@@ -520,64 +523,32 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr
520523
mr_attr->dmabuf->fd,
521524
access
522525
);
526+
}
523527

524-
/*
525-
* TODO: remove the synapseai and neuron blocks by onboarding the
526-
* ofi_hmem_get_dmabuf_fd API.
527-
*/
528-
#if HAVE_SYNAPSEAI
529-
if (efa_mr_is_synapseai(efa_mr)) {
530-
int dmabuf_fd;
531-
uint64_t offset;
532-
int ret;
533-
534-
ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base,
535-
(uint64_t) mr_attr->mr_iov->iov_len,
536-
&dmabuf_fd, &offset);
528+
int dmabuf_fd;
529+
uint64_t offset;
530+
int ret;
531+
532+
if (efa_mr->domain->hmem_info[mr_attr->iface].is_dmabuf_supported) {
533+
ret = ofi_hmem_get_dmabuf_fd(
534+
mr_attr->iface,
535+
mr_attr->mr_iov->iov_base,
536+
(uint64_t) mr_attr->mr_iov->iov_len,
537+
&dmabuf_fd, &offset);
537538
if (ret != FI_SUCCESS) {
538-
EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n");
539+
EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for device buffer \n");
539540
return NULL;
540541
}
542+
EFA_INFO(FI_LOG_MR, "Registering dmabuf mr with fd %d, offset: %lu, len: %zu, \n",
543+
dmabuf_fd, offset, mr_attr->mr_iov->iov_len);
541544
return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset,
542545
mr_attr->mr_iov->iov_len,
543546
(uint64_t)mr_attr->mr_iov->iov_base,
544547
dmabuf_fd, access);
545548
}
546-
#endif
547-
548-
#if HAVE_NEURON
549-
if (efa_mr_is_neuron(efa_mr)) {
550-
int dmabuf_fd;
551-
uint64_t offset;
552-
int ret;
553-
554-
ret = neuron_get_dmabuf_fd(
555-
mr_attr->mr_iov->iov_base,
556-
mr_attr->mr_iov->iov_len,
557-
&dmabuf_fd,
558-
&offset);
559-
560-
if (ret == FI_SUCCESS) {
561-
/* Success => invoke ibv_reg_dmabuf_mr */
562-
return efa_mr_reg_ibv_dmabuf_mr(
563-
efa_mr->domain->ibv_pd, 0,
564-
mr_attr->mr_iov->iov_len,
565-
(uint64_t)mr_attr->mr_iov->iov_base,
566-
dmabuf_fd, access);
567-
} else if (ret == -FI_ENOPROTOOPT) {
568-
/* Protocol not availabe => fallback */
569-
EFA_INFO(FI_LOG_MR,
570-
"Unable to get dmabuf fd for Neuron device buffer, "
571-
"Fall back to ibv_reg_mr\n");
572-
return ibv_reg_mr(
573-
efa_mr->domain->ibv_pd,
574-
(void *)mr_attr->mr_iov->iov_base,
575-
mr_attr->mr_iov->iov_len, access);
576-
}
577-
return NULL;
578-
}
579-
#endif
580549

550+
assert(efa_mr->domain->hmem_info[mr_attr->iface].p2p_supported_by_device);
551+
EFA_INFO(FI_LOG_MR, "Dmabuf is not supported. Registering memory via ibv_reg_mr \n");
581552
return ibv_reg_mr(efa_mr->domain->ibv_pd,
582553
(void *)mr_attr->mr_iov->iov_base,
583554
mr_attr->mr_iov->iov_len, access);

0 commit comments

Comments
 (0)