From 0f0343c4e673320bbaae57fbcf6a7a8a8af67202 Mon Sep 17 00:00:00 2001
From: Raghu Raja
Date: Tue, 20 Aug 2024 07:21:36 +0000
Subject: [PATCH] rdma: Poll the control cq if no match

If a match isn't found for the current send, poll the control cq to
see if the match can be found. While this extends the current send()
call, it potentially lowers the time until data transfer starts.

Signed-off-by: Brian Barrett
Signed-off-by: Raghu Raja
---
 src/nccl_ofi_rdma.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 5786da184..67a7a126e 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -4643,6 +4643,7 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t
 	 * props->maxRecvs > 1.
 	 */
 
+	bool polled_cq = false;
 	bool have_ctrl = false;
 	uint16_t msg_seq_num = s_comm->next_msg_seq_num;
 
@@ -4651,6 +4652,7 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t
 	nccl_ofi_msgbuff_status_t msg_stat;
 	nccl_ofi_msgbuff_result_t mb_res;
 
+retry:
 	/* Retrive entry from message buffer for msg_seq_num index */
 	mb_res = nccl_ofi_msgbuff_retrieve(s_comm->msgbuff, msg_seq_num,
 					   &elem, &type, &msg_stat);
@@ -4687,6 +4689,17 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t
 		goto error;
 	}
 
+	/* look for control messages and then retry the message search
+	   to avoid unnecessary polling / queueing. */
+	if (OFI_UNLIKELY(!polled_cq && !have_ctrl)) {
+		ret = ofi_process_cq_rail(ep, &ep->control_rail);
+		if (ret != 0) {
+			goto error;
+		}
+		polled_cq = true;
+		goto retry;
+	}
+
 	/* Determine if this should be sent eagerly. */
 	bool eager = false;
 	if ((!have_ctrl && size <= eager_max_size) ||