From 1026010b6f59fb7c5b73a188588e90a2bd030b6d Mon Sep 17 00:00:00 2001
From: Raghu Raja
Date: Tue, 20 Aug 2024 07:21:36 +0000
Subject: [PATCH] rdma: Poll the control cq if no match

If a match isn't found for the current send, poll the control cq to see
if the match can be found. While this extends the current send() call,
it potentially lowers the time until data transfer starts.

Signed-off-by: Brian Barrett
Signed-off-by: Raghu Raja
---
 src/nccl_ofi_rdma.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 42b7d575b..8d00f8bc6 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -4641,6 +4641,7 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t
 	 * props->maxRecvs > 1.
 	 */
 
+	bool polled_cq = false;
 	bool have_ctrl = false;
 	uint16_t msg_seq_num = s_comm->next_msg_seq_num;
 
@@ -4649,6 +4650,7 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t
 	nccl_ofi_msgbuff_status_t msg_stat;
 	nccl_ofi_msgbuff_result_t mb_res;
 
+retry:
 	/* Retrive entry from message buffer for msg_seq_num index */
 	mb_res = nccl_ofi_msgbuff_retrieve(s_comm->msgbuff, msg_seq_num, &elem,
 					   &type, &msg_stat);
@@ -4685,6 +4687,14 @@ static int send(nccl_net_ofi_send_comm_t *send_comm, void *data, int size, int t
 		goto error;
 	}
 
+	/* look for control messages and then retry the message search
+	   to avoid unnecessary polling / queueing. */
+	if (OFI_UNLIKELY(!polled_cq && !have_ctrl)) {
+		ofi_process_cq_rail(ep, &ep->control_rail);
+		polled_cq = true;
+		goto retry;
+	}
+
 	/* Determine if this should be sent eagerly. */
 	bool eager = false;
 	if ((!have_ctrl && size <= eager_max_size) ||