From 99925f1c0028d9086731d92cb02e1b4c8bfc42f6 Mon Sep 17 00:00:00 2001 From: Thomas Barrett Date: Sun, 28 Jan 2024 00:25:30 +0000 Subject: [PATCH] Use IB_MERGE_VFS argument when detecting PCI path When running in a cloud-hypervisor guest, IB VFs are exposed as a Root Complex Integrated Endpoint (RCiEP). If the IB VFs are merged, NCCL does not correctly detect the PCI topology. --- include/p2p_plugin.h | 2 ++ src/p2p_plugin.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/p2p_plugin.h b/include/p2p_plugin.h index 80bc118d..1ac916d6 100644 --- a/include/p2p_plugin.h +++ b/include/p2p_plugin.h @@ -123,6 +123,8 @@ int nccl_p2p_ib_speed(int speed); int64_t ncclParamSharpMaxComms(); +int64_t ncclParamIbMergeVfs(); + int ncclIbRelaxedOrderingCapable(void); nccl_p2p_plugin_t nccl_p2p_get_plugin_type(); diff --git a/src/p2p_plugin.c b/src/p2p_plugin.c index a638f66d..8a257b52 100644 --- a/src/p2p_plugin.c +++ b/src/p2p_plugin.c @@ -385,7 +385,7 @@ ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_n // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; // Also merge virtual functions (VF) into the same device - p[strlen(p)-3] = '0'; + if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // And keep the real port aside (the ibv port is always 1 on recent cards) *real_port = 0; for (int d=0; d