Skip to content

Commit baed7db

Browse files
committed
plugin update to nccl-2.20
1 parent 3ff78de commit baed7db

12 files changed

+933
-443
lines changed

configure.ac

+1-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ AC_ARG_WITH([verbs],
8080
AC_CHECK_HEADER( [infiniband/verbs.h], [],[AC_MSG_FAILURE([ibverbs header files not found])])
8181
AC_CHECK_LIB([ibverbs], [ibv_get_device_list], [],[AC_MSG_FAILURE([libibverbs not found]);])
8282

83-
AC_CHECK_DECLS([IBV_ACCESS_RELAXED_ORDERING, IBV_QPF_GRH_REQUIRED, ibv_reg_dmabuf_mr], [], [],
83+
AC_CHECK_DECLS([IBV_ACCESS_RELAXED_ORDERING, IBV_QPF_GRH_REQUIRED, ibv_reg_dmabuf_mr, ibv_query_ece, ibv_set_ece], [], [],
8484
[[#include <infiniband/verbs.h>]])
8585

8686
# check for ucx

include/ibvwrap.h

+2
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries,
5959
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
6060
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
6161
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
62+
ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
63+
ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported);
6264
ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
6365
ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr);
6466
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);

include/nccl.h

+42-5
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414
#endif
1515

1616
#define NCCL_MAJOR 2
17-
#define NCCL_MINOR 15
18-
#define NCCL_PATCH 1
17+
#define NCCL_MINOR 20
18+
#define NCCL_PATCH 3
1919
#define NCCL_SUFFIX ""
2020

21-
#define NCCL_VERSION_CODE 21510
21+
#define NCCL_VERSION_CODE 22003
2222
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
2323

2424
#ifdef __cplusplus
@@ -42,15 +42,24 @@ typedef enum { ncclSuccess = 0,
4242
ncclInProgress = 7,
4343
ncclNumResults = 8 } ncclResult_t;
4444

45+
#define NCCL_CONFIG_UNDEF_INT INT_MIN
46+
#define NCCL_CONFIG_UNDEF_PTR NULL
47+
#define NCCL_SPLIT_NOCOLOR -1
48+
4549
/* Communicator configuration. Users can assign values to attributes to specify the
4650
* behavior of a communicator. */
47-
typedef struct ncclConfig_v21400 {
51+
typedef struct ncclConfig_v21700 {
4852
/* attributes that users should never touch. */
4953
size_t size;
5054
unsigned int magic;
5155
unsigned int version;
5256
/* attributes that users are able to customize. */
5357
int blocking;
58+
int cgaClusterSize;
59+
int minCTAs;
60+
int maxCTAs;
61+
const char *netName;
62+
int splitShare;
5463
} ncclConfig_t;
5564

5665
/* Config initializer must be assigned to initialize config structure when it is created.
@@ -59,9 +68,23 @@ typedef struct ncclConfig_v21400 {
5968
sizeof(ncclConfig_t), /* size */ \
6069
0xcafebeef, /* magic */ \
6170
NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
62-
1 /* blocking */ \
71+
NCCL_CONFIG_UNDEF_INT, /* blocking */ \
72+
NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
73+
NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
74+
NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
75+
NCCL_CONFIG_UNDEF_PTR, /* netName */ \
76+
NCCL_CONFIG_UNDEF_INT /* splitShare */ \
6377
}
6478

79+
/* NCCL malloc and free function for all types of NCCL optimizations
80+
* (e.g. user buffer registration). The actual allocated size might
81+
* be larger than requested due to granularity requirement. */
82+
ncclResult_t ncclMemAlloc(void** ptr, size_t size);
83+
ncclResult_t pncclMemAlloc(void** ptr, size_t size);
84+
85+
ncclResult_t ncclMemFree(void *ptr);
86+
ncclResult_t pncclMemFree(void *ptr);
87+
6588
/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
6689
* This integer is coded with the MAJOR, MINOR and PATCH level of the
6790
* NCCL library
@@ -119,6 +142,10 @@ ncclResult_t pncclCommAbort(ncclComm_t comm);
119142
const char* ncclGetErrorString(ncclResult_t result);
120143
const char* pncclGetErrorString(ncclResult_t result);
121144

145+
/* Returns a human-readable message of the last error that occurred. */
146+
const char* ncclGetLastError(ncclComm_t comm);
147+
const char* pncclGetLastError(ncclComm_t comm);
148+
122149
/* Checks whether the comm has encountered any asynchronous errors */
123150
ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
124151
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
@@ -135,6 +162,16 @@ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
135162
ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
136163
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
137164

165+
166+
/* Register CUDA buffer for zero-copy operation */
167+
ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
168+
ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
169+
170+
/* Deregister CUDA buffer */
171+
ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
172+
ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
173+
174+
138175
/* Reduction operation selector */
139176
typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
140177
typedef enum { ncclSum = 0,

include/p2p_plugin.h

+34-22
Original file line numberDiff line numberDiff line change
@@ -44,18 +44,27 @@ struct ncclIbMrCache {
4444
int capacity, population;
4545
};
4646

47+
#define NCCL_IB_MAX_DEVS_PER_NIC 2
48+
/* Buffer size for a merged device name: one MAXNAMESIZE slot per merged device
 * plus one extra byte per device for the '+' separators. The whole expansion is
 * parenthesized so the macro is safe inside larger expressions. */
#define MAX_MERGED_DEV_NAME ((MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC)
49+
struct ncclIbMergedDev {
50+
int ndevs;
51+
int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs
52+
int speed;
53+
char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+'
54+
} __attribute__((aligned(64)));
55+
4756
struct ncclIbRequest {
48-
struct ncclIbVerbs* verbs;
57+
struct ncclIbNetCommBase* base;
4958
int type;
50-
int events;
5159
struct ncclSocket* sock;
52-
struct ncclIbGidInfo* gidInfo;
60+
int events[NCCL_IB_MAX_DEVS_PER_NIC];
61+
struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC];
5362
int nreqs;
5463
union {
5564
struct {
5665
int size;
5766
void* data;
58-
uint32_t lkey;
67+
uint32_t lkeys[NCCL_IB_MAX_DEVS_PER_NIC];
5968
int offset;
6069
} send;
6170
struct {
@@ -64,56 +73,57 @@ struct ncclIbRequest {
6473
};
6574
};
6675

67-
struct ncclIbVerbs {
68-
int dev;
69-
struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd
76+
// Retain local RoCE address for error logging
77+
struct ncclIbGidInfo {
78+
uint8_t link_layer;
79+
union ibv_gid localGid;
80+
};
81+
82+
typedef struct ncclIbNetCommDevBase {
83+
int ibDevN;
84+
struct ibv_pd* pd;
7085
struct ibv_cq* cq;
7186
uint64_t pad[1];
72-
struct ncclIbRequest reqs[MAX_REQUESTS];
73-
};
87+
struct ncclIbGidInfo gidInfo;
88+
} ncclIbNetCommDevBase;
7489

7590
typedef struct ncclIbDev {
7691
pthread_mutex_t lock;
7792
int device;
7893
uint64_t guid;
79-
uint8_t port;
94+
uint8_t portNum;
8095
uint8_t link;
8196
uint8_t isSharpDev;
8297
int speed;
8398
struct ibv_context* context;
8499
int pdRefs;
85100
struct ibv_pd* pd;
86-
struct ncclIbVerbs verbs;
87101
char devName[MAXNAMESIZE];
88102
char *pciPath;
89103
int realPort;
90104
int maxQp;
91105
struct ncclIbMrCache mrCache;
92106
int ar; // ADAPTIVE_ROUTING
93-
} __attribute__((aligned(64))) nccl_ib_dev_t;
107+
struct ibv_port_attr portAttr;
108+
} __attribute__((aligned(64))) ncclIbDev;
94109

95-
#define MAX_IB_PORT 15
96-
struct userIbDev {
97-
char devName[MAXNAMESIZE];
98-
uint16_t port_en;
99-
};
100110

101111
#define MAX_IB_DEVS 32
112+
/* Table of merged (multi-port / multi-NIC) devices; each entry's devs[] holds
 * indices into ncclIbDevs.
 * NOTE(review): unlike ncclIbDevs just below, this is not declared `extern`, so
 * every translation unit that includes this header emits its own tentative
 * definition. That only links when the toolchain merges common symbols; with
 * -fno-common (the GCC >= 10 default) it is a duplicate-symbol error. Consider
 * making this `extern` and defining the array once in a .c file -- confirm
 * which source file should own the definition. */
struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS];
102113
extern struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
103-
extern struct ncclIbDev userIbDevs[MAX_IB_DEVS];
104114
/* Detect whether GDR can work on a given NIC with the current CUDA device
105115
* Returns :
106116
* ncclSuccess : GDR works
107117
* ncclSystemError : no module or module loaded but not supported by GPU */
108-
ncclResult_t nccl_p2p_gdr_support(int dev);
118+
/* Takes no arguments. Spelled (void) so this is a real prototype: stale calls
 * that still pass a device index are rejected at compile time. */
ncclResult_t nccl_p2p_gdr_support(void);
109119

110120
ncclResult_t nccl_p2p_dmabuf_support(int dev);
111121

112-
ncclResult_t nccl_p2p_ib_pci_path(nccl_ib_dev_t *devs, int num_devs, char* dev_name, char** path, int* real_port);
122+
ncclResult_t nccl_p2p_ib_pci_path(ncclIbDev *devs, int num_devs, char* dev_name, char** path, int* real_port);
113123

114-
ncclResult_t nccl_p2p_ib_get_properties(nccl_ib_dev_t *devs, int dev, ncclNetProperties_t* props);
124+
ncclResult_t nccl_p2p_ib_get_properties(ncclIbDev *devs, int dev, ncclNetProperties_t* props);
115125

116-
ncclResult_t nccl_p2p_ib_init(int *num_devs, nccl_ib_dev_t *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction);
126+
ncclResult_t nccl_p2p_ib_init(int *num_devs, ncclIbDev *ncclIbDevs, char *ncclIbIfName, union ncclSocketAddress *ncclIbIfAddr, pthread_t *ncclIbAsyncThread, ncclDebugLogger_t logFunction);
117127

118128
/* Convert value returned by ibv_query_port to actual link width */
119129
int nccl_p2p_ib_width(int width);
@@ -125,6 +135,8 @@ int64_t ncclParamSharpMaxComms();
125135

126136
int64_t ncclParamIbMergeVfs();
127137

138+
int64_t ncclParamIbMergeNics();
139+
128140
int ncclIbRelaxedOrderingCapable(void);
129141

130142
nccl_p2p_plugin_t nccl_p2p_get_plugin_type();

include/param.h

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
const char* userHomeDir();
1313
void setEnvFile(const char* fileName);
1414
void initEnv();
15+
const char *ncclGetEnv(const char *name);
1516

1617
void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
1718

0 commit comments

Comments
 (0)