-
Notifications
You must be signed in to change notification settings - Fork 1
/
comm-nccl.cu
142 lines (122 loc) · 5.02 KB
/
comm-nccl.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <comm-nccl.hpp>
#include <algorithm>
#include <numeric>
template<typename T> inline ncclDataType_t get_nccl_datatype() {
if (typeid(T) == typeid(long long))
return ncclInt64;
if (typeid(T) == typeid(double))
return ncclDouble;
if (typeid(T) == typeid(std::complex<double>))
return ncclDouble;
return ncclChar;
}
template<typename T> inline int get_data_multiplier() {
if (typeid(T) == typeid(long long))
return 1;
if (typeid(T) == typeid(double))
return 1;
if (typeid(T) == typeid(std::complex<double>))
return 2;
return sizeof(T);
}
template<typename T> inline void ColCommNCCL::level_merge(T* data, long long len, cudaStream_t stream) const {
if (MergeNCCL != nullptr) {
long long mlen = len * get_data_multiplier<T>();
ncclReduce((const void*)data, data, mlen, get_nccl_datatype<T>(), ncclSum, 0, MergeNCCL, stream);
}
}
template<typename T> inline void ColCommNCCL::level_sum(T* data, long long len, cudaStream_t stream) const {
long long mlen = len * get_data_multiplier<T>();
if (AllReduceNCCL != nullptr)
ncclAllReduce((const void*)data, data, mlen, get_nccl_datatype<T>(), ncclSum, AllReduceNCCL, stream);
if (DupNCCL != nullptr)
ncclBroadcast((const void*)data, data, mlen, get_nccl_datatype<T>(), 0, DupNCCL, stream);
}
template<typename T> inline void ColCommNCCL::neighbor_bcast(T* data, const long long box_dims[], cudaStream_t stream) const {
std::vector<long long> offsets(Boxes.size() + 1, 0);
for (long long p = 0; p < (long long)Boxes.size(); p++) {
long long end = Boxes[p].second;
offsets[p + 1] = std::reduce(box_dims, &box_dims[end], offsets[p]);
box_dims = &box_dims[end];
}
for (long long p = 0; p < (long long)NeighborNCCL.size(); p++) {
long long llen = (offsets[p + 1] - offsets[p]) * get_data_multiplier<T>();
void* loc = &data[offsets[p]];
ncclBroadcast((const void*)loc, loc, llen, get_nccl_datatype<T>(), NeighborComm[p].first, NeighborNCCL[p], stream);
}
if (DupNCCL != nullptr) {
long long mlen = offsets.back() * get_data_multiplier<T>();
ncclBroadcast((const void*)data, data, mlen, get_nccl_datatype<T>(), 0, DupNCCL, stream);
}
}
template<typename T> inline void ColCommNCCL::neighbor_reduce(T* data, const long long box_dims[], cudaStream_t stream) const {
std::vector<long long> offsets(Boxes.size() + 1, 0);
for (long long p = 0; p < (long long)Boxes.size(); p++) {
long long end = Boxes[p].second;
offsets[p + 1] = std::reduce(box_dims, &box_dims[end], offsets[p]);
box_dims = &box_dims[end];
}
for (long long p = 0; p < (long long)NeighborNCCL.size(); p++) {
long long llen = (offsets[p + 1] - offsets[p]) * get_data_multiplier<T>();
void* loc = &data[offsets[p]];
ncclReduce((const void*)loc, loc, llen, get_nccl_datatype<T>(), ncclSum, NeighborComm[p].first, NeighborNCCL[p], stream);
}
if (DupNCCL != nullptr) {
long long mlen = offsets.back() * get_data_multiplier<T>();
ncclBroadcast((const void*)data, data, mlen, get_nccl_datatype<T>(), 0, DupNCCL, stream);
}
}
ColCommNCCL::ColCommNCCL(const std::pair<long long, long long> Tree[], std::pair<long long, long long> Mapping[], const long long Rows[], const long long Cols[], MPI_Comm world) :
ColCommMPI(Tree, Mapping, Rows, Cols, world) {
long long len = allocedComm.size();
std::vector<ncclUniqueId> ids(len);
allocedNCCL = std::vector<ncclComm_t>(len);
ncclGroupStart();
for (long long i = 0; i < len; i++) {
int rank, size;
MPI_Comm_rank(allocedComm[i], &rank);
MPI_Comm_size(allocedComm[i], &size);
if (rank == 0)
ncclGetUniqueId(&ids[i]);
MPI_Bcast((void*)&ids[i], sizeof(ncclUniqueId), MPI_BYTE, 0, allocedComm[i]);
ncclCommInitRank(&allocedNCCL[i], size, ids[i], rank);
}
ncclGroupEnd();
auto find_nccl = [&](const MPI_Comm& c) {
const std::vector<MPI_Comm>::iterator i = std::find(allocedComm.begin(), allocedComm.end(), c);
return i == allocedComm.end() ? nullptr : *(allocedNCCL.begin() + std::distance(allocedComm.begin(), i));
};
NeighborNCCL = std::vector<ncclComm_t>(NeighborComm.size(), nullptr);
MergeNCCL = find_nccl(MergeComm.second);
for (long long i = 0; i < (long long)NeighborComm.size(); i++)
NeighborNCCL[i] = find_nccl(NeighborComm[i].second);
AllReduceNCCL = find_nccl(AllReduceComm);
DupNCCL = find_nccl(DupComm);
}
void ColCommNCCL::free_all_comms() {
MergeComm = std::make_pair(0, MPI_COMM_NULL);
NeighborComm.clear();
AllReduceComm = MPI_COMM_NULL;
DupComm = MPI_COMM_NULL;
for (MPI_Comm& c : allocedComm)
MPI_Comm_free(&c);
allocedComm.clear();
MergeNCCL = nullptr;
NeighborNCCL.clear();
AllReduceNCCL = nullptr;
DupNCCL = nullptr;
for (ncclComm_t& c : allocedNCCL)
ncclCommDestroy(c);
allocedNCCL.clear();
}
int ColCommNCCL::set_device(MPI_Comm world) {
int mpi_rank;
MPI_Comm_rank(world, &mpi_rank);
int num_device;
int gpu_avail = (cudaGetDeviceCount(&num_device) == cudaSuccess);
int device = mpi_rank % num_device;
if (gpu_avail)
return cudaSetDevice(device) == cudaSuccess;
else
return 0;
}