-
Notifications
You must be signed in to change notification settings - Fork 0
/
bitonic_sort.hpp
251 lines (226 loc) · 8.48 KB
/
bitonic_sort.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/*
* Copyright 2015 Ryan H. Lewis
* This code is released under the BSD see LICENSE
*/
#ifndef BITONIC_SORT_HPP
#define BITONIC_SORT_HPP
#include <algorithm>
#include <iterator>
#include <cmath>
#include <cassert>
#include <vector>
namespace distributed{
namespace detail {
constexpr bool UP =1;
constexpr bool DOWN=0;
bool is_power_two( std::size_t x){ return !(x & x - 1) && x ; }
std::size_t power_two_below( std::size_t x){
std::size_t r=2;
while( r < x){ r*=2; } return r/2;
}
std::size_t power_two_above( std::size_t x){
return 2*power_two_below( x);
}
} //end namespace detail
namespace debug{
template< typename Communicator>
void print_partners( Communicator& world, std::size_t partner){
for( auto r = 0; r < world.size(); ++r){
if( r == world.rank()){
std::cout << world.rank() << "-" << partner << " | " << std::flush;
}
world.barrier();
}
world.barrier();
if( world.rank() == 0){ std::cout << std::endl << std::flush; }
world.barrier();
}
template< typename Communicator, typename Iterator>
void print_list( Communicator & world, Iterator begin, Iterator end){
for( auto r = 0; r < world.size(); ++r){
if( r == world.rank()){
for( auto i = begin; i != end; ++i){ std::cout << *i << " " << std::flush;}
if( r != world.size()-1){ std::cout << "\b|" << std::flush; }
}
world.barrier();
}
if( world.rank() == 0){ std::cout << std::endl << std::flush; }
world.barrier();
}
}
template< typename Communicator,
typename Vector,
typename Vector1>
void exchange_data( Communicator& world, std::size_t partner,
Vector& send_buffer, Vector1& receive_buffer){
if( partner >= world.size()){
std::cout << partner << " " << world.size() << std::endl;
}
assert( partner < world.size());
//Boost::mpi does not yet implement sendrecv
boost::mpi::request requests[ 2];
requests[ 0] = world.isend( partner, 0, send_buffer);
requests[ 1] = world.irecv( partner, 0, receive_buffer);
boost::mpi::wait_all( requests, requests+2);
}
/*
* TODO: We may save a factor of 2 when exchanging larger sets by having one processor
* send its data to the other, having that one merge, then the other processor returning
* the remaining elements. Then we only move |A|+|B| at most instead of sending |A|+|B|
* always, and assuming a \alpha+\beta{n} model, we may potentially hide the latency term \alpha
*/
template< bool direction,
typename Communicator,
typename Vector,
typename Vector1,
typename Less>
void exchange_and_merge( Communicator& world,
std::size_t partner,
Vector& our_data,
Vector1& their_data,
Less& less){
typedef typename Vector::value_type T;
exchange_data( world, partner, our_data, their_data);
std::vector< T> full_list( our_data.size() + their_data.size());
T max_min = std::max( our_data.front(), their_data.front());
T min_max = std::min( our_data.back(), their_data.back());
std::merge( our_data.begin(), our_data.end(),
their_data.begin(), their_data.end(),
full_list.begin(), less);
our_data.clear();
/* When processors do not have the same amount of data on a node the correctness
* of the overall sort depends critically on the exchange-merge.
* Suppose A and B are the sets in question, where A is held by low processor and B by high processor
* Define L_{A,B} = {x | x < max( min(A), min(B))}
* and U_{A,B} = {x | x > min(max(A),max(B)}
* We need that L goes to processor holding A and U goes to processor holding B
* Notice that |L| may be larger than |A| and similarly for |U|.
*/
//This branch is removed by the compiler
if( direction == detail::DOWN){
int j = 0;
auto i = full_list.begin();
while (((*i < max_min) || (j < full_list.size()/2)) && (*i <= min_max)){
our_data.push_back( *i);
++i; ++j;
}
}else{
int j = full_list.size()-1;
auto i = full_list.rbegin();
while( (*i >= max_min) && ((j >= full_list.size()/2) || (*i > min_max))){
our_data.push_back( *i);
++i; --j;
}
std::reverse( our_data.begin(), our_data.end());
}
their_data.clear();
}
template< typename Communicator, typename Range, typename Less>
void bitonic_sort_binary(Communicator& world, Range& our_data, Less& less){
typedef typename Range::value_type T;
std::vector< T> their_data;
for (std::size_t i = 2; i <= world.size(); i*=2){
bool ith_bit_unset = ((world.rank() & i) == 0);
bool ith_bit_set = !ith_bit_unset;
std::size_t l = log2( i);
for (std::size_t j = (1 << (l-1)); j > 0; j >>= 1) {
std::size_t partner = world.rank() ^ j;
bool jth_bit_unset = (world.rank() < partner);
bool jth_bit_set = !jth_bit_unset;
//if( world.rank() == 0){ std::cout << "l = " << l <<std::endl; }
//debug::print_partners( world, partner);
//debug::print_list( world, our_data.begin(), our_data.end());
//either we are to the left of our neighbor and ith_bit_unset
//or we are to the right of our neighbor and the ith_bit_set
if ( (jth_bit_unset && ith_bit_unset) || (ith_bit_set && jth_bit_set)){
exchange_and_merge< detail::DOWN>( world, partner,
our_data, their_data,
less);
}else{
exchange_and_merge< detail::UP>( world, partner,
our_data, their_data,
less);
}
// debug::print_list( world, our_data.begin(), our_data.end());
// if( world.rank() == 0){ std::cout << "----------------------------------"
// << std::endl << std::flush; }
// world.barrier();
} //end decreasing
} //end outer loop
}
template <typename Communicator, typename Vector, typename Less>
void bitonic_merge_increasing( Communicator& world,
Vector& our_data,
Less& less){
typedef typename Vector::value_type T;
std::size_t num_left = detail::power_two_below( world.size());
std::size_t num_right = world.size() - num_left;
std::size_t rank = world.rank();
std::vector< T> their_data;
// 1, Do merge between the k right procs and the highest k left procs.
if ( (rank < num_left) && ( rank >= (num_left - num_right) ) ){
std::size_t partner = rank + num_right;
assert( partner < world.size());
exchange_and_merge< detail::DOWN>( world, partner, our_data, their_data, less);
} else if (rank >= num_left) {
std::size_t partner = rank - num_right;
assert( partner < world.size());
exchange_and_merge< detail::UP>( world, partner, our_data, their_data, less);
}
}
template< typename Communicator, typename Range, typename Less>
void bitonic_sort(Communicator& world, Range& range, Less& less){
assert( range.size() > 0);
auto begin = std::begin( range);
auto end = std::end( range);
std::sort( begin, end, less);
if( detail::is_power_two( world.size())){
bitonic_sort_binary( world, range, less);
return;
}
std::size_t previous_power_two = detail::power_two_below( world.size());
bool in_power_of_two = (world.rank() < previous_power_two);
//we set the processors rank in the new communicator
//so that the first power of two processors keep there ranks
//the remaining ones reindex so that the new proc 0
//is the one who previously had rank world.size()-1
std::size_t key = world.rank();
if( !in_power_of_two){ key = world.size()-world.rank()-1; }
auto new_world_ = world.split( in_power_of_two, key);
//------------------------------------
//Code to see that the key mapping works
//-------------------------------------
//for( auto i = 0; i < world.size(); ++i){
//if( i == world.rank()){
//std::cout << world.rank() << " ---->> " << key << " ---->> " << new_world_.rank() << std::endl;
//}
//world.barrier();
//}
//if( world.rank() == 0){ std::cout << "------------" << std::endl; }
//world.barrier();
if( in_power_of_two){ //ranks 0...2^k --> 0 ...2^k
bitonic_sort_binary( new_world_, range, less);
}else{ //flip the ranks of these processes
//2^k+1 ... p --> 2^k .... 0
bitonic_sort( new_world_, range, less);
//this way all of this data is sorted in the reverse order!
}
//now we perform a bitonic merge
bitonic_merge_increasing( world, range, less);
//split the processes again into the same groups,
//this time do not reorder ranks.
auto new_world = world.split( in_power_of_two, world.rank());
if( in_power_of_two){
bitonic_sort_binary( new_world, range, less);
}else{
bitonic_sort( new_world, range, less);
}
}
template< typename Communicator, typename Range>
void bitonic_sort(Communicator& world, Range& range){
typedef typename Range::value_type T;
std::less< T> less;
bitonic_sort( world, range, less);
}
} //end namespace distributed
#endif //BITONIC_SORT_HPP