forked from lloda/ra-ra
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbench-stencil1.C
156 lines (136 loc) · 4.41 KB
/
bench-stencil1.C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// (c) Daniel Llorens - 2016-2017
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3 of the License, or (at your option) any
// later version.
/// @file bench-stencil1.C
/// @brief Stencil-as-view.
// TODO Bad performance, see also bench-stencil[23].C.
#include <iostream>
#include <iomanip>
#include <random>
#include "ra/operators.H"
#include "ra/io.H"
#include "ra/test.H"
#include "ra/bench.H"
using std::cout, std::endl, std::flush;
using real = double;
int nx = 1000000;
int ts = 10;
auto I = ra::iota(nx-2, 1);
constexpr ra::Small<real, 3> mask = { 1, -2, 1 };
#define THEOP template <class A_, class Anext_, class Astencil_> __attribute__((noinline)) \
auto operator()(A_ & A, Anext_ & Anext, Astencil_ & Astencil)
// sensitive to RA_CHECK_BOUNDS.
struct f_raw
{
THEOP
{
for (int i=1; i+1<nx; ++i) {
Anext(i) = -2*A(i) + A(i+1) + A(i-1);
}
std::swap(A.p, Anext.p);
};
};
// about as fast as f_raw, but no stencil. Insensitive to RA_CHECK_BOUNDS.
struct f_slices
{
THEOP
{
Anext(I) = -2*A(I) + A(I+1) + A(I-1);
std::swap(A.p, Anext.p);
};
};
// with stencil, about as fast as f_raw. Sensitive to RA_CHECK_BOUNDS.
struct f_stencil_explicit
{
THEOP
{
Astencil.p = A.data();
Anext(I) = map([](auto && A) { return -2*A(1) + A(2) + A(0); },
iter<1>(Astencil));
std::swap(A.p, Anext.p);
};
};
// sum() inside uses run time sizes and 1-dim ply_ravel loop which is a bit slower. TODO
struct f_stencil_arrayop
{
THEOP
{
Astencil.p = A.data();
Anext(I) = map([](auto && s) { return sum(s*mask); }, iter<1>(Astencil));
std::swap(A.p, Anext.p);
};
};
// allows traversal order to be chosen between all 6 axes in ply_ravel. 30x slower. TODO
struct f_sumprod
{
THEOP
{
Astencil.p = A.data();
Anext(I) = 0; // TODO miss notation for sum-of-axes without preparing destination...
Anext(I) += map(ra::wrank<1, 1>(ra::times()), Astencil, mask);
std::swap(A.p, Anext.p);
};
};
// variant of the above, much faster (TODO).
struct f_sumprod2
{
THEOP
{
Astencil.p = A.data();
Anext(I) = 0;
plyf(map(ra::wrank<0, 1, 1>([](auto && A, auto && B, auto && C) { A += B*C; }), Anext(I), Astencil, mask));
std::swap(A.p, Anext.p);
};
};
int main()
{
TestRecorder tr(std::cout);
std::random_device rand;
real value = rand();
auto bench = [&](auto & A, auto & Anext, auto & Astencil, auto && ref, auto && tag, auto && f)
{
auto bv = Benchmark().repeats(ts).runs(3)
.once_f([&](auto && repeat)
{
Anext = 0.;
A = value;
repeat([&]() { f(A, Anext, Astencil); });
});
tr.info(std::setw(5), std::fixed, Benchmark::avg(bv)/A.size()/1e-9, " ns [",
Benchmark::stddev(bv)/A.size()/1e-9 ,"] ", tag)
.test_rel_error(ref, A, 1e-11);
};
ra::Big<real, 1> Aref;
tr.section("static rank");
{
ra::Big<real, 1> A({nx}, 1.);
ra::Big<real, 1> Anext({nx}, 0.);
auto Astencil = stencil(A, 1, 1);
cout << "Astencil " << format_array(Astencil(0, ra::dots<1>), "|", " ") << endl;
#define BENCH(ref, op) bench(A, Anext, Astencil, ref, STRINGIZE(op), op {});
BENCH(A, f_raw);
Aref = ra::Big<real, 1>(A);
BENCH(Aref, f_slices);
BENCH(Aref, f_stencil_explicit);
BENCH(Aref, f_stencil_arrayop);
BENCH(Aref, f_sumprod);
BENCH(Aref, f_sumprod2);
#undef BENCH
}
tr.section("dynamic rank");
{
ra::Big<real> B({nx}, 1.);
ra::Big<real> Bnext({nx}, 0.);
auto Bstencil = stencil(B, 1, 1);
cout << "Bstencil " << format_array(Bstencil(0, ra::dots<1>), "|", " ") << endl;
#define BENCH(ref, op) bench(B, Bnext, Bstencil, ref, STRINGIZE(op), op {});
// BENCH(Aref, f_raw); // TODO very slow
BENCH(Aref, f_slices);
BENCH(Aref, f_stencil_explicit);
BENCH(Aref, f_stencil_arrayop);
#undef BENCH
}
return tr.summary();
}