-
Notifications
You must be signed in to change notification settings - Fork 76
/
Copy pathmain.cpp
300 lines (257 loc) · 8.86 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/*
* Simple test program for evaluating the precision and performance of the
* equal-area mapping, using all three versions of the transform.
*
* Written by Petrik Clarberg <[email protected]>, Lund University, 2007-2008.
* This code is released as public domain for use free of charge for any
* purpose, but without any kind of warranty.
*/
#include <iostream>
#include <iomanip>
#include <sstream>
#include "mapping.h"
#include "mappingfast.h"
#include "mappingsse.h"
using namespace std;
using namespace mapping;
/// Returns a random number in [0,1).
float uniform() { return (float)std::rand() / (1.f+RAND_MAX); }
// Timer functions (tic/toc, returns time in seconds)
// Don't forget to defined MACOSX if you compile on Mac
#ifdef MACOSX
#include <CoreServices/CoreServices.h>
AbsoluteTime _start;
void tic() { _start = UpTime(); }
double toc() {
AbsoluteTime now = UpTime();
AbsoluteTime absTime = SubAbsoluteFromAbsolute(now, _start);
Nanoseconds ns = AbsoluteToNanoseconds(absTime);
return (double)UnsignedWideToUInt64(ns) / 1.0e9;
}
#else // Fallback for non-Mac systems (poor precision)
#include <ctime>
std::clock_t _start;
void tic() { _start = std::clock(); }
double toc() {
std::clock_t now = std::clock();
return (double)(now-_start) / CLOCKS_PER_SEC;
}
#endif
// ----------------------------------------------------------------------------
/// Computes an estimates of the precision by transforming a large number of
/// points/vectors using both the exact and approximate mappings. The error
/// is measured as Euclidian distance in 3D (approximately the same as arc
/// length on the unit sphere since the errors are very small).
// ----------------------------------------------------------------------------
void precision()
{
const int N = 30000; // NxN points are used
double maxDist1=0.0, maxDist2=0.0;
double avgDist1=0.0, avgDist2=0.0;
int count = 0;
for(int y=0; y<N; y++)
{
for(int x=0; x<N; x++)
{
// Compute original point in 2D and exact vector in 3D.
// The unit square is sampled using NxN stratified random points.
vec2f p = vec2f( ((float)x+uniform())/N, ((float)y+uniform())/N );
vec3f d = sqr2sph(p);
vec2f_4 p4(p); // p4 = original point as SSE vector
vec3f_4 d4(d); // d4 = original vector as SSE vector
// Test precision of SSE-optimized version.
vec2f_4 p4q = sph2sqr_sse(d4); // p4q = approx pos in 2D (SSE)
vec3f_4 d4q = sqr2sph_sse(p4); // d4q = approx vector in 3D (SSE)
vec3f dq1 = sqr2sph(p4q.at(0)); // dq1 = 3D vector corresponding to approx pos
vec3f dq2 = d4q.at(0); // dq2 = 3D vector corresponding to approx vector
// Test precision of optimized scalar version (should be the same as the SSE version).
// vec2f pq = sph2sqr_fast(d); // pq = approx pos in 2D (scalar)
// vec3f dq = sqr2sph_fast(p); // dq = approx vector in 3D (scalar)
// vec3f dq1 = sqr2sph(pq); // dq1 = 3D vector corresponding to approx pos
// vec3f dq2 = dq; // dq2 = 3D vector corresponding to approx vector
// Compute errors as max and average Euclidian distance.
double d1 = (d-dq1).len();
double d2 = (d-dq2).len();
if(d1 > maxDist1) maxDist1 = d1;
if(d2 > maxDist2) maxDist2 = d2;
avgDist1 += d1;
avgDist2 += d2;
count++;
}
}
avgDist1 /= (double)count;
avgDist2 /= (double)count;
cout << "Precision of sphere->square transform" << endl;
cout << " max distance: " << maxDist1 << endl;
cout << " avg distance: " << avgDist1 << endl;
cout << endl;
cout << "Precision of square->sphere transform" << endl;
cout << " max distance: " << maxDist2 << endl;
cout << " avg distance: " << avgDist2 << endl;
cout << endl;
}
// ----------------------------------------------------------------------------
/// Performs benchmarking of the different implementations using a specified
/// number of points/vectors. The tests are repeted M times to get better
/// accuracy. The timings are presented as clock cycles / transform, hence
/// we need to current CPU frequency (in Hz)
// ----------------------------------------------------------------------------
void benchmark(int N, int numIter, double cpuFreq)
{
// Allocate temporary storage
vec2f* pos = new vec2f[N];
vec3f* dir = new vec3f[N];
vec2f_4* pos4 = new vec2f_4[N/4];
vec3f_4* dir4 = new vec3f_4[N/4];
double total = (double)N * numIter;
double t1,t2,t3,c1,c2,c3;
// -------------------------------
// Test square -> sphere transform
// -------------------------------
cout << "Testing the square to sphere transform ..." << endl;
cout << " #points : " << N << endl;
cout << " #iterations : " << numIter << endl;
// Setup array of N random 2D points in [0,1].
for(int i=0; i<N; i++)
{
vec2f p(uniform(),uniform());
pos[i] = p;
pos4[i/4].fx[i&3] = p.x;
pos4[i/4].fy[i&3] = p.y;
}
// Perform two passes of the benchmarking to make sure the cache
// is initialized before making the final run.
for(int j=0; j<2; j++)
{
// Timing of scalar version
tic();
for(int i=0; i<numIter; i++)
{
for(int j=0; j<N; j++)
dir[j] = sqr2sph( pos[j] );
}
t1 = toc();
// Timing of optimized scalar version
tic();
for(int i=0; i<numIter; i++)
{
for(int j=0; j<N; j++)
dir[j] = sqr2sph_fast( pos[j] );
}
t2 = toc();
// Timing of SSE version
tic();
for(int i=0; i<numIter; i++)
{
for(int j=0; j<N/4; j++)
dir4[j] = sqr2sph_sse( pos4[j] );
}
t3 = toc();
}
// Compute number of clock cycles / transform.
total = (double)N * numIter;
c1 = cpuFreq*t1 / total;
c2 = cpuFreq*t2 / total;
c3 = cpuFreq*t3 / total;
// Print results.
cout << " time normal scalar : " << t1 << " s" << endl;
cout << " time fast scalar : " << t2 << " s" << endl;
cout << " time fast SSE : " << t3 << " s" << endl;
cout << " cycles normal scalar : " << c1 << endl;
cout << " cycles fast scalar : " << c2 << endl;
cout << " cycles fast SSE : " << c3 << endl;
cout << endl;
// -------------------------------
// Test sphere -> square transform
// -------------------------------
cout << "Testing the sphere to square transform ..." << endl;
cout << " #vectors : " << N << endl;
cout << " #iterations : " << numIter << endl;
// Setup array of N random 3D vectors on the sphere.
for(int i=0; i<N; i++)
{
vec2f p(uniform(),uniform());
vec3f d = sqr2sph(p);
d.normalize(); // just in case
dir[i] = d;
dir4[i/4].setAt(i&3,d);
}
// Perform two passes of the benchmarking to make sure the cache
// is initialized before making the final run.
for(int j=0; j<2; j++)
{
// Timing of scalar version
tic();
for(int i=0; i<numIter; i++)
{
for(int j=0; j<N; j++)
pos[j] = sph2sqr( dir[j] );
}
t1 = toc();
// Timing of optimized scalar version
tic();
for(int i=0; i<numIter; i++)
{
for(int j=0; j<N; j++)
pos[j] = sph2sqr_fast( dir[j] );
}
t2 = toc();
// Timing of SSE version
tic();
for(int i=0; i<numIter; i++)
{
for(int j=0; j<N/4; j++)
pos4[j] = sph2sqr_sse( dir4[j] );
}
t3 = toc();
}
// Compute number of clock cycles / transform.
c1 = cpuFreq*t1 / total;
c2 = cpuFreq*t2 / total;
c3 = cpuFreq*t3 / total;
// Print results.
cout << " time normal scalar : " << t1 << " s" << endl;
cout << " time fast scalar : " << t2 << " s" << endl;
cout << " time fast SSE : " << t3 << " s" << endl;
cout << " cycles normal scalar : " << c1 << endl;
cout << " cycles fast scalar : " << c2 << endl;
cout << " cycles fast SSE : " << c3 << endl;
cout << endl;
// Delete buffers.
delete[] pos;
delete[] dir;
delete[] pos4;
delete[] dir4;
}
// ----------------------------------------------------------------------------
int main (int argc, char * const argv[])
{
cout << "Test program for SIMD mapping." << endl;
cout << "Written by Petrik Clarberg <[email protected]>, Lund University, 2007-2008." << endl;
cout << "Released as public domain, without warranty, free of charge for any use." << endl;
cout << endl;
// Estimate the max and avergae approximation errors.
// This function can take a while (couple of minutes).
cout << "Estimating the approximation errors" << endl;
precision();
// Benchmark the implementations using different dataset sizes.
// We ask the user about the CPU speed first.
double cpuSpeed;
cout << "What is your CPU frequency in GHz? " << flush;
cin >> cpuSpeed;
cpuSpeed *= 1.0e9;
benchmark(1<<8, 1<<22, cpuSpeed); // 256
benchmark(1<<12, 1<<18, cpuSpeed); // 4 k
benchmark(1<<16, 1<<14, cpuSpeed); // 64 k
benchmark(1<<20, 1<<10, cpuSpeed); // 1 M
benchmark(1<<24, 1<<6, cpuSpeed); // 16 M
// Test bilinear interpolation
// setup_bilerp(8); // set map size to 2^8=256 pixels
// vec2f pos(0.1f,0.7f); // 2D coordinates of the point to be interpolated
// int32_4 idx;
// f32_4 w;
// bilerp(pos, idx, w); // idx & w holds the pixel indices and weights
// cout << "indices : " << idx << endl;
// cout << "weights : " << w << endl;
return 0;
}