-
Notifications
You must be signed in to change notification settings - Fork 76
/
Copy pathvecsse.h
367 lines (304 loc) · 10.4 KB
/
vecsse.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
/*
* Contains simple wrapper and vector classes using SSE and unions to provide
* access to the individual elements. Many of the functions are provided solely
* for convenience, and should not be used in performance critical code.
*
* Written by Petrik Clarberg <[email protected]>, Lund University, 2007-2008.
* This code is released as public domain for use free of charge for any
* purpose, but without any kind of warranty.
*/
#ifndef __VECSSE_H__
#define __VECSSE_H__
#include <cmath>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include "vec.h"
// Include the highest level of SSEx header enabled.
#if defined(__SSE2__)
#include <emmintrin.h> // SSE2
#elif defined(__SSE__)
#include <xmmintrin.h> // SSE
#endif
// Portable 16-byte alignment specifier.
// MSVC only understands __declspec(align(N)) and does not predefine
// __WIN32__ (it predefines _WIN32 and _MSC_VER), so it is detected through
// _MSC_VER. Toolchains that define __WIN32__ keep using the __declspec form
// as before; everything else uses the GCC-style attribute.
#if defined(_MSC_VER) || defined(__WIN32__)
#define __align16 __declspec(align(16))
#else
#define __align16 __attribute__((aligned(16)))
#endif
/// Returns true if the pointer p is aligned to a 16-byte boundary.
///
/// The address is converted to std::uintptr_t rather than int, so the test
/// is well-formed on targets where pointers are wider than int (a cast to
/// int is rejected by GCC/Clang on 64-bit platforms). Only the low four
/// address bits are inspected; the pointee is never accessed.
inline bool is_aligned16(const void* p)
{
    return (reinterpret_cast<std::uintptr_t>(p) & 0xFu) == 0;
}
namespace mapping
{
#ifdef __SSE__
// Define a multiply-add (MADD) instruction for convenience, as it is not
// supported by SSE/SSE2. Returns a+b*c, where a,b,c are packed floats.
// The macro expands to _mm_mul_ps(b,c) followed by _mm_add_ps, i.e. two
// separate intrinsics rather than a single fused operation. Each argument
// appears exactly once in the expansion.
#define _mm_madd_ps(a,b,c) _mm_add_ps(a,_mm_mul_ps((b),(c)))
// ------------------------------------------------------------------------
/// Wrapper for __m128 providing easy access to the four float elements,
/// and some useful functions for initialization etc.
///
/// The anonymous union overlays one __m128 with four floats and four
/// unsigned int bit patterns; index 0 is the lowest lane. The default
/// constructor deliberately leaves the contents uninitialized.
// ------------------------------------------------------------------------
class f32_4
{
public:
    union
    {
        __m128 m;              ///< Access as a single 128-bit type.
        float f[4];            ///< Direct access as four floats f3...f0.
        unsigned int bits[4];  ///< Direct access to bitwise representation.
    };

public:
    /// Leaves all four elements uninitialized.
    f32_4() {}
    ~f32_4() {}

    /// Initialize from __m128 type.
    inline f32_4(const __m128& _m) { m = _m; }

    /// Overload assignment operator from __m128.
    inline f32_4& operator= (const __m128& _m) { m = _m; return *this; }

    /// Overload cast to __m128 operator.
    inline operator __m128() const { return m; }

    /// Returns the float at position i=[0,3]. The index is not range-checked.
    inline float at(int i) const { return f[i]; }

    /// Initialize all four floats to the value s.
    inline explicit f32_4(float s)
    {
        m = _mm_set1_ps(s);
    }

    /// Explicit initialization from unsigned int (useful for bit masks).
    /// The bit pattern u is copied verbatim into every lane.
    inline explicit f32_4(unsigned int u)
    {
        // Write every lane explicitly instead of shuffling a register whose
        // upper three lanes were never initialized.
        bits[0] = u;
        bits[1] = u;
        bits[2] = u;
        bits[3] = u;
    }

    /// Initialize the four elements with the values f3...f0 (f0 at LSB).
    inline f32_4(float f3, float f2, float f1, float f0)
    {
        m = _mm_set_ps(f3, f2, f1, f0);
    }

    /// Writes the four elements f3...f0 to an output stream, each in a
    /// field of width 14. The stream precision is set to 6 and is not
    /// restored afterwards.
    friend std::ostream& operator<< (std::ostream& os, const f32_4& rhs)
    {
        os << std::setprecision(6);
        os << std::setw(14) << rhs.f[3] << std::setw(14) << rhs.f[2]
           << std::setw(14) << rhs.f[1] << std::setw(14) << rhs.f[0];
        return os;
    }
}; // class f32_4
// ------------------------------------------------------------------------
/// Four 2D vectors held in structure-of-arrays form: one __m128 with the
/// four x-coordinates and one with the four y-coordinates. Each register
/// is overlaid with a float array (fx[0..3], fy[0..3]) for per-element
/// access.
// ------------------------------------------------------------------------
class vec2f_4
{
public:
    union
    {
        __m128 x;     ///< The four x-coordinates as one 128-bit XMM value.
        float fx[4];  ///< The same x-coordinates as floats fx3...fx0.
    };
    union
    {
        __m128 y;     ///< The four y-coordinates as one 128-bit XMM value.
        float fy[4];  ///< The same y-coordinates as floats fy3...fy0.
    };

public:
    /// Leaves all coordinates uninitialized.
    vec2f_4() {}
    ~vec2f_4() {}

    /// Replicates the single vec2f v into all four slots.
    inline explicit vec2f_4(const vec2f& v)
    {
        x = _mm_set1_ps(v.x);
        y = _mm_set1_ps(v.y);
    }

    /// Takes the packed x- and y-coordinates directly.
    inline vec2f_4(const __m128& _x, const __m128& _y)
    {
        x = _x;
        y = _y;
    }

    /// Extracts the vec2f stored in slot i=[0,3] (no range check).
    inline vec2f at(int i) const
    {
        const float xi = fx[i];
        const float yi = fy[i];
        return vec2f(xi, yi);
    }

    /// Overwrites slot i=[0,3] with the vector v (no range check).
    inline void setAt(int i, const vec2f& v)
    {
        fx[i] = v.x;
        fy[i] = v.y;
    }

    /// Prints one "x y" row per slot, slot 0 first, each value in a field
    /// of width 14. Rows are separated by std::endl; the last row has no
    /// trailing newline. The stream precision is set to 6.
    friend std::ostream& operator<< (std::ostream& os, const vec2f_4& rhs)
    {
        os << std::setprecision(6);
        for (int row = 0; row < 4; ++row)
        {
            if (row != 0)
                os << std::endl;
            os << std::setw(14) << rhs.fx[row] << std::setw(14) << rhs.fy[row];
        }
        return os;
    }
}; // class vec2f_4
// ------------------------------------------------------------------------
/// Vector class for 4x 3D vectors stored as three __m128 values, providing
/// easy access to both the __m128 values (x,y,z) and their individual
/// floats (fx[0..3],fy[0..3],fz[0..3]).
///
/// Slot i of the four vectors is (fx[i], fy[i], fz[i]). The default
/// constructor leaves all coordinates uninitialized.
// ------------------------------------------------------------------------
class vec3f_4
{
public:
    union
    {
        __m128 x;     ///< Access to the x-coord as a 128-bit XMM reg.
        float fx[4];  ///< Direct access to its four floats fx3...fx0.
    };
    union
    {
        __m128 y;     ///< Access to the y-coord as a 128-bit XMM reg.
        float fy[4];  ///< Direct access to its four floats fy3...fy0.
    };
    union
    {
        __m128 z;     ///< Access to the z-coord as a 128-bit XMM reg.
        float fz[4];  ///< Direct access to its four floats fz3...fz0.
    };

public:
    /// Leaves all coordinates uninitialized.
    vec3f_4() {}
    ~vec3f_4() {}

    /// Sets the four 3D vectors to the elements of the vec3f v.
    inline explicit vec3f_4(const vec3f& v)
    {
        x = _mm_set1_ps(v.x);
        y = _mm_set1_ps(v.y);
        z = _mm_set1_ps(v.z);
    }

    /// Sets the (x,y,z) elements to the given values.
    inline vec3f_4(const __m128& _x, const __m128& _y, const __m128& _z)
    {
        x = _x;
        y = _y;
        z = _z;
    }

    /// Returns the vec3f at position i=[0,3]. The index is not range-checked.
    inline vec3f at(int i) const
    {
        return vec3f(fx[i], fy[i], fz[i]);
    }

    /// Sets the 3D vector at position i=[0,3] to v. The index is not
    /// range-checked.
    inline void setAt(int i, const vec3f& v)
    {
        fx[i] = v.x;
        fy[i] = v.y;
        fz[i] = v.z;
    }

    /// Writes the four vectors' elements to an output stream, one
    /// "x y z" row per slot (slot 0 first), each value in a field of
    /// width 14. Rows are separated by std::endl; the last row has no
    /// trailing newline. The stream precision is set to 6.
    friend std::ostream& operator<< (std::ostream& os, const vec3f_4& rhs)
    {
        os << std::setprecision(6);
        for (int row = 0; row < 4; ++row)
        {
            if (row != 0)
                os << std::endl;
            // Every column is indexed by the row, so each slot prints its
            // own z value rather than repeating fz[0].
            os << std::setw(14) << rhs.fx[row]
               << std::setw(14) << rhs.fy[row]
               << std::setw(14) << rhs.fz[row];
        }
        return os;
    }
}; // class vec3f_4
#endif // __SSE__
#ifdef __SSE2__
// ------------------------------------------------------------------------
/// Wrapper for __m128i providing easy access to the four int32 elements,
/// and some useful functions for initialization etc.
///
/// The anonymous union overlays one __m128i with four ints; index 0 is
/// the lowest dword. The default constructor leaves the contents
/// uninitialized.
// ------------------------------------------------------------------------
class int32_4
{
public:
    union
    {
        __m128i m;  ///< Access as a single 128-bit integer type.
        int w[4];   ///< Direct access as four int32 words w3...w0.
    };

public:
    /// Leaves all four elements uninitialized.
    int32_4() {}
    ~int32_4() {}

    /// Initialize from __m128i type.
    inline int32_4(const __m128i& _m) { m = _m; }

    /// Overload assignment operator from __m128i.
    inline int32_4& operator= (const __m128i& _m) { m = _m; return *this; }

    /// Overload cast to __m128i operator.
    inline operator __m128i() const { return m; }

    /// Returns the integer at position i=[0,3]. The index is not
    /// range-checked.
    inline int at(int i) const { return w[i]; }

    /// Initialize all four integers to the value _w.
    inline explicit int32_4(int _w)
    {
        // Broadcast directly instead of writing w[0] and shuffling a
        // register whose other three lanes were never initialized.
        m = _mm_set1_epi32(_w);
    }

    /// Initialize the four elements with the values w3...w0 (w0 at LSB).
    inline int32_4(int w3, int w2, int w1, int w0)
    {
        m = _mm_set_epi32(w3, w2, w1, w0);
    }

    /// Writes the four integer elements to an output stream, each in a
    /// field of width 14.
    /// The order is w3...w0, where w0 is the lowest dword (LSB).
    friend std::ostream& operator<< (std::ostream& os, const int32_4& rhs)
    {
        os << std::setw(14) << rhs.w[3] << std::setw(14) << rhs.w[2]
           << std::setw(14) << rhs.w[1] << std::setw(14) << rhs.w[0];
        return os;
    }
}; // class int32_4
// ------------------------------------------------------------------------
/// Wrapper for __m128i providing easy access as eight int16 elements,
/// and some useful functions for initialization etc.
///
/// The anonymous union overlays one __m128i with eight shorts; index 0 is
/// the lowest word. The default constructor leaves the contents
/// uninitialized.
// ------------------------------------------------------------------------
class int16_8
{
public:
    union
    {
        __m128i m;   ///< Access as a single 128-bit integer type.
        short w[8];  ///< Direct access as eight int16 words w7...w0.
    };

public:
    /// Leaves all eight elements uninitialized.
    int16_8() {}
    ~int16_8() {}

    /// Initialize from __m128i type.
    inline int16_8(const __m128i& _m) { m = _m; }

    /// Overload assignment operator from __m128i.
    inline int16_8& operator= (const __m128i& _m) { m = _m; return *this; }

    /// Overload cast to __m128i operator.
    inline operator __m128i() const { return m; }

    /// Returns the word at position i=[0,7]. The index is not range-checked.
    inline short at(int i) const { return w[i]; }

    /// Initialize all eight words to the value _w.
    /// _w is converted to short first, so only values representable as a
    /// short are stored unchanged.
    inline explicit int16_8(int _w)
    {
        // Broadcast directly instead of writing w[0]/w[1] and shuffling a
        // register whose remaining lanes were never initialized. The cast
        // makes the int -> short narrowing explicit.
        m = _mm_set1_epi16(static_cast<short>(_w));
    }

    /// Initialize the eight words with the values w7...w0 (w0 at LSB).
    inline int16_8(short w7, short w6, short w5, short w4,
                   short w3, short w2, short w1, short w0)
    {
        m = _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
    }

    /// Writes the eight words to an output stream, each in a field of
    /// width 9 and all on one line.
    /// The order is w7...w0, where w0 is the lowest word (LSB).
    friend std::ostream& operator<< (std::ostream& os, const int16_8& rhs)
    {
        for (int i = 7; i >= 0; --i)
            os << std::setw(9) << rhs.w[i];
        return os;
    }
}; // class int16_8
#endif // __SSE2__
} // namespace mapping
#endif // __VECSSE_H__