/* NEON implementation of sin, cos, exp and log

   Inspired by Intel Approximate Math library, and based on the
   corresponding algorithms of the cephes math library
*/

/* Copyright (C) 2011 Julien Pommier

   This software is provided 'as-is', without any express or implied
   warranty. In no event will the authors be held liable for any damages
   arising from the use of this software.

   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.

   (this is the zlib license)
*/

#include <arm_neon.h>

typedef float32x4_t v4sf;  // vector of 4 float
typedef uint32x4_t  v4su;  // vector of 4 uint32
typedef int32x4_t   v4si;  // vector of 4 int32
#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 -1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 -1.2420140846E-1
#define c_cephes_log_p4 1.4249322787E-1
#define c_cephes_log_p5 -1.6668057665E-1
#define c_cephes_log_p6 2.0000714765E-1
#define c_cephes_log_p7 -2.4999993993E-1
#define c_cephes_log_p8 3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375

/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
v4sf log_ps(v4sf x) {
  v4sf one = vdupq_n_f32(1);

  x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
  v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));

  v4si ux = vreinterpretq_s32_f32(x);

  v4si emm0 = vshrq_n_s32(ux, 23);

  /* keep only the fractional part */
  ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
  ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
  x = vreinterpretq_f32_s32(ux);

  emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
  v4sf e = vcvtq_f32_s32(emm0);

  e = vaddq_f32(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
  v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
  x = vsubq_f32(x, one);
  e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
  x = vaddq_f32(x, tmp);

  v4sf z = vmulq_f32(x, x);

  v4sf y = vdupq_n_f32(c_cephes_log_p0);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
  y = vmulq_f32(y, x);

  y = vmulq_f32(y, z);

  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
  y = vaddq_f32(y, tmp);

  tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
  y = vsubq_f32(y, tmp);

  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
  x = vaddq_f32(x, y);
  x = vaddq_f32(x, tmp);
  x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NaN
  return x;
}
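
/* Usage sketch (illustrative only, not part of the original library; the
   array names below are placeholders). log_ps evaluates ln() on 4 packed
   floats at once:

     float in[4]  = { 1.0f, 2.0f, 4.0f, 0.0f };
     float out[4];
     vst1q_f32(out, log_ps(vld1q_f32(in)));
     // out ~= { 0.0f, 0.6931f, 1.3863f, NaN }  -- x <= 0 yields NaN
*/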
#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f

#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4

#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1

/* exp() computed for 4 floats at once */
v4sf exp_ps(v4sf x) {
  v4sf tmp, fx;

  v4sf one = vdupq_n_f32(1);
  x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
  x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));

  /* express exp(x) as exp(g + n*log(2)) */
  fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));

  /* perform a floorf */
  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));

  /* if greater, subtract 1 */
  v4su mask = vcgtq_f32(tmp, fx);
  mask = vandq_u32(mask, vreinterpretq_u32_f32(one));

  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));

  tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
  v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
  x = vsubq_f32(x, tmp);
  x = vsubq_f32(x, z);

  static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
  v4sf y = vld1q_dup_f32(cephes_exp_p+0);
  v4sf c1 = vld1q_dup_f32(cephes_exp_p+1);
  v4sf c2 = vld1q_dup_f32(cephes_exp_p+2);
  v4sf c3 = vld1q_dup_f32(cephes_exp_p+3);
  v4sf c4 = vld1q_dup_f32(cephes_exp_p+4);
  v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);

  y = vmulq_f32(y, x);
  z = vmulq_f32(x, x);
  y = vaddq_f32(y, c1);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, c2);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, c3);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, c4);
  y = vmulq_f32(y, x);
  y = vaddq_f32(y, c5);

  y = vmulq_f32(y, z);
  y = vaddq_f32(y, x);
  y = vaddq_f32(y, one);

  /* build 2^n */
  int32x4_t mm;
  mm = vcvtq_s32_f32(fx);
  mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
  mm = vshlq_n_s32(mm, 23);
  v4sf pow2n = vreinterpretq_f32_s32(mm);

  y = vmulq_f32(y, pow2n);
  return y;
}
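
/* Usage sketch (illustrative only, not part of the original library; the
   array names below are placeholders). exp_ps clamps its argument to
   [c_exp_lo, c_exp_hi] before evaluating, so out-of-range inputs saturate
   instead of producing infinities:

     float in[4]  = { 0.0f, 1.0f, -1.0f, 200.0f };
     float out[4];
     vst1q_f32(out, exp_ps(vld1q_f32(in)));
     // out ~= { 1.0f, 2.7183f, 0.3679f, exp(88.376f) }  -- last lane was clamped
*/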
#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0 -1.9515295891E-4
#define c_sincof_p1  8.3321608736E-3
#define c_sincof_p2 -1.6666654611E-1
#define c_coscof_p0  2.443315711809948E-005
#define c_coscof_p1 -1.388731625493765E-003
#define c_coscof_p2  4.166664568298827E-002
#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI

/* evaluation of 4 sines & cosines at once.

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, but the
   extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Note also that when you compute sin(x), cos(x) is available at
   almost no extra cost, so both sin_ps and cos_ps make use of
   sincos_ps.
*/
void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
  v4sf xmm1, xmm2, xmm3, y;

  v4su emm2;

  v4su sign_mask_sin, sign_mask_cos;
  sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
  x = vabsq_f32(x);

  /* scale by 4/Pi */
  y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));

  /* store the integer part of y in emm2 */
  emm2 = vcvtq_u32_f32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
  y = vcvtq_f32_u32(emm2);

  /* get the polynomial selection mask:
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  v4su poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
  xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
  xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
  x = vaddq_f32(x, xmm1);
  x = vaddq_f32(x, xmm2);
  x = vaddq_f32(x, xmm3);

  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));

  /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
     and the second polynomial (Pi/4 <= x <= Pi/2) in y2 */
  v4sf z = vmulq_f32(x, x);
  v4sf y1, y2;

  y1 = vmulq_n_f32(z, c_coscof_p0);
  y2 = vmulq_n_f32(z, c_sincof_p0);
  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
  y1 = vmulq_f32(y1, z);
  y2 = vmulq_f32(y2, z);
  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
  y1 = vmulq_f32(y1, z);
  y2 = vmulq_f32(y2, z);
  y1 = vmulq_f32(y1, z);
  y2 = vmulq_f32(y2, x);
  y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
  y2 = vaddq_f32(y2, x);
  y1 = vaddq_f32(y1, vdupq_n_f32(1));

  /* select the correct result from the two polynomials */
  v4sf ys = vbslq_f32(poly_mask, y1, y2);
  v4sf yc = vbslq_f32(poly_mask, y2, y1);
  *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
  *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}

v4sf sin_ps(v4sf x) {
  v4sf ysin, ycos;
  sincos_ps(x, &ysin, &ycos);
  return ysin;
}

v4sf cos_ps(v4sf x) {
  v4sf ysin, ycos;
  sincos_ps(x, &ysin, &ycos);
  return ycos;
}
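
/* Usage sketch (illustrative only, not part of the original library; the
   array names below are placeholders). Since cos(x) comes almost for free
   with sin(x), prefer a single sincos_ps call over separate sin_ps and
   cos_ps calls when both values are needed:

     float angles[4] = { 0.0f, 0.5f, 1.0f, 3.14159265f };
     float s[4], c[4];
     v4sf vs, vc;
     sincos_ps(vld1q_f32(angles), &vs, &vc);
     vst1q_f32(s, vs);   // s ~= { 0.0f, 0.4794f, 0.8415f, 8.74e-8f }
     vst1q_f32(c, vc);   // c ~= { 1.0f, 0.8776f, 0.5403f, -1.0f }
*/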