Caffe2 - C++ API
A deep learning, cross platform ML framework
avx_mathfun.h
#pragma once
/*
   AVX implementation of sin, cos, sincos, exp and log

   Based on "sse_mathfun.h", by Julien Pommier
   http://gruntthepeon.free.fr/ssemath/

   Copyright (C) 2012 Giovanni Garberoglio
   Interdisciplinary Laboratory for Computational Science (LISC)
   Fondazione Bruno Kessler and University of Trento
   via Sommarive, 18
   I-38123 Trento (Italy)

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.

  (this is the zlib license)
*/

#include <ATen/native/cpu/Intrinsics.h>

/* yes I know, the top of this file is quite ugly */
#if defined(__GNUC__)
# define ALIGN32_BEG __attribute__((aligned(32)))
#elif defined(_WIN32)
# define ALIGN32_BEG __declspec(align(32))
#endif

/* __m256 is ugly to write */
typedef __m256  v8sf; // vector of 8 floats (avx)
typedef __m256i v8si; // vector of 8 ints   (avx)
typedef __m128i v4si; // vector of 4 ints   (sse2)

#define _PI32AVX_CONST(Name, Val) \
  static const ALIGN32_BEG int _pi32avx_##Name[4] = { Val, Val, Val, Val }

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);


/* declare some AVX constants -- why can't I figure a better way to do that? */
#define _PS256_CONST(Name, Val) \
  static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val) \
  static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
  static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }

_PS256_CONST(1, 1.0f);
_PS256_CONST(0p5, 0.5f);
/* the smallest non-denormalized float number */
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);

_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);

#ifndef __AVX2__

typedef union imm_xmm_union {
  v8si imm;
  v4si xmm[2];
} imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) {         \
    imm_xmm_union u __attribute__((aligned(32)));     \
    u.imm = imm_;                                     \
    xmm0_ = u.xmm[0];                                 \
    xmm1_ = u.xmm[1];                                 \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) {         \
    imm_xmm_union u __attribute__((aligned(32)));     \
    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm;     \
  }


#define AVX2_BITOP_USING_SSE2(fn)                           \
static inline v8si _mm256_##fn(v8si x, int a)               \
{                                                           \
  /* use SSE2 instructions to perform the AVX2 bitop */     \
  v4si x1, x2;                                              \
  v8si ret;                                                 \
  COPY_IMM_TO_XMM(x, x1, x2);                               \
  x1 = _mm_##fn(x1,a);                                      \
  x2 = _mm_##fn(x2,a);                                      \
  COPY_XMM_TO_IMM(x1, x2, ret);                             \
  return(ret);                                              \
}

#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)

#define AVX2_INTOP_USING_SSE2(fn)                                   \
static inline v8si _mm256_##fn(v8si x, v8si y)                      \
{                                                                   \
  /* use SSE2 instructions to perform the AVX2 integer operation */ \
  v4si x1, x2;                                                      \
  v4si y1, y2;                                                      \
  v8si ret;                                                         \
  COPY_IMM_TO_XMM(x, x1, x2);                                       \
  COPY_IMM_TO_XMM(y, y1, y2);                                       \
  x1 = _mm_##fn(x1,y1);                                             \
  x2 = _mm_##fn(x2,y2);                                             \
  COPY_XMM_TO_IMM(x1, x2, ret);                                     \
  return(ret);                                                      \
}

#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
AVX2_INTOP_USING_SSE2(sub_epi32)
AVX2_INTOP_USING_SSE2(add_epi32)

#endif /* __AVX2__ */

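/* Illustrative sketch (editor's example, NOT part of the original
   avx_mathfun.h): the same 128-bit split/merge that the COPY_IMM_TO_XMM /
   COPY_XMM_TO_IMM macros above perform through a union can also be written
   with AVX1 cast/extract/insert intrinsics. The function name
   add_epi32_avx1_example is made up for this sketch. */
static inline v8si add_epi32_avx1_example(v8si x, v8si y) {
  v4si xlo = _mm256_castsi256_si128(x);       // low 128 bits of x
  v4si xhi = _mm256_extractf128_si256(x, 1);  // high 128 bits of x
  v4si ylo = _mm256_castsi256_si128(y);
  v4si yhi = _mm256_extractf128_si256(y, 1);
  xlo = _mm_add_epi32(xlo, ylo);              // SSE2 add on each half
  xhi = _mm_add_epi32(xhi, yhi);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(xlo), xhi, 1);
}
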

/* natural logarithm computed for 8 simultaneous floats;
   returns NaN for x <= 0
*/
inline v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */

  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
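
/* Usage sketch (editor's example, NOT part of the original avx_mathfun.h):
   computing the natural log of 8 packed floats with log256_ps. The helper
   name log8_example and the unaligned load/store are assumptions made for
   this example only. */
static inline void log8_example(const float* in, float* out) {
  v8sf v = _mm256_loadu_ps(in); // load 8 floats, no alignment requirement
  v = log256_ps(v);             // 8 natural logs; NaN for inputs <= 0
  _mm256_storeu_ps(out, v);     // write the 8 results back
}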

_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);

inline v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp  = _mm256_cvtepi32_ps(imm0);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
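
/* Usage sketch (editor's example, NOT part of the original avx_mathfun.h):
   applying exp256_ps over a buffer, 8 floats per iteration. For simplicity
   this sketch assumes n is a multiple of 8; a real caller would handle the
   remaining elements separately. The function name is made up. */
static inline void exp_buffer_example(const float* in, float* out, int n) {
  for (int i = 0; i + 8 <= n; i += 8) {
    v8sf v = exp256_ps(_mm256_loadu_ps(in + i)); // exp of 8 floats at once
    _mm256_storeu_ps(out + i, v);
  }
}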

_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


/* evaluation of 8 sines at once using AVX intrinsics

   The code is an exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

*/
inline v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX2, let's perform them using SSE2 directives.
  */

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instructions
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
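
/* Usage sketch (editor's example, NOT part of the original avx_mathfun.h):
   evaluating 8 sines at once. The results can be compared element-wise
   against scalar sinf() to check the precision note above; the helper name
   and buffers are assumptions for this example. */
static inline void sin8_example(const float* angles, float* out) {
  v8sf v = _mm256_loadu_ps(angles);    // 8 angles in radians
  _mm256_storeu_ps(out, sin256_ps(v)); // 8 sines
}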

/* almost the same as sin256_ps */
inline v8sf cos256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);

  /* get the swap sign flag */
  imm0 = _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else

  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si*)_pi32avx_2);

  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}

/* Since sin256_ps and cos256_ps are almost identical, sincos256_ps could
   replace both of them. It is almost as fast, and gives you a free cosine
   with your sine. */
inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2, ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1, ysin2);
  xmm2 = _mm256_add_ps(y, y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
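
/* Usage sketch (editor's example, NOT part of the original avx_mathfun.h):
   sincos256_ps returns both the sine and the cosine of 8 inputs for roughly
   the cost of one of them. The helper name and output buffers are made up
   for this example. */
static inline void sincos8_example(const float* in, float* s_out, float* c_out) {
  v8sf s, c;
  sincos256_ps(_mm256_loadu_ps(in), &s, &c);
  _mm256_storeu_ps(s_out, s); // 8 sines
  _mm256_storeu_ps(c_out, c); // 8 cosines
}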