#include <ATen/native/cpu/Intrinsics.h>

#if defined(__GNUC__)
# define ALIGN32_BEG __attribute__((aligned(32)))
#elif defined(_WIN32)
# define ALIGN32_BEG __declspec(align(32))
#endif

typedef __m256  v8sf;  // vector of 8 floats (AVX)
typedef __m256i v8si;  // vector of 8 32-bit ints (AVX)
typedef __m128i v4si;  // vector of 4 32-bit ints (SSE2)

#define _PI32AVX_CONST(Name, Val) \
  static const ALIGN32_BEG int _pi32avx_##Name[4] = { Val, Val, Val, Val }

_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);
#define _PS256_CONST(Name, Val) \
  static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PI32_CONST256(Name, Val) \
  static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
  static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val }

_PS256_CONST(1  , 1.0f);
_PS256_CONST(0p5, 0.5f);
_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST256(0, 0);
_PI32_CONST256(1, 1);
_PI32_CONST256(inv1, ~1);
_PI32_CONST256(2, 2);
_PI32_CONST256(4, 4);
_PI32_CONST256(0x7f, 0x7f);
_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
_PS256_CONST(cephes_log_q1, -2.12194440e-4);
_PS256_CONST(cephes_log_q2, 0.693359375);
#ifndef __AVX2__

typedef union imm_xmm_union {
  v8si imm;
  v4si xmm[2];
} imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
    imm_xmm_union u __attribute__((aligned(32))); \
    u.imm = imm_; \
    xmm0_ = u.xmm[0]; \
    xmm1_ = u.xmm[1]; \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
    imm_xmm_union u __attribute__((aligned(32))); \
    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
}

/* emulate a missing AVX2 bit-shift intrinsic with two SSE2 shifts */
#define AVX2_BITOP_USING_SSE2(fn) \
static inline v8si _mm256_##fn(v8si x, int a) \
{ \
  v4si x1, x2; \
  v8si ret; \
  COPY_IMM_TO_XMM(x, x1, x2); \
  x1 = _mm_##fn(x1,a); \
  x2 = _mm_##fn(x2,a); \
  COPY_XMM_TO_IMM(x1, x2, ret); \
  return ret; \
}

#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2(slli_epi32)
AVX2_BITOP_USING_SSE2(srli_epi32)

/* emulate a missing AVX2 integer intrinsic with two SSE2 operations */
#define AVX2_INTOP_USING_SSE2(fn) \
static inline v8si _mm256_##fn(v8si x, v8si y) \
{ \
  v4si x1, x2; \
  v4si y1, y2; \
  v8si ret; \
  COPY_IMM_TO_XMM(x, x1, x2); \
  COPY_IMM_TO_XMM(y, y1, y2); \
  x1 = _mm_##fn(x1,y1); \
  x2 = _mm_##fn(x2,y2); \
  COPY_XMM_TO_IMM(x1, x2, ret); \
  return ret; \
}

#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2(and_si128)
AVX2_INTOP_USING_SSE2(andnot_si128)
AVX2_INTOP_USING_SSE2(cmpeq_epi32)
AVX2_INTOP_USING_SSE2(sub_epi32)
AVX2_INTOP_USING_SSE2(add_epi32)

#endif /* __AVX2__ */
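
/* natural logarithm computed for 8 floats at once; returns NaN for x <= 0 */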
inline v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized values */
  /* extract the exponent field */
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);
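
  /* part 2 of the cephes log algorithm, computed branch-free:
       if (x < SQRTHF) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; }
  */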
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x, x);
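
  /* evaluate the degree-8 minimax polynomial in x (Horner scheme) */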
  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask);  // negative args become NaN
  return x;
}
_PS256_CONST(exp_hi, 88.3762626647949f);
_PS256_CONST(exp_lo, -88.3762626647949f);

_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS256_CONST(cephes_exp_C1, 0.693359375);
_PS256_CONST(cephes_exp_C2, -2.12194440e-4);

_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
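
/* exp() computed for 8 floats at once; the argument is clamped to [exp_lo, exp_hi] */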
inline v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);
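
  /* express exp(x) as exp(g + n*log(2)): fx = round(x * log2(e)) via floor(x*log2(e) + 0.5) */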
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  tmp = _mm256_floor_ps(fx);

  /* if greater, subtract 1 */
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);
  z = _mm256_mul_ps(x, x);

  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);
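
  /* build 2^n by placing n + 127 into the exponent field */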
  imm0 = _mm256_cvttps_epi32(fx);
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1, 8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0, 2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2, 4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
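
/* evaluation of 8 sines at once using AVX intrinsics.

   This follows the cephes sinf routine: the argument is reduced with a
   three-term Cody-Waite split of Pi/4, so accuracy degrades for very
   large |x|. */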
inline v8sf sin256_ps(v8sf x) {
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j = (j+1) & (~1), see the cephes sources */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask:
     one polynomial for 0 <= x <= Pi/4, another for Pi/4 < x <= Pi/2;
     both branches will be computed */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* on pre-AVX2 hardware, perform the integer ops with SSE2 */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  /* j = (j+1) & (~1), see the cephes sources */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* get the polynomial selection mask */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
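
  /* the magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3 */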
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
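
  /* evaluate the first polynomial (the cosine approximation, 0 <= x <= Pi/4) */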
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
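
  /* evaluate the second polynomial (the sine approximation) */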
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);

  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);
  return y;
}
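
/* almost the same as sin256_ps */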
inline v8sf cos256_ps(v8sf x) {
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j = (j+1) & (~1), see the cephes sources */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm2 = _mm256_sub_epi32(imm2, *(v8si*)_pi32_256_2);

  /* get the swap sign flag */
  imm0 = _mm256_andnot_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* on pre-AVX2 hardware, perform the integer ops with SSE2 */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  /* j = (j+1) & (~1), see the cephes sources */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si*)_pi32avx_2);

  /* get the swap sign flag */
  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* get the polynomial selection mask */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
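
  /* the magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3 */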
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
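
  /* evaluate the first polynomial (the cosine approximation, 0 <= x <= Pi/4) */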
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
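
  /* evaluate the second polynomial (the sine approximation) */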
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);

  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);
  return y;
}
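
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps computes
   both at once for the cost of a single argument reduction */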
inline void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);
#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j = (j+1) & (~1), see the cephes sources */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* on pre-AVX2 hardware, perform the integer ops with SSE2 */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  /* j = (j+1) & (~1), see the cephes sources */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  /* get the swap sign flag for the sine */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* get the polynomial selection mask for the sine */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
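
  /* the magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3 */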
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  /* compute the sign flag for the cosine */
#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif
  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
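
  /* evaluate the first polynomial (the cosine approximation, 0 <= x <= Pi/4) */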
  v8sf z = _mm256_mul_ps(x, x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
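
  /* evaluate the second polynomial (the sine approximation) */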
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);
  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2, ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1, ysin2);
  xmm2 = _mm256_add_ps(y, y2);

  /* update the signs */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
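
/* Usage sketch (illustrative, not part of the original header): a minimal
   example of how these kernels are typically driven from a loop over a float
   buffer. The function and buffer names are hypothetical, and tail elements
   (n % 8) are left to the caller. Kept inside "#if 0" since it is a sketch,
   not an exported API of this header. */
#if 0
static inline void sin_blocks_of_8(float* buf, int n) {
  for (int i = 0; i + 8 <= n; i += 8) {
    v8sf v = _mm256_loadu_ps(buf + i);  // unaligned load of 8 floats
    v = sin256_ps(v);                   // 8 sines in one call
    _mm256_storeu_ps(buf + i, v);       // write the 8 results back
  }
}
#endif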