#ifndef __VM_AVXFunc__
#define __VM_AVXFunc__

// SYS support headers (SYS_FORCE_INLINE, SYS_ALIGN, int32/uint32).  The
// exact include list was lost in extraction; these two are assumed.
#include <SYS/SYS_Inline.h>
#include <SYS/SYS_Types.h>

#define CPU_HAS_AVX_INSTR 1
#define VM_AVX_STYLE 1

#include <immintrin.h>

typedef __m256  v8sf;
typedef __m256i v8si;
// Plain bit-pattern casts between the float and integer views of a 256-bit
// register.  MSVC cannot cast directly between __m256 and __m256i, so it
// reinterprets through a union; elsewhere a C-style cast suffices.
#if defined(_MSC_VER)

static SYS_FORCE_INLINE v8sf
vm_v8sf(const v8si &a)
{
    union { v8si ival; v8sf fval; } v;
    v.ival = a;
    return v.fval;
}

static SYS_FORCE_INLINE v8si
vm_v8si(const v8sf &a)
{
    union { v8si ival; v8sf fval; } v;
    v.fval = a;
    return v.ival;
}

#define V8SF(A) vm_v8sf(A)
#define V8SI(A) vm_v8si(A)

#else

#define V8SF(A) (v8sf)A
#define V8SI(A) (v8si)A

#endif
// Compose a 256-bit integer register from two 128-bit halves
// (v0 becomes the upper half, v1 the lower half).
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)

#define VM_SHUFFLE_MASK_AVX(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))
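
// VM_SHUFFLE_MASK_AVX packs four 2-bit lane selectors into the immediate
// expected by _mm256_shuffle_ps.  Note that _mm256_shuffle_ps applies the
// same selection independently to each 128-bit half: the low two selectors
// pick lanes from the first operand, the high two from the second.
// For example (illustrative, not from the original header),
//     vm_shuffle_avx<0,1, 0,1>(a, b)
// yields { a0, a1, b0, b1, a4, a5, b4, b5 }.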
template <int mask>
static SYS_FORCE_INLINE v8sf
vm_shuffle_avx(const v8sf &a, const v8sf &b)
{
    return _mm256_shuffle_ps(a, b, mask);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a, const T &b)
{
    return vm_shuffle_avx<VM_SHUFFLE_MASK_AVX(A,B,C,D)>(a, b);
}

template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<A,B,C,D>(a, a);
}
#if defined(_MSC_VER)
// Some MSVC versions do not provide _mm256_insert_epi32, so go through a
// union instead.
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}
#else
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    switch (n)
    {
        case 0 : return _mm256_insert_epi32(v, a, 0);
        case 1 : return _mm256_insert_epi32(v, a, 1);
        case 2 : return _mm256_insert_epi32(v, a, 2);
        case 3 : return _mm256_insert_epi32(v, a, 3);
        case 4 : return _mm256_insert_epi32(v, a, 4);
        case 5 : return _mm256_insert_epi32(v, a, 5);
        case 6 : return _mm256_insert_epi32(v, a, 6);
        case 7 : return _mm256_insert_epi32(v, a, 7);
    }
    return v;
}
#endif

static SYS_FORCE_INLINE v8sf
vm_insert_avx(const v8sf v, float a, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}
#if defined(_MSC_VER)
// Likewise, some MSVC versions lack _mm256_extract_epi32.
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    return comp[n];
}
#else
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    switch (n)
    {
        case 0 : return _mm256_extract_epi32(v, 0);
        case 1 : return _mm256_extract_epi32(v, 1);
        case 2 : return _mm256_extract_epi32(v, 2);
        case 3 : return _mm256_extract_epi32(v, 3);
        case 4 : return _mm256_extract_epi32(v, 4);
        case 5 : return _mm256_extract_epi32(v, 5);
        case 6 : return _mm256_extract_epi32(v, 6);
        case 7 : return _mm256_extract_epi32(v, 7);
    }
    return 0;
}
#endif

static SYS_FORCE_INLINE float
vm_extract_avx(const v8sf v, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    return comp[n];
}
static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a)
{
    return _mm256_set1_ps(a);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(uint32 a)
{
    // Splat the raw bit pattern by reinterpreting it as a float first.
    union { uint32 uval; float fval; } tmp;
    tmp.uval = a;
    return V8SI(vm_splats_avx(tmp.fval));
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a)
{
    return _mm256_set1_epi32(a);
}

static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a0, float a1, float a2, float a3,
              float a4, float a5, float a6, float a7)
{
    return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a0, int32 a1, int32 a2, int32 a3,
              int32 a4, int32 a5, int32 a6, int32 a7)
{
    return _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
}
static SYS_FORCE_INLINE v8si
vm_load_avx(const int32 v[8])
{
    return _mm256_loadu_si256((v8si *) v);
}

static SYS_FORCE_INLINE v8sf
vm_load_avx(const float v[8])
{
    return _mm256_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store_avx(int32 dst[8], v8si value)
{
    _mm256_storeu_si256((__m256i*) dst, value);
}

static SYS_FORCE_INLINE void
vm_store_avx(float dst[8], v8sf value)
{
    _mm256_storeu_ps(dst, value);
}

static SYS_FORCE_INLINE v8si
vm_izero_avx()
{
    return _mm256_setzero_si256();
}

static SYS_FORCE_INLINE v8sf
vm_zero_avx()
{
    return _mm256_setzero_ps();
}
static SYS_FORCE_INLINE v8sf
vm_negate_avx(v8sf a)
{
    return _mm256_sub_ps(_mm256_setzero_ps(), a);
}

static SYS_FORCE_INLINE v8sf
vm_abs_avx(v8sf a)
{
    return _mm256_max_ps(a, vm_negate_avx(a));
}

// Fast (approximate) division and square root built on the AVX reciprocal
// and reciprocal-square-root estimates.
static SYS_FORCE_INLINE v8sf
vm_fdiv_avx(v8sf a, v8sf b)
{
    return _mm256_mul_ps(a, _mm256_rcp_ps(b));
}

static SYS_FORCE_INLINE v8sf
vm_fsqrt_avx(v8sf a)
{
    return _mm256_rcp_ps(_mm256_rsqrt_ps(a));
}

static SYS_FORCE_INLINE v8sf
vm_madd_avx(v8sf a, v8sf b, v8sf c)
{
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}
// Integer operations without a suitable 256-bit form here are emulated by
// splitting the register into two 128-bit halves and using the SSE op.
#define SSE_WRAPPER_I(NAME, OP) \
static SYS_FORCE_INLINE v8si \
NAME(v8si a, v8si b) \
{ \
    __m128i la = _mm256_extractf128_si256(a, 0); \
    __m128i ua = _mm256_extractf128_si256(a, 1); \
    __m128i lb = _mm256_extractf128_si256(b, 0); \
    __m128i ub = _mm256_extractf128_si256(b, 1); \
    return _mm256_set_m128i(OP(ua, ub), \
                            OP(la, lb)); \
}
SSE_WRAPPER_I(vm_int_cmplt_avx, _mm_cmplt_epi32)

static const v8si theSSETrue_avx = vm_splats_avx(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits_avx(const v8si &a)
{
    return _mm256_movemask_ps(
            V8SF(_mm256_cmpeq_epi32(a, theSSETrue_avx))) == 0xFF;
}
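
// Illustrative use (hypothetical, not part of the original header): the
// comparison wrappers below return a per-lane mask of all-ones/all-zeros,
// and vm_allbits_avx() reports whether every lane passed.
//
//     v8si mask = VM_CMPLT_AVX(a, b);    // per-lane a[i] < b[i]
//     if (vm_allbits_avx(mask))
//         ...;                           // true for all eight lanes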
#define VM_EXTRACT_AVX  vm_extract_avx
#define VM_INSERT_AVX   vm_insert_avx
#define VM_SPLATS_AVX   vm_splats_avx
#define VM_LOAD_AVX     vm_load_avx
#define VM_STORE_AVX    vm_store_avx
#define VM_ZERO_AVX     vm_zero_avx
#define VM_IZERO_AVX    vm_izero_avx

#define VM_CMPLT_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_LT_OQ))
#define VM_CMPLE_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_LE_OQ))
#define VM_CMPGT_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_GT_OQ))
#define VM_CMPGE_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_GE_OQ))
#define VM_CMPEQ_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_EQ_OQ))
#define VM_CMPNE_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_NEQ_OQ))

#define VM_ICMPLT_AVX   vm_int_cmplt_avx
#define VM_ICMPGT_AVX   _mm256_cmpgt_epi32
#define VM_ICMPEQ_AVX   _mm256_cmpeq_epi32

#define VM_IADD_AVX     _mm256_add_epi32
#define VM_ISUB_AVX     _mm256_sub_epi32
#define VM_IMUL_AVX     _mm256_mullo_epi32

#define VM_ADD_AVX      _mm256_add_ps
#define VM_SUB_AVX      _mm256_sub_ps
#define VM_MUL_AVX      _mm256_mul_ps
#define VM_DIV_AVX      _mm256_div_ps
#define VM_SQRT_AVX     _mm256_sqrt_ps
#define VM_ISQRT_AVX    _mm256_rsqrt_ps
#define VM_INVERT_AVX   _mm256_rcp_ps
#define VM_ABS_AVX      vm_abs_avx

#define VM_FDIV_AVX     vm_fdiv_avx
#define VM_NEG_AVX      vm_negate_avx
#define VM_FSQRT_AVX    vm_fsqrt_avx
#define VM_MADD_AVX     vm_madd_avx

#define VM_MIN_AVX      _mm256_min_ps
#define VM_MAX_AVX      _mm256_max_ps

#define VM_AND_AVX      _mm256_and_si256
#define VM_ANDNOT_AVX   _mm256_andnot_si256
#define VM_OR_AVX       _mm256_or_si256
#define VM_XOR_AVX      _mm256_xor_si256

#define VM_ALLBITS_AVX  vm_allbits_avx

#define VM_SHUFFLE_AVX  vm_shuffle_avx

// MXCSR rounding-control field (bits 13-14).
#define VM_SSE_ROUND_MASK_AVX   0x6000
#define VM_SSE_ROUND_ZERO_AVX   0x6000
#define VM_SSE_ROUND_UP_AVX     0x4000
#define VM_SSE_ROUND_DOWN_AVX   0x2000
#define VM_SSE_ROUND_NEAR_AVX   0x0000

#define GETROUND_AVX()  (_mm_getcsr()&VM_SSE_ROUND_MASK_AVX)
#define SETROUND_AVX(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK_AVX)))

// The P_FLOOR/E_FLOOR macros bracket a run of floor conversions: save the
// current rounding mode, switch to round-down, and restore it afterwards.
#define VM_P_FLOOR_AVX()    uint rounding = GETROUND_AVX(); \
                            SETROUND_AVX(VM_SSE_ROUND_DOWN_AVX);
#define VM_FLOOR_AVX        _mm256_cvtps_epi32
#define VM_INT_AVX          _mm256_cvttps_epi32
#define VM_E_FLOOR_AVX()    SETROUND_AVX(rounding);

#define VM_IFLOAT_AVX   _mm256_cvtepi32_ps

// Integer shifts by a scalar count (the count sits in the low dword of an
// XMM register for _mm256_sll/srl_epi32).
#define VM_SHIFTLEFT_AVX(A,C)   _mm256_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
#define VM_SHIFTRIGHT_AVX(A,C)  _mm256_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
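
// Illustrative use of the wrapper layer (hypothetical helper, not part of
// the original header): an 8-wide multiply-add, and a floor conversion
// bracketed by the rounding-mode guards.
//
//     static inline void
//     madd8(float d[8], const float a[8], const float b[8], const float c[8])
//     {
//         v8sf r = VM_MADD_AVX(VM_LOAD_AVX(a), VM_LOAD_AVX(b), VM_LOAD_AVX(c));
//         VM_STORE_AVX(d, r);
//     }
//
//     VM_P_FLOOR_AVX();                   // switch MXCSR to round-down
//     v8si fl = VM_FLOOR_AVX(vals);       // vals is a v8sf
//     VM_E_FLOOR_AVX();                   // restore the previous mode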
#define _PI32AVX_CONST(Name, Val) \
    static const SYS_ALIGN(32) int _pi32avx_##Name[4] = \
        { Val, Val, Val, Val }

#define _PS256_CONST(Name, Val) \
    static const SYS_ALIGN(32) float _ps256_##Name[8] = \
        { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
    static const SYS_ALIGN(32) Type _ps256_##Name[8] = \
        { Val, Val, Val, Val, Val, Val, Val, Val }
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
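// NOTE: the remaining coefficient tables referenced by vm_sincos_avx()
// below were lost in extraction.  The values here are the standard
// Cephes-derived constants used by the well-known avx_mathfun sin/cos
// routines; they are reproduced as an assumption and should be verified
// against the original header.
_PS256_CONST(minus_cephes_DP1, -0.78515625);
_PS256_CONST(sincof_p0, -1.9515295891E-4);
_PS256_CONST(sincof_p1,  8.3321608736E-3);
_PS256_CONST(sincof_p2, -1.6666654611E-1);
_PS256_CONST(coscof_p0,  2.443315711809948E-005);
_PS256_CONST(coscof_p1, -1.388731625493765E-003);
_PS256_CONST(coscof_p2,  4.166664568298827E-002);
_PS256_CONST(cephes_FOPI, 1.27323954473516);    // 4 / Pi
_PS256_CONST(0p5, 0.5f);
_PS256_CONST(1, 1.0f);
_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32AVX_CONST(1, 1);
_PI32AVX_CONST(inv1, ~1);
_PI32AVX_CONST(2, 2);
_PI32AVX_CONST(4, 4);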
#undef _PI32AVX_CONST
#undef _PS256_CONST
#undef _PS256_CONST_TYPE
typedef union imm_xmm_union {
    v8si    imm;
    __m128i xmm[2];
} imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
    SYS_ALIGN(32) imm_xmm_union u; \
    u.imm = imm_; \
    xmm0_ = u.xmm[0]; \
    xmm1_ = u.xmm[1]; \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
    SYS_ALIGN(32) imm_xmm_union u; \
    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
}
// Vectorized sin/cos over 8 floats, following the Cephes-style evaluation
// used by the avx_mathfun routines; the integer steps run on the two
// 128-bit halves so only AVX1 is required.
static SYS_FORCE_INLINE void
vm_sincos_avx(v8sf x, v8sf *s, v8sf *c)
{
    v8sf xmm1, xmm2, xmm3, sign_bit_sin, y;
    v8si imm0, imm2, imm4;

    __m128i imm0_1, imm0_2;
    __m128i imm2_1, imm2_2;
    __m128i imm4_1, imm4_2;

    sign_bit_sin = x;
    // take the absolute value
    x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
    // extract the sign bit (upper one)
    sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

    // scale by 4/Pi
    y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

    // store the integer part of y; the integer work is done on the halves
    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

    // j = (j+1) & ~1  (see the Cephes sources)
    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
    y = _mm256_cvtepi32_ps(imm2);

    imm4_1 = imm2_1;
    imm4_2 = imm2_2;

    imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
    imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);

    imm0_1 = _mm_slli_epi32(imm0_1, 29);
    imm0_2 = _mm_slli_epi32(imm0_2, 29);

    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);

    v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
    v8sf poly_mask = _mm256_castsi256_ps(imm2);

    // The magic pass: "Extended precision modular arithmetic"
    // x = ((x - y * DP1) - y * DP2) - y * DP3;
    xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
    xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
    xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
    xmm1 = _mm256_mul_ps(y, xmm1);
    xmm2 = _mm256_mul_ps(y, xmm2);
    xmm3 = _mm256_mul_ps(y, xmm3);
    x = _mm256_add_ps(x, xmm1);
    x = _mm256_add_ps(x, xmm2);
    x = _mm256_add_ps(x, xmm3);

    imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i*)_pi32avx_2);
    imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i*)_pi32avx_2);

    imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i*)_pi32avx_4);
    imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i*)_pi32avx_4);

    imm4_1 = _mm_slli_epi32(imm4_1, 29);
    imm4_2 = _mm_slli_epi32(imm4_2, 29);

    COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);

    v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

    sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    // Evaluate the first polynomial (0 <= x <= Pi/4)
    v8sf z = _mm256_mul_ps(x, x);
    y = *(v8sf*)_ps256_coscof_p0;

    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
    y = _mm256_mul_ps(y, z);
    y = _mm256_mul_ps(y, z);
    v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
    y = _mm256_sub_ps(y, tmp);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

    // Evaluate the second polynomial (Pi/4 <= x <= 0)
    v8sf y2 = *(v8sf*)_ps256_sincof_p0;
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_mul_ps(y2, x);
    y2 = _mm256_add_ps(y2, x);

    // Select the correct result from the two polynomials
    xmm3 = poly_mask;
    v8sf ysin2 = _mm256_and_ps(xmm3, y2);
    v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
    y2 = _mm256_sub_ps(y2, ysin2);
    y = _mm256_sub_ps(y, ysin1);

    xmm1 = _mm256_add_ps(ysin1, ysin2);
    xmm2 = _mm256_add_ps(y, y2);

    // Update the signs
    *s = _mm256_xor_ps(xmm1, sign_bit_sin);
    *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
#undef COPY_IMM_TO_XMM
#undef COPY_XMM_TO_IMM
static SYS_FORCE_INLINE v8sf
vm_sin_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return s;
}

static SYS_FORCE_INLINE v8sf
vm_cos_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return c;
}

static SYS_FORCE_INLINE v8sf
vm_tan_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return _mm256_div_ps(s, c);
}
#define VM_SINCOS_AVX   vm_sincos_avx
#define VM_SIN_AVX      vm_sin_avx
#define VM_COS_AVX      vm_cos_avx
#define VM_TAN_AVX      vm_tan_avx
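
// Illustrative use of the trig wrappers (hypothetical caller, not part of
// the original header):
//
//     v8sf angle = VM_LOAD_AVX(theta);    // theta is float[8]
//     v8sf s, c;
//     VM_SINCOS_AVX(angle, &s, &c);       // sine and cosine of all 8 lanes
//     VM_STORE_AVX(sines, s);
//     VM_STORE_AVX(cosines, c);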
#endif // __VM_AVXFunc__