/*
 * PROPRIETARY INFORMATION. This software is proprietary to
 * Side Effects Software Inc., and is not to be reproduced,
 * transmitted, or disclosed in any way without written permission.
 *
 * NAME:     VM_AVXFunc.h ( VM Library, C++)
 *
 * COMMENTS:
 */

#ifndef __VM_AVXFunc__
#define __VM_AVXFunc__

#include "VM_API.h"
#include <SYS/SYS_Align.h>
#include <SYS/SYS_Types.h>

#define CPU_HAS_AVX_INSTR 1
#define VM_AVX_STYLE 1

#include <immintrin.h>
typedef __m256 v8sf;
typedef __m256i v8si;

// Plain casting (no conversion)
// MSVC has problems casting between __m256 and __m256i, so we implement a
// custom casting routine specifically for Windows.

#if defined(_MSC_VER)

static SYS_FORCE_INLINE v8sf
vm_v8sf(const v8si &a)
{
    union {
        v8si ival;
        v8sf fval;
    };
    ival = a;
    return fval;
}

static SYS_FORCE_INLINE v8si
vm_v8si(const v8sf &a)
{
    union {
        v8si ival;
        v8sf fval;
    };
    fval = a;
    return ival;
}

#define V8SF(A) vm_v8sf(A)
#define V8SI(A) vm_v8si(A)

#else

#define V8SF(A) (v8sf)A
#define V8SI(A) (v8si)A

// Intrinsic missing in gcc/clang
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)

#endif

#define VM_SHUFFLE_MASK_AVX(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))

template <int mask>
static SYS_FORCE_INLINE v8sf
vm_shuffle_avx(const v8sf &a, const v8sf &b)
{
    return _mm256_shuffle_ps(a, b, mask);
}

template <int mask>
static SYS_FORCE_INLINE v8si
vm_shuffle_avx(const v8si &a, const v8si &b)
{
    return V8SI(_mm256_shuffle_ps(V8SF(a), V8SF(b), mask));
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a, const T &b)
{
    return vm_shuffle_avx<VM_SHUFFLE_MASK_AVX(A,B,C,D)>(a, b);
}

template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<A,B,C,D>(a, a);
}

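// Illustrative note (editor's sketch, not part of the original header):
// _mm256_shuffle_ps applies the same 4-lane selection pattern to each 128-bit
// half, so vm_shuffle_avx<A,B,C,D>(a, b) yields, per half, { a[A], a[B], b[C], b[D] }.
// For example, with hypothetical v8sf values lo and hi:
//
//     v8sf r = vm_shuffle_avx<0,2,1,3>(lo, hi);
//     // r = { lo[0], lo[2], hi[1], hi[3], lo[4], lo[6], hi[5], hi[7] }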

// The _mm256_insert_epi32 intrinsic is missing in VS2015
#if defined(_MSC_VER)
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}
#else
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    switch (n)
    {
        case 0 : return _mm256_insert_epi32(v, a, 0);
        case 1 : return _mm256_insert_epi32(v, a, 1);
        case 2 : return _mm256_insert_epi32(v, a, 2);
        case 3 : return _mm256_insert_epi32(v, a, 3);
        case 4 : return _mm256_insert_epi32(v, a, 4);
        case 5 : return _mm256_insert_epi32(v, a, 5);
        case 6 : return _mm256_insert_epi32(v, a, 6);
        case 7 : return _mm256_insert_epi32(v, a, 7);
    }
    return v;
}
#endif

static SYS_FORCE_INLINE v8sf
vm_insert_avx(const v8sf v, float a, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}

// The _mm256_extract_epi32 intrinsic is missing in VS2015
#if defined(_MSC_VER)
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    return comp[n];
}
#else
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    switch (n)
    {
        case 0 : return _mm256_extract_epi32(v, 0);
        case 1 : return _mm256_extract_epi32(v, 1);
        case 2 : return _mm256_extract_epi32(v, 2);
        case 3 : return _mm256_extract_epi32(v, 3);
        case 4 : return _mm256_extract_epi32(v, 4);
        case 5 : return _mm256_extract_epi32(v, 5);
        case 6 : return _mm256_extract_epi32(v, 6);
        case 7 : return _mm256_extract_epi32(v, 7);
    }
    return 0;
}
#endif

static SYS_FORCE_INLINE float
vm_extract_avx(const v8sf v, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    return comp[n];
}

static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a)
{
    return _mm256_set1_ps(a);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(uint32 a)
{
    SYS_FPRealUnionF tmp;
    tmp.uval = a;
    return V8SI(vm_splats_avx(tmp.fval));
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a)
{
    return _mm256_set1_epi32(a);
}

static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a0, float a1, float a2, float a3,
              float a4, float a5, float a6, float a7)
{
    return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(uint a0, uint a1, uint a2, uint a3,
              uint a4, uint a5, uint a6, uint a7)
{
    return _mm256_set_epi32((int32)a7, (int32)a6, (int32)a5, (int32)a4,
                            (int32)a3, (int32)a2, (int32)a1, (int32)a0);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a0, int32 a1, int32 a2, int32 a3,
              int32 a4, int32 a5, int32 a6, int32 a7)
{
    return _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
}

static SYS_FORCE_INLINE v8si
vm_load_avx(const int32 v[8])
{
    return _mm256_loadu_si256((v8si *) v);
}

static SYS_FORCE_INLINE v8sf
vm_load_avx(const float v[8])
{
    return _mm256_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store_avx(int32 dst[8], v8si value)
{
    _mm256_storeu_si256((__m256i*) dst, value);
}

static SYS_FORCE_INLINE void
vm_store_avx(float dst[8], v8sf value)
{
    _mm256_storeu_ps(dst, value);
}

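// Illustrative round trip (editor's sketch with a hypothetical buffer, not
// part of the original header). The loads and stores are unaligned, so any
// float[8] / int32[8] array works:
//
//     float buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
//     v8sf  v = vm_load_avx(buf);
//     vm_store_avx(buf, _mm256_add_ps(v, v));   // buf now holds 0, 2, 4, ..., 14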
static SYS_FORCE_INLINE v8si
vm_izero_avx()
{
    return _mm256_setzero_si256();
}

static SYS_FORCE_INLINE v8sf
vm_zero_avx()
{
    return _mm256_setzero_ps();
}

static SYS_FORCE_INLINE v8sf
vm_negate_avx(v8sf a)
{
    return _mm256_sub_ps(_mm256_setzero_ps(), a);
}

static SYS_FORCE_INLINE v8sf
vm_abs_avx(v8sf a)
{
    return _mm256_max_ps(a, vm_negate_avx(a));
}

static SYS_FORCE_INLINE v8sf
vm_fdiv_avx(v8sf a, v8sf b)
{
    return _mm256_mul_ps(a, _mm256_rcp_ps(b));
}

static SYS_FORCE_INLINE v8sf
vm_fsqrt_avx(v8sf a)
{
    return _mm256_rcp_ps(_mm256_rsqrt_ps(a));
}

static SYS_FORCE_INLINE v8sf
vm_madd_avx(v8sf a, v8sf b, v8sf c)
{
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}

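// Editor's note (not part of the original header): vm_fdiv_avx and
// vm_fsqrt_avx trade accuracy for speed. _mm256_rcp_ps and _mm256_rsqrt_ps
// are approximations (roughly 12 bits of precision), so prefer VM_DIV_AVX /
// VM_SQRT_AVX below when full single-precision accuracy matters. vm_madd_avx
// is a separate multiply and add, not a fused multiply-add.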
// Some integer instructions aren't available in AVX (256-bit integer ops
// need AVX2), so we perform them with SSE on the two 128-bit halves.
#define SSE_WRAPPER_I(NAME, OP) \
static SYS_FORCE_INLINE v8si \
NAME(v8si a, v8si b) \
{ \
    __m128i la = _mm256_extractf128_si256(a, 0); \
    __m128i ua = _mm256_extractf128_si256(a, 1); \
    __m128i lb = _mm256_extractf128_si256(b, 0); \
    __m128i ub = _mm256_extractf128_si256(b, 1); \
    return _mm256_set_m128i(OP(ua, ub), \
                            OP(la, lb)); \
}
SSE_WRAPPER_I(vm_int_cmplt_avx, _mm_cmplt_epi32)

static const v8si theSSETrue_avx = vm_splats_avx(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits_avx(const v8si &a)
{
    return _mm256_movemask_ps(V8SF(_mm256_cmpeq_epi32(a, theSSETrue_avx))) == 0xFF;
}


#define VM_EXTRACT_AVX  vm_extract_avx
#define VM_INSERT_AVX   vm_insert_avx
#define VM_SPLATS_AVX   vm_splats_avx
#define VM_LOAD_AVX     vm_load_avx
#define VM_STORE_AVX    vm_store_avx
#define VM_ZERO_AVX     vm_zero_avx
#define VM_IZERO_AVX    vm_izero_avx

#define VM_CMPLT_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_LT_OQ))
#define VM_CMPLE_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_LE_OQ))
#define VM_CMPGT_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_GT_OQ))
#define VM_CMPGE_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_GE_OQ))
#define VM_CMPEQ_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_EQ_OQ))
#define VM_CMPNE_AVX(A,B)   V8SI(_mm256_cmp_ps(A,B,_CMP_NEQ_OQ))

#define VM_ICMPLT_AVX   vm_int_cmplt_avx
#define VM_ICMPGT_AVX   _mm256_cmpgt_epi32
#define VM_ICMPEQ_AVX   _mm256_cmpeq_epi32

#define VM_IADD_AVX     _mm256_add_epi32
#define VM_ISUB_AVX     _mm256_sub_epi32
#define VM_IMUL_AVX     _mm256_mullo_epi32

#define VM_ADD_AVX      _mm256_add_ps
#define VM_SUB_AVX      _mm256_sub_ps
#define VM_MUL_AVX      _mm256_mul_ps
#define VM_DIV_AVX      _mm256_div_ps
#define VM_SQRT_AVX     _mm256_sqrt_ps
#define VM_ISQRT_AVX    _mm256_rsqrt_ps
#define VM_INVERT_AVX   _mm256_rcp_ps
#define VM_ABS_AVX      vm_abs_avx

#define VM_FDIV_AVX     vm_fdiv_avx
#define VM_NEG_AVX      vm_negate_avx
#define VM_FSQRT_AVX    vm_fsqrt_avx
#define VM_MADD_AVX     vm_madd_avx

#define VM_MIN_AVX      _mm256_min_ps
#define VM_MAX_AVX      _mm256_max_ps

#define VM_AND_AVX      _mm256_and_si256
#define VM_ANDNOT_AVX   _mm256_andnot_si256
#define VM_OR_AVX       _mm256_or_si256
#define VM_XOR_AVX      _mm256_xor_si256

#define VM_ALLBITS_AVX  vm_allbits_avx

#define VM_SHUFFLE_AVX  vm_shuffle_avx
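
// Illustrative usage (editor's sketch, not part of the original header):
// the comparison macros return a v8si lane mask (all ones where the test
// passed), which pairs with VM_ALLBITS_AVX to test every lane at once.
//
//     v8sf a  = vm_splats_avx(1.0f);
//     v8sf b  = vm_splats_avx(2.0f);
//     v8si lt = VM_CMPLT_AVX(a, b);
//     if (VM_ALLBITS_AVX(lt)) { /* every lane of a is < b */ }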

// Float to integer conversions
#define VM_SSE_ROUND_MASK_AVX   0x6000
#define VM_SSE_ROUND_ZERO_AVX   0x6000
#define VM_SSE_ROUND_UP_AVX     0x4000
#define VM_SSE_ROUND_DOWN_AVX   0x2000
#define VM_SSE_ROUND_NEAR_AVX   0x0000

#define GETROUND_AVX()  (_mm_getcsr()&VM_SSE_ROUND_MASK_AVX)
#define SETROUND_AVX(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK_AVX)))

// The P functions must be invoked before FLOOR, and the E functions invoked
// afterwards to reset the rounding state.

#define VM_P_FLOOR_AVX()    uint rounding = GETROUND_AVX(); \
                            SETROUND_AVX(VM_SSE_ROUND_DOWN_AVX);
#define VM_FLOOR_AVX        _mm256_cvtps_epi32
#define VM_INT_AVX          _mm256_cvttps_epi32
#define VM_E_FLOOR_AVX()    SETROUND_AVX(rounding);
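
// Illustrative usage (editor's sketch, not part of the original header):
// VM_P_FLOOR_AVX switches the MXCSR rounding mode to round-down so that
// VM_FLOOR_AVX truly floors, and VM_E_FLOOR_AVX restores the previous mode.
//
//     VM_P_FLOOR_AVX();
//     v8si f = VM_FLOOR_AVX(vm_splats_avx(-1.5f));   // every lane becomes -2
//     VM_E_FLOOR_AVX();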

// Integer to float conversion
#define VM_IFLOAT_AVX   _mm256_cvtepi32_ps

// Bit shifting: A = v8si, C = int shift count applied to every lane
#define VM_SHIFTLEFT_AVX(A,C)   _mm256_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
#define VM_SHIFTRIGHT_AVX(A,C)  _mm256_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
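// Illustrative usage (editor's sketch, not part of the original header):
//     v8si x = vm_splats_avx(3);
//     v8si y = VM_SHIFTLEFT_AVX(x, 2);   // every lane becomes 3 << 2 == 12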

//
// AVX trig sourced from...
// http://software-lisc.fbk.eu/avx_mathfun/avx_mathfun.h
//
static SYS_FORCE_INLINE void
vm_sincos_avx(v8sf x, v8sf *s, v8sf *c)
{

// AVX implementation of sincos
//
// Based on "sse_mathfun.h", by Julien Pommier
// http://gruntthepeon.free.fr/ssemath/
//
// Copyright (C) 2012 Giovanni Garberoglio
// Interdisciplinary Laboratory for Computational Science (LISC)
// Fondazione Bruno Kessler and University of Trento
// via Sommarive, 18
// I-38123 Trento (Italy)
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgment in the product documentation would be
//    appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
//
// (this is the zlib license)

#define _PI32AVX_CONST(Name, Val) \
    static const SYS_ALIGN(32) int _pi32avx_##Name[4] = \
        { Val, Val, Val, Val }

    _PI32AVX_CONST(1, 1);
    _PI32AVX_CONST(inv1, ~1);
    _PI32AVX_CONST(2, 2);
    _PI32AVX_CONST(4, 4);

    // declare some AVX constants -- why can't I figure a better way to do that?
#define _PS256_CONST(Name, Val) \
    static const SYS_ALIGN(32) float _ps256_##Name[8] = \
        { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
    static const SYS_ALIGN(32) Type _ps256_##Name[8] = \
        { Val, Val, Val, Val, Val, Val, Val, Val }

    _PS256_CONST(1 , 1.0f);
    _PS256_CONST(0p5, 0.5f);

    _PS256_CONST_TYPE(sign_mask, uint32, 0x80000000);
    _PS256_CONST_TYPE(inv_sign_mask, uint32, ~0x80000000);

    _PS256_CONST(minus_cephes_DP1, -0.78515625);
    _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    _PS256_CONST(sincof_p0, -1.9515295891E-4);
    _PS256_CONST(sincof_p1, 8.3321608736E-3);
    _PS256_CONST(sincof_p2, -1.6666654611E-1);
    _PS256_CONST(coscof_p0, 2.443315711809948E-005);
    _PS256_CONST(coscof_p1, -1.388731625493765E-003);
    _PS256_CONST(coscof_p2, 4.166664568298827E-002);
    _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

#undef _PI32AVX_CONST
#undef _PS256_CONST
#undef _PS256_CONST_TYPE

    typedef union imm_xmm_union {
        v8si imm;
        __m128i xmm[2];
    } imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
        SYS_ALIGN(32) imm_xmm_union u; \
        u.imm = imm_; \
        xmm0_ = u.xmm[0]; \
        xmm1_ = u.xmm[1]; \
    }

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
        SYS_ALIGN(32) imm_xmm_union u; \
        u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
    }

    v8sf xmm1, xmm2, xmm3, sign_bit_sin, y;
    v8si imm0, imm2, imm4;

    __m128i imm0_1, imm0_2;
    __m128i imm2_1, imm2_2;
    __m128i imm4_1, imm4_2;

    sign_bit_sin = x;
    // take the absolute value
    x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
    // extract the sign bit (upper one)
    sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

    // scale by 4/Pi
    y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

    // we use SSE2 routines to perform the integer ops
    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
    y = _mm256_cvtepi32_ps(imm2);

    imm4_1 = imm2_1;
    imm4_2 = imm2_2;

    imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
    imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);

    imm0_1 = _mm_slli_epi32(imm0_1, 29);
    imm0_2 = _mm_slli_epi32(imm0_2, 29);

    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);

    v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
    v8sf poly_mask = _mm256_castsi256_ps(imm2);

    // The magic pass: "Extended precision modular arithmetic"
    // x = ((x - y * DP1) - y * DP2) - y * DP3;
    xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
    xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
    xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
    xmm1 = _mm256_mul_ps(y, xmm1);
    xmm2 = _mm256_mul_ps(y, xmm2);
    xmm3 = _mm256_mul_ps(y, xmm3);
    x = _mm256_add_ps(x, xmm1);
    x = _mm256_add_ps(x, xmm2);
    x = _mm256_add_ps(x, xmm3);

    imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i*)_pi32avx_2);
    imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i*)_pi32avx_2);

    imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i*)_pi32avx_4);
    imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i*)_pi32avx_4);

    imm4_1 = _mm_slli_epi32(imm4_1, 29);
    imm4_2 = _mm_slli_epi32(imm4_2, 29);

    COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);

    v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

    sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    // Evaluate the first polynomial (0 <= x <= Pi/4)
    v8sf z = _mm256_mul_ps(x, x);
    y = *(v8sf*)_ps256_coscof_p0;

    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
    y = _mm256_mul_ps(y, z);
    y = _mm256_mul_ps(y, z);
    v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
    y = _mm256_sub_ps(y, tmp);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

    // Evaluate the second polynomial (Pi/4 <= x <= 0)
    v8sf y2 = *(v8sf*)_ps256_sincof_p0;
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_mul_ps(y2, x);
    y2 = _mm256_add_ps(y2, x);

    // select the correct result from the two polynomials
    xmm3 = poly_mask;
    v8sf ysin2 = _mm256_and_ps(xmm3, y2);
    v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
    y2 = _mm256_sub_ps(y2, ysin2);
    y = _mm256_sub_ps(y, ysin1);

    xmm1 = _mm256_add_ps(ysin1, ysin2);
    xmm2 = _mm256_add_ps(y, y2);

    // update the sign
    *s = _mm256_xor_ps(xmm1, sign_bit_sin);
    *c = _mm256_xor_ps(xmm2, sign_bit_cos);

#undef COPY_IMM_TO_XMM
#undef COPY_XMM_TO_IMM
}

static SYS_FORCE_INLINE v8sf
vm_sin_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return s;
}

static SYS_FORCE_INLINE v8sf
vm_cos_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return c;
}

static SYS_FORCE_INLINE v8sf
vm_tan_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return _mm256_div_ps(s, c);
}

#define VM_SINCOS_AVX   vm_sincos_avx
#define VM_SIN_AVX      vm_sin_avx
#define VM_COS_AVX      vm_cos_avx
#define VM_TAN_AVX      vm_tan_avx
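
// Illustrative usage (editor's sketch, not part of the original header):
// sine and cosine for all eight lanes are computed in a single pass.
//
//     v8sf angles = vm_splats_avx(0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f);
//     v8sf s, c;
//     vm_sincos_avx(angles, &s, &c);   // s[i] = sin(angles[i]), c[i] = cos(angles[i])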

#endif