HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
VM_SSEFunc.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: VM_SSEFunc.h ( VM Library, C++)
7  *
8  * COMMENTS:
9  */
10 
11 #ifndef __VM_SSEFunc__
12 #define __VM_SSEFunc__
13 
14 #include "VM_API.h"
15 #include <SYS/SYS_Align.h>
16 #include <SYS/SYS_Inline.h>
17 #include <SYS/SYS_Types.h>
18 
19 #if defined(_MSC_VER)
20  #pragma warning(push)
21  #pragma warning(disable:4799)
22 #endif
23 
24 #define CPU_HAS_SIMD_INSTR 1
25 #define VM_SSE_STYLE 1
26 
27 #if !defined(ARM64)
28 #include <emmintrin.h>
29 #else
30 #include <sse2neon.h>
31 #endif
32 
33 typedef __m128 v4sf;
34 typedef __m128i v4si;
35 
// NOTE: __SSE4_1__ is NOT a predefined macro on Visual C++
37 #if defined(__SSE4_1__) || defined(_MSC_VER)
38 #define VM_SSE41_STYLE 1
39 #if !defined(ARM64)
40 #include <smmintrin.h>
41 #endif
42 #endif
43 
44 #if defined(_MSC_VER)
45  #pragma warning(pop)
46 #endif
47 
48 // Plain casting (no conversion)
49 #define V4SF(A) _mm_castsi128_ps(A)
50 #define V4SI(A) _mm_castps_si128(A)
51 
52 #define VM_SHUFFLE_MASK(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))
53 
54 template <int mask>
56 vm_shuffle(const v4sf &a, const v4sf &b)
57 {
58  return _mm_shuffle_ps(a, b, mask);
59 }
60 
61 template <int mask>
63 vm_shuffle(const v4si &a, const v4si &b)
64 {
65  return V4SI(_mm_shuffle_ps(V4SF(a), V4SF(b), mask));
66 }
67 
68 template <int A, int B, int C, int D, typename T>
69 static SYS_FORCE_INLINE T
70 vm_shuffle(const T &a, const T &b)
71 {
72  return vm_shuffle<VM_SHUFFLE_MASK(A,B,C,D)>(a, b);
73 }
74 
75 template <int mask, typename T>
76 static SYS_FORCE_INLINE T
77 vm_shuffle(const T &a)
78 {
79  return vm_shuffle<mask>(a, a);
80 }
81 
82 template <int A, int B, int C, int D, typename T>
83 static SYS_FORCE_INLINE T
84 vm_shuffle(const T &a)
85 {
86  return vm_shuffle<A,B,C,D>(a, a);
87 }
88 
89 #if defined(VM_SSE41_STYLE)
90 
92 vm_insert(const v4si v, int32 a, int n)
93 {
94  switch (n)
95  {
96  case 0: return _mm_insert_epi32(v, a, 0);
97  case 1: return _mm_insert_epi32(v, a, 1);
98  case 2: return _mm_insert_epi32(v, a, 2);
99  case 3: return _mm_insert_epi32(v, a, 3);
100  }
101  return v;
102 }
103 
// Return a copy of v with float lane n replaced by a.
// _mm_insert_ps needs an immediate control byte, so the runtime lane
// index is dispatched through a switch (_MM_MK_INSERTPS_NDX(src, dst, zero)
// takes the new value from lane 0 of _mm_set_ss(a)).
static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
 switch (n)
 {
 case 0: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,0,0));
 case 1: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,1,0));
 case 2: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,2,0));
 case 3: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,3,0));
 }
 // Out-of-range lane index: return the input unchanged.
 return v;
}
116 
// Return 32-bit lane n of v.  _mm_extract_epi32 requires an immediate
// lane index, hence the switch; out-of-range n yields 0.
static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
 switch (n)
 {
 case 0: return _mm_extract_epi32(v, 0);
 case 1: return _mm_extract_epi32(v, 1);
 case 2: return _mm_extract_epi32(v, 2);
 case 3: return _mm_extract_epi32(v, 3);
 }
 return 0;
}
129 
130 static SYS_FORCE_INLINE float
131 vm_extract(const v4sf v, int n)
132 {
133  SYS_FPRealUnionF tmp;
134  switch (n)
135  {
136  case 0: tmp.ival = _mm_extract_ps(v, 0); break;
137  case 1: tmp.ival = _mm_extract_ps(v, 1); break;
138  case 2: tmp.ival = _mm_extract_ps(v, 2); break;
139  case 3: tmp.ival = _mm_extract_ps(v, 3); break;
140  }
141  return tmp.fval;
142 }
143 
144 #else
145 
146 static SYS_FORCE_INLINE v4si
147 vm_insert(const v4si v, int32 a, int n)
148 {
149  union { v4si vector; int32 comp[4]; };
150  vector = v;
151  comp[n] = a;
152  return vector;
153 }
154 
155 static SYS_FORCE_INLINE v4sf
156 vm_insert(const v4sf v, float a, int n)
157 {
158  union { v4sf vector; float comp[4]; };
159  vector = v;
160  comp[n] = a;
161  return vector;
162 }
163 
164 static SYS_FORCE_INLINE int
165 vm_extract(const v4si v, int n)
166 {
167  union { v4si vector; int32 comp[4]; };
168  vector = v;
169  return comp[n];
170 }
171 
172 static SYS_FORCE_INLINE float
173 vm_extract(const v4sf v, int n)
174 {
175  union { v4sf vector; float comp[4]; };
176  vector = v;
177  return comp[n];
178 }
179 
180 #endif
181 
// Broadcast the scalar a into all four float lanes.
static SYS_FORCE_INLINE v4sf
vm_splats(float a)
{
 return _mm_set1_ps(a);
}
187 
188 static SYS_FORCE_INLINE v4si
189 vm_splats(uint32 a)
190 {
191  SYS_FPRealUnionF tmp;
192  tmp.uval = a;
193  return V4SI(vm_splats(tmp.fval));
194 }
195 
196 static SYS_FORCE_INLINE v4si
197 vm_splats(int32 a)
198 {
199  SYS_FPRealUnionF tmp;
200  tmp.ival = a;
201  return V4SI(vm_splats(tmp.fval));
202 }
203 
204 static SYS_FORCE_INLINE v4sf
205 vm_splats(float a, float b, float c, float d)
206 {
207  return vm_shuffle<0,2,0,2>(
208  vm_shuffle<0>(_mm_set_ss(a), _mm_set_ss(b)),
209  vm_shuffle<0>(_mm_set_ss(c), _mm_set_ss(d)));
210 }
211 
212 static SYS_FORCE_INLINE v4si
213 vm_splats(uint32 a, uint32 b, uint32 c, uint32 d)
214 {
215  SYS_FPRealUnionF af, bf, cf, df;
216  af.uval = a;
217  bf.uval = b;
218  cf.uval = c;
219  df.uval = d;
220  return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
221 }
222 
223 static SYS_FORCE_INLINE v4si
224 vm_splats(int32 a, int32 b, int32 c, int32 d)
225 {
226  SYS_FPRealUnionF af, bf, cf, df;
227  af.ival = a;
228  bf.ival = b;
229  cf.ival = c;
230  df.ival = d;
231  return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
232 }
233 
234 static SYS_FORCE_INLINE v4si
235 vm_load(const int32 v[4])
236 {
237  return V4SI(_mm_loadu_ps((const float *)v));
238 }
239 
// Load four floats from a possibly-unaligned address.
static SYS_FORCE_INLINE v4sf
vm_load(const float v[4])
{
 return _mm_loadu_ps(v);
}
245 
// Store the four int32 lanes of value to a possibly-unaligned address.
static SYS_FORCE_INLINE void
vm_store(int32 dst[4], v4si value)
{
 _mm_storeu_si128((__m128i*) dst, value);
}
// Store the four float lanes of value to a possibly-unaligned address.
static SYS_FORCE_INLINE void
vm_store(float dst[4], v4sf value)
{
 _mm_storeu_ps(dst, value);
}
256 
// Per-lane negation computed as (0 - a).  Note this maps +0.0f to +0.0f,
// unlike a sign-bit xor, which would produce -0.0f.
static SYS_FORCE_INLINE v4sf
vm_negate(v4sf a)
{
 return _mm_sub_ps(_mm_setzero_ps(), a);
}
262 
// Per-lane absolute value, computed as max(a, -a).
static SYS_FORCE_INLINE v4sf
vm_abs(v4sf a)
{
 return _mm_max_ps(a, vm_negate(a));
}
268 
// Fast approximate divide: a * rcp(b).  _mm_rcp_ps carries only ~12 bits
// of precision; use VM_DIV (_mm_div_ps) when exact IEEE division matters.
static SYS_FORCE_INLINE v4sf
vm_fdiv(v4sf a, v4sf b)
{
 return _mm_mul_ps(a, _mm_rcp_ps(b));
}
274 
// Fast approximate square root: rcp(rsqrt(a)).  Cheaper but less precise
// than VM_SQRT (_mm_sqrt_ps).
static SYS_FORCE_INLINE v4sf
vm_fsqrt(v4sf a)
{
 return _mm_rcp_ps(_mm_rsqrt_ps(a));
}
280 
// Multiply-add: a*b + c.  Computed with separate mul and add (two
// roundings) — this is not a hardware fused FMA.
static SYS_FORCE_INLINE v4sf
vm_madd(v4sf a, v4sf b, v4sf c)
{
 return _mm_add_ps(_mm_mul_ps(a, b), c);
}
286 
287 static const v4si theSSETrue = vm_splats(0xFFFFFFFF);
288 
// True iff every 32-bit lane of a is the all-ones pattern 0xFFFFFFFF
// (i.e. a full SSE comparison-true mask): compare each lane against
// theSSETrue and require all four sign bits of the result to be set.
static SYS_FORCE_INLINE bool
vm_allbits(const v4si &a)
{
 return _mm_movemask_ps(V4SF(_mm_cmpeq_epi32(a, theSSETrue))) == 0xF;
}
294 
// Pack the top (sign) bit of each 32-bit lane into the low 4 bits of
// the returned int (lane 0 -> bit 0, ... lane 3 -> bit 3).
static SYS_FORCE_INLINE int
vm_signbits(const v4si &a)
{
 return _mm_movemask_ps(V4SF(a));
}
300 
301 
// Pack the sign bit of each float lane into the low 4 bits of the
// returned int (lane 0 -> bit 0, ... lane 3 -> bit 3).
static SYS_FORCE_INLINE int
vm_signbits(const v4sf &a)
{
 return _mm_movemask_ps(a);
}
307 
308 #define VM_EXTRACT vm_extract
309 #define VM_INSERT vm_insert
310 #define VM_SPLATS vm_splats
311 #define VM_LOAD vm_load
312 #define VM_STORE vm_store
313 
314 #define VM_CMPLT(A,B) V4SI(_mm_cmplt_ps(A,B))
315 #define VM_CMPLE(A,B) V4SI(_mm_cmple_ps(A,B))
316 #define VM_CMPGT(A,B) V4SI(_mm_cmpgt_ps(A,B))
317 #define VM_CMPGE(A,B) V4SI(_mm_cmpge_ps(A,B))
318 #define VM_CMPEQ(A,B) V4SI(_mm_cmpeq_ps(A,B))
319 #define VM_CMPNE(A,B) V4SI(_mm_cmpneq_ps(A,B))
320 
321 #define VM_ICMPLT _mm_cmplt_epi32
322 #define VM_ICMPGT _mm_cmpgt_epi32
323 #define VM_ICMPEQ _mm_cmpeq_epi32
324 
325 #define VM_IADD _mm_add_epi32
326 #define VM_ISUB _mm_sub_epi32
327 #if defined(VM_SSE41_STYLE)
328 #define VM_IMUL _mm_mullo_epi32
329 #endif
330 
331 #define VM_ADD _mm_add_ps
332 #define VM_SUB _mm_sub_ps
333 #define VM_MUL _mm_mul_ps
334 #define VM_DIV _mm_div_ps
335 #define VM_SQRT _mm_sqrt_ps
336 #define VM_ISQRT _mm_rsqrt_ps
337 #define VM_INVERT _mm_rcp_ps
338 #define VM_ABS vm_abs
339 
340 #define VM_FDIV vm_fdiv
341 #define VM_NEG vm_negate
342 #define VM_FSQRT vm_fsqrt
343 #define VM_MADD vm_madd
344 
345 #define VM_MIN _mm_min_ps
346 #define VM_MAX _mm_max_ps
347 
348 #define VM_AND _mm_and_si128
349 #define VM_ANDNOT _mm_andnot_si128
350 #define VM_OR _mm_or_si128
351 #define VM_XOR _mm_xor_si128
352 
353 #define VM_ALLBITS vm_allbits
354 #define VM_SIGNBITS vm_signbits
355 
356 #define VM_SHUFFLE vm_shuffle
357 
358 // Integer to float conversions
359 #if !defined(ARM64)
360 #define VM_SSE_ROUND_MASK 0x6000 // 0b110000000000000
361 #define VM_SSE_ROUND_ZERO 0x6000 // 0b110000000000000 (RZ)
362 #define VM_SSE_ROUND_UP 0x4000 // 0b100000000000000 (R+)
363 #define VM_SSE_ROUND_DOWN 0x2000 // 0b010000000000000 (R-)
364 #define VM_SSE_ROUND_NEAR 0x0000 // 0b000000000000000 (RN)
365 
366 #define GETROUND() (_mm_getcsr()&VM_SSE_ROUND_MASK)
367 #define SETROUND(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK)))
368 #else
369 #define VM_SSE_ROUND_ZERO _MM_ROUND_TOWARDS_ZERO
370 #define VM_SSE_ROUND_UP _MM_ROUND_UP
371 #define VM_SSE_ROUND_DOWN _MM_ROUND_DOWN
372 #define VM_SSE_ROUND_NEAR _MM_ROUND_NEAREST
373 
374 #define GETROUND() _MM_GET_ROUNDING_MODE()
375 #define SETROUND(x) _MM_SET_ROUNDING_MODE(x)
376 #endif
377 
378 // The P functions must be invoked before FLOOR, the E functions invoked
379 // afterwards to reset the state.
380 
381 #define VM_P_FLOOR() uint rounding = GETROUND(); \
382  SETROUND(VM_SSE_ROUND_DOWN);
383 #define VM_FLOOR _mm_cvtps_epi32
384 #define VM_INT _mm_cvttps_epi32
385 #define VM_E_FLOOR() SETROUND(rounding);
386 
387 // Float to integer conversion
388 #define VM_IFLOAT _mm_cvtepi32_ps
389 
390 // bitshifing A=v4si C=int
391 #define VM_SHIFTLEFT(A,C) _mm_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
392 #define VM_SHIFTRIGHT(A,C) _mm_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
393 
394 //
395 // SSE Trig sourced from...
396 // http://gruntthepeon.free.fr/ssemath/sse_mathfun.h
397 //
// Compute sine and cosine of each of the four packed floats in x,
// writing the results to *s and *c.  Adapted from Julien Pommier's
// sse_mathfun.h (cephes-derived): range-reduce each lane to [0, pi/4]
// by octant, then evaluate minimax polynomials for sin and cos and
// select/sign-correct per lane.
static SYS_FORCE_INLINE void
vm_sincos(v4sf x, v4sf *s, v4sf *c)
{
// Copyright (C) 2007 Julien Pommier
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgment in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
// (this is the zlib license)

// Helper macros declaring 16-byte-aligned, 4-wide constant tables;
// #undef'd again after use so they don't leak out of this function.
#define _PS_CONST(Name, Val) \
 static const SYS_ALIGN16 float _ps_##Name[4] = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
 static const SYS_ALIGN16 int _pi32_##Name[4] = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
 static const SYS_ALIGN16 Type _ps_##Name[4] = { Val, Val, Val, Val }

 _PS_CONST(1 , 1.0f);
 _PS_CONST(0p5, 0.5f);
 _PI32_CONST(1, 1);
 _PI32_CONST(inv1, ~1);
 _PI32_CONST(2, 2);
 _PI32_CONST(4, 4);

 _PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
 _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

 // -pi/4 split into three parts (DP1+DP2+DP3 ~= -pi/4) for
 // extended-precision range reduction below.
 _PS_CONST(minus_cephes_DP1, -0.78515625);
 _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
 _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
 // Minimax polynomial coefficients for sin and cos on [0, pi/4].
 _PS_CONST(sincof_p0, -1.9515295891E-4);
 _PS_CONST(sincof_p1, 8.3321608736E-3);
 _PS_CONST(sincof_p2, -1.6666654611E-1);
 _PS_CONST(coscof_p0, 2.443315711809948E-005);
 _PS_CONST(coscof_p1, -1.388731625493765E-003);
 _PS_CONST(coscof_p2, 4.166664568298827E-002);
 _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

#undef _PS_CONST
#undef _PI32_CONST
#undef _PS_CONST_TYPE

 v4sf xmm1, xmm2, xmm3, sign_bit_sin, y;
 v4si emm0, emm2, emm4;

 sign_bit_sin = x;
 // take the absolute value
 x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
 // extract the sign bit (upper one)
 sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

 // scale by 4/Pi
 y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

 // store the integer part of y in emm2 (the octant index j)
 emm2 = _mm_cvttps_epi32(y);

 // j=(j+1) & (~1) (see the cephes sources)
 emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
 y = _mm_cvtepi32_ps(emm2);

 emm4 = emm2;

 // get the swap sign flag for the sine: bit 2 of j, moved to bit 31
 emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
 emm0 = _mm_slli_epi32(emm0, 29);
 v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

 // get the polynom selection mask for the sine (all-ones where the
 // cosine polynomial should be used instead of the sine polynomial)
 emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
 emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 v4sf poly_mask = _mm_castsi128_ps(emm2);

 // The magic pass: "Extended precision modular arithmetic"
 // x = ((x - y * DP1) - y * DP2) - y * DP3;
 xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
 xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
 xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
 xmm1 = _mm_mul_ps(y, xmm1);
 xmm2 = _mm_mul_ps(y, xmm2);
 xmm3 = _mm_mul_ps(y, xmm3);
 x = _mm_add_ps(x, xmm1);
 x = _mm_add_ps(x, xmm2);
 x = _mm_add_ps(x, xmm3);

 // sign flip for the cosine, derived from bit 2 of (j - 2)
 emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
 emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
 emm4 = _mm_slli_epi32(emm4, 29);
 v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

 sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

 // Evaluate the first polynom (0 <= x <= Pi/4): cosine series in z = x^2
 v4sf z = _mm_mul_ps(x,x);
 y = *(v4sf*)_ps_coscof_p0;

 y = _mm_mul_ps(y, z);
 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
 y = _mm_mul_ps(y, z);
 y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
 y = _mm_mul_ps(y, z);
 y = _mm_mul_ps(y, z);
 v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
 y = _mm_sub_ps(y, tmp);
 y = _mm_add_ps(y, *(v4sf*)_ps_1);

 // Evaluate the second polynom (Pi/4 <= x <= 0): sine series, odd in x
 v4sf y2 = *(v4sf*)_ps_sincof_p0;
 y2 = _mm_mul_ps(y2, z);
 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
 y2 = _mm_mul_ps(y2, z);
 y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
 y2 = _mm_mul_ps(y2, z);
 y2 = _mm_mul_ps(y2, x);
 y2 = _mm_add_ps(y2, x);

 // select the correct result from the two polynoms, per lane
 xmm3 = poly_mask;
 v4sf ysin2 = _mm_and_ps(xmm3, y2);
 v4sf ysin1 = _mm_andnot_ps(xmm3, y);
 y2 = _mm_sub_ps(y2,ysin2);
 y = _mm_sub_ps(y, ysin1);

 xmm1 = _mm_add_ps(ysin1,ysin2);
 xmm2 = _mm_add_ps(y,y2);

 // update the sign
 *s = _mm_xor_ps(xmm1, sign_bit_sin);
 *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
538 
539 static SYS_FORCE_INLINE v4sf
540 vm_sin(v4sf x)
541 {
542  v4sf s,c;
543  vm_sincos(x,&s,&c);
544  return s;
545 }
546 
547 static SYS_FORCE_INLINE v4sf
548 vm_cos(v4sf x)
549 {
550  v4sf s,c;
551  vm_sincos(x,&s,&c);
552  return c;
553 }
554 
555 static SYS_FORCE_INLINE v4sf
556 vm_tan(v4sf x)
557 {
558  v4sf s,c;
559  vm_sincos(x,&s,&c);
560  return _mm_div_ps(s,c);
561 }
562 
563 #define VM_SINCOS vm_sincos
564 #define VM_SIN vm_sin
565 #define VM_COS vm_cos
566 #define VM_TAN vm_tan
567 
568 #endif
__m128i v4si
Definition: VM_SSEFunc.h:34
#define _PI32_CONST(Name, Val)
int int32
Definition: SYS_Types.h:39
const GLdouble * v
Definition: glcorearb.h:837
#define V4SF(A)
Definition: VM_SSEFunc.h:49
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:848
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
GLdouble s
Definition: glad.h:3009
GLint y
Definition: glcorearb.h:103
GLdouble n
Definition: glcorearb.h:2008
GLfloat f
Definition: glcorearb.h:1926
__m128 v4sf
Definition: VM_SSEFunc.h:33
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
GLint GLuint mask
Definition: glcorearb.h:124
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
GLint GLenum GLint x
Definition: glcorearb.h:409
GLenum GLenum dst
Definition: glcorearb.h:1793
#define V4SI(A)
Definition: VM_SSEFunc.h:50
unsigned int uint32
Definition: SYS_Types.h:40
Definition: core.h:1131
GLdouble GLdouble GLdouble y2
Definition: glad.h:2349
#define _PS_CONST(Name, Val)
#define _PS_CONST_TYPE(Name, Type, Val)