HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
VM_SSEFunc.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: VM_SSEFunc.h ( VM Library, C++)
7  *
8  * COMMENTS:
9  */
10 
11 #ifndef __VM_SSEFunc__
12 #define __VM_SSEFunc__
13 
14 #include "VM_API.h"
15 #include <SYS/SYS_Inline.h>
16 #include <SYS/SYS_Types.h>
17 
18 #if defined(_MSC_VER)
19  #pragma warning(push)
20  #pragma warning(disable:4799)
21 #endif
22 
23 #define CPU_HAS_SIMD_INSTR 1
24 #define VM_SSE_STYLE 1
25 
26 #include <emmintrin.h>
27 typedef __m128 v4sf;
28 typedef __m128i v4si;
29 
30 // NOTE: __SSE4_1__ is NOT a predefined macro on Visual C++
31 #if defined(__SSE4_1__) || defined(_MSC_VER)
32 #define VM_SSE41_STYLE 1
33 #include <smmintrin.h>
34 #endif
35 
36 #if defined(_MSC_VER)
37  #pragma warning(pop)
38 #endif
39 
40 // Plain casting (no conversion)
41 // MSVC has problems casting between __m128 and __m128i, so we implement a
42 // custom casting routine specifically for windows.
43 
44 #if defined(_MSC_VER)
45 
47 vm_v4sf(const v4si &a)
48 {
49  union {
50  v4si ival;
51  v4sf fval;
52  };
53  ival = a;
54  return fval;
55 }
56 
58 vm_v4si(const v4sf &a)
59 {
60  union {
61  v4si ival;
62  v4sf fval;
63  };
64  fval = a;
65  return ival;
66 }
67 
68 #define V4SF(A) vm_v4sf(A)
69 #define V4SI(A) vm_v4si(A)
70 
71 #else
72 
// Plain bit casts between the two vector types.  Both the argument and the
// whole expansion are parenthesized: a cast binds tighter than binary and
// conditional operators, so without the parentheses V4SF(c ? x : y) would
// cast only `c`.
#define V4SF(A)		((v4sf)(A))
#define V4SI(A)		((v4si)(A))
75 
76 #endif
77 
// Pack four 2-bit lane selectors into the 8-bit immediate expected by
// _mm_shuffle_ps.  a0/a1 occupy bits 0-1 and 2-3, b0/b1 occupy bits 4-5
// and 6-7.
#define VM_SHUFFLE_MASK(a0,a1, b0,b1) \
	(((b1) << 6) | ((b0) << 4) | ((a1) << 2) | (a0))
79 
80 template <int mask>
82 vm_shuffle(const v4sf &a, const v4sf &b)
83 {
84  return _mm_shuffle_ps(a, b, mask);
85 }
86 
87 template <int mask>
89 vm_shuffle(const v4si &a, const v4si &b)
90 {
91  return V4SI(_mm_shuffle_ps(V4SF(a), V4SF(b), mask));
92 }
93 
94 template <int A, int B, int C, int D, typename T>
95 static SYS_FORCE_INLINE T
96 vm_shuffle(const T &a, const T &b)
97 {
98  return vm_shuffle<VM_SHUFFLE_MASK(A,B,C,D)>(a, b);
99 }
100 
101 template <int mask, typename T>
102 static SYS_FORCE_INLINE T
103 vm_shuffle(const T &a)
104 {
105  return vm_shuffle<mask>(a, a);
106 }
107 
108 template <int A, int B, int C, int D, typename T>
109 static SYS_FORCE_INLINE T
110 vm_shuffle(const T &a)
111 {
112  return vm_shuffle<A,B,C,D>(a, a);
113 }
114 
115 #if defined(VM_SSE41_STYLE)
116 
117 static SYS_FORCE_INLINE v4si
118 vm_insert(const v4si v, int32 a, int n)
119 {
120  switch (n)
121  {
122  case 0: return _mm_insert_epi32(v, a, 0);
123  case 1: return _mm_insert_epi32(v, a, 1);
124  case 2: return _mm_insert_epi32(v, a, 2);
125  case 3: return _mm_insert_epi32(v, a, 3);
126  }
127  return v;
128 }
129 
130 static SYS_FORCE_INLINE v4sf
131 vm_insert(const v4sf v, float a, int n)
132 {
133  switch (n)
134  {
135  case 0: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,0,0));
136  case 1: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,1,0));
137  case 2: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,2,0));
138  case 3: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,3,0));
139  }
140  return v;
141 }
142 
143 static SYS_FORCE_INLINE int
144 vm_extract(const v4si v, int n)
145 {
146  switch (n)
147  {
148  case 0: return _mm_extract_epi32(v, 0);
149  case 1: return _mm_extract_epi32(v, 1);
150  case 2: return _mm_extract_epi32(v, 2);
151  case 3: return _mm_extract_epi32(v, 3);
152  }
153  return 0;
154 }
155 
156 static SYS_FORCE_INLINE float
157 vm_extract(const v4sf v, int n)
158 {
159  SYS_FPRealUnionF tmp;
160  switch (n)
161  {
162  case 0: tmp.ival = _mm_extract_ps(v, 0); break;
163  case 1: tmp.ival = _mm_extract_ps(v, 1); break;
164  case 2: tmp.ival = _mm_extract_ps(v, 2); break;
165  case 3: tmp.ival = _mm_extract_ps(v, 3); break;
166  }
167  return tmp.fval;
168 }
169 
170 #else
171 
172 static SYS_FORCE_INLINE v4si
173 vm_insert(const v4si v, int32 a, int n)
174 {
175  union { v4si vector; int32 comp[4]; };
176  vector = v;
177  comp[n] = a;
178  return vector;
179 }
180 
181 static SYS_FORCE_INLINE v4sf
182 vm_insert(const v4sf v, float a, int n)
183 {
184  union { v4sf vector; float comp[4]; };
185  vector = v;
186  comp[n] = a;
187  return vector;
188 }
189 
190 static SYS_FORCE_INLINE int
191 vm_extract(const v4si v, int n)
192 {
193  union { v4si vector; int32 comp[4]; };
194  vector = v;
195  return comp[n];
196 }
197 
198 static SYS_FORCE_INLINE float
199 vm_extract(const v4sf v, int n)
200 {
201  union { v4sf vector; float comp[4]; };
202  vector = v;
203  return comp[n];
204 }
205 
206 #endif
207 
208 static SYS_FORCE_INLINE v4sf
209 vm_splats(float a)
210 {
211  return _mm_set1_ps(a);
212 }
213 
214 static SYS_FORCE_INLINE v4si
215 vm_splats(uint32 a)
216 {
217  SYS_FPRealUnionF tmp;
218  tmp.uval = a;
219  return V4SI(vm_splats(tmp.fval));
220 }
221 
222 static SYS_FORCE_INLINE v4si
223 vm_splats(int32 a)
224 {
225  SYS_FPRealUnionF tmp;
226  tmp.ival = a;
227  return V4SI(vm_splats(tmp.fval));
228 }
229 
230 static SYS_FORCE_INLINE v4sf
231 vm_splats(float a, float b, float c, float d)
232 {
233  return vm_shuffle<0,2,0,2>(
234  vm_shuffle<0>(_mm_set_ss(a), _mm_set_ss(b)),
235  vm_shuffle<0>(_mm_set_ss(c), _mm_set_ss(d)));
236 }
237 
238 static SYS_FORCE_INLINE v4si
239 vm_splats(uint32 a, uint32 b, uint32 c, uint32 d)
240 {
241  SYS_FPRealUnionF af, bf, cf, df;
242  af.uval = a;
243  bf.uval = b;
244  cf.uval = c;
245  df.uval = d;
246  return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
247 }
248 
249 static SYS_FORCE_INLINE v4si
250 vm_splats(int32 a, int32 b, int32 c, int32 d)
251 {
252  SYS_FPRealUnionF af, bf, cf, df;
253  af.ival = a;
254  bf.ival = b;
255  cf.ival = c;
256  df.ival = d;
257  return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
258 }
259 
260 static SYS_FORCE_INLINE v4si
261 vm_load(const int32 v[4])
262 {
263  return V4SI(_mm_loadu_ps((const float *)v));
264 }
265 
266 static SYS_FORCE_INLINE v4sf
267 vm_load(const float v[4])
268 {
269  return _mm_loadu_ps(v);
270 }
271 
272 static SYS_FORCE_INLINE void
273 vm_store(int32 dst[4], v4si value)
274 {
275  _mm_storeu_si128((__m128i*) dst, value);
276 }
277 static SYS_FORCE_INLINE void
278 vm_store(float dst[4], v4sf value)
279 {
280  _mm_storeu_ps(dst, value);
281 }
282 
283 static SYS_FORCE_INLINE v4sf
284 vm_negate(v4sf a)
285 {
286  return _mm_sub_ps(_mm_setzero_ps(), a);
287 }
288 
289 static SYS_FORCE_INLINE v4sf
290 vm_abs(v4sf a)
291 {
292  return _mm_max_ps(a, vm_negate(a));
293 }
294 
295 static SYS_FORCE_INLINE v4sf
296 vm_fdiv(v4sf a, v4sf b)
297 {
298  return _mm_mul_ps(a, _mm_rcp_ps(b));
299 }
300 
301 static SYS_FORCE_INLINE v4sf
302 vm_fsqrt(v4sf a)
303 {
304  return _mm_rcp_ps(_mm_rsqrt_ps(a));
305 }
306 
307 static SYS_FORCE_INLINE v4sf
308 vm_madd(v4sf a, v4sf b, v4sf c)
309 {
310  return _mm_add_ps(_mm_mul_ps(a, b), c);
311 }
312 
313 static const v4si theSSETrue = vm_splats(0xFFFFFFFF);
314 
315 static SYS_FORCE_INLINE bool
316 vm_allbits(const v4si &a)
317 {
318  return _mm_movemask_ps(V4SF(_mm_cmpeq_epi32(a, theSSETrue))) == 0xF;
319 }
320 
321 static SYS_FORCE_INLINE int
322 vm_signbits(const v4si &a)
323 {
324  return _mm_movemask_ps(V4SF(a));
325 }
326 
327 
328 static SYS_FORCE_INLINE int
329 vm_signbits(const v4sf &a)
330 {
331  return _mm_movemask_ps(a);
332 }
333 
// Lane access, construction and memory transfer.
#define VM_EXTRACT	vm_extract
#define VM_INSERT	vm_insert
#define VM_SPLATS	vm_splats
#define VM_LOAD		vm_load
#define VM_STORE	vm_store

// Float comparisons.  The result of each compare is bit-cast to an integer
// vector.
#define VM_CMPLT(A,B)	V4SI(_mm_cmplt_ps(A,B))
#define VM_CMPLE(A,B)	V4SI(_mm_cmple_ps(A,B))
#define VM_CMPGT(A,B)	V4SI(_mm_cmpgt_ps(A,B))
#define VM_CMPGE(A,B)	V4SI(_mm_cmpge_ps(A,B))
#define VM_CMPEQ(A,B)	V4SI(_mm_cmpeq_ps(A,B))
#define VM_CMPNE(A,B)	V4SI(_mm_cmpneq_ps(A,B))

// 32-bit integer comparisons.
#define VM_ICMPLT	_mm_cmplt_epi32
#define VM_ICMPGT	_mm_cmpgt_epi32
#define VM_ICMPEQ	_mm_cmpeq_epi32

// 32-bit integer arithmetic.  VM_IMUL is only provided when SSE4.1 is
// enabled.
#define VM_IADD		_mm_add_epi32
#define VM_ISUB		_mm_sub_epi32
#if defined(VM_SSE41_STYLE)
#define VM_IMUL		_mm_mullo_epi32
#endif

// Float arithmetic mapped straight onto intrinsics.
#define VM_ADD		_mm_add_ps
#define VM_SUB		_mm_sub_ps
#define VM_MUL		_mm_mul_ps
#define VM_DIV		_mm_div_ps
#define VM_SQRT		_mm_sqrt_ps
#define VM_ISQRT	_mm_rsqrt_ps
#define VM_INVERT	_mm_rcp_ps
#define VM_ABS		vm_abs

// Float arithmetic implemented by the helper functions in this header;
// VM_FDIV and VM_FSQRT are built on the reciprocal estimate intrinsics.
#define VM_FDIV		vm_fdiv
#define VM_NEG		vm_negate
#define VM_FSQRT	vm_fsqrt
#define VM_MADD		vm_madd

// Per-lane minimum / maximum.
#define VM_MIN		_mm_min_ps
#define VM_MAX		_mm_max_ps

// Bitwise logic on integer vectors.
#define VM_AND		_mm_and_si128
#define VM_ANDNOT	_mm_andnot_si128
#define VM_OR		_mm_or_si128
#define VM_XOR		_mm_xor_si128

// Mask queries.
#define VM_ALLBITS	vm_allbits
#define VM_SIGNBITS	vm_signbits

// Lane permutation.
#define VM_SHUFFLE	vm_shuffle
383 
// Float to integer conversions.
// Values of the rounding-control field of the SSE control/status register
// (MXCSR), which selects how _mm_cvtps_epi32 rounds.
#define VM_SSE_ROUND_MASK	0x6000
#define VM_SSE_ROUND_ZERO	0x6000
#define VM_SSE_ROUND_UP		0x4000
#define VM_SSE_ROUND_DOWN	0x2000
#define VM_SSE_ROUND_NEAR	0x0000

// GETROUND() reads the current rounding-control bits.  SETROUND(x) replaces
// only those bits and keeps the rest of the register.  The argument is
// parenthesized so that an expression such as (c ? UP : DOWN) is combined
// with the preserved bits as a whole instead of swallowing them.
#define GETROUND()	(_mm_getcsr()&VM_SSE_ROUND_MASK)
#define SETROUND(x)	(_mm_setcsr((x)|(_mm_getcsr()&~VM_SSE_ROUND_MASK)))
393 
394 // The P functions must be invoked before FLOOR, the E functions invoked
395 // afterwards to reset the state.
396 
// VM_P_FLOOR() declares a local named `rounding` holding the current
// rounding mode and switches to round-down; VM_E_FLOOR() restores the saved
// mode, so both must appear in the same scope.  VM_FLOOR converts using the
// current rounding mode, while VM_INT uses the truncating conversion.
#define VM_P_FLOOR()	uint rounding = GETROUND(); \
			SETROUND(VM_SSE_ROUND_DOWN);
#define VM_FLOOR	_mm_cvtps_epi32
#define VM_INT		_mm_cvttps_epi32
#define VM_E_FLOOR()	SETROUND(rounding);
402 
403 // Integer to float conversion
404 #define VM_IFLOAT _mm_cvtepi32_ps
405 
406 #endif
__m128i v4si
Definition: VM_SSEFunc.h:28
const GLdouble * v
Definition: glcorearb.h:836
#define V4SF(A)
Definition: VM_SSEFunc.h:73
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1221
GLint GLuint mask
Definition: glcorearb.h:123
GLdouble n
Definition: glcorearb.h:2007
__m128 v4sf
Definition: VM_SSEFunc.h:27
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
int int32
Definition: SYS_Types.h:35
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1221
GLenum GLenum dst
Definition: glcorearb.h:1792
GLsizei const GLfloat * value
Definition: glcorearb.h:823
#define V4SI(A)
Definition: VM_SSEFunc.h:74
unsigned int uint32
Definition: SYS_Types.h:36