VM_SSEFunc.h
/*
 * PROPRIETARY INFORMATION. This software is proprietary to
 * Side Effects Software Inc., and is not to be reproduced,
 * transmitted, or disclosed in any way without written permission.
 *
 * NAME: VM_SSEFunc.h ( VM Library, C++)
 *
 * COMMENTS:
 */

#ifndef __VM_SSEFunc__
#define __VM_SSEFunc__

#include "VM_API.h"
#include <SYS/SYS_Inline.h>
#include <SYS/SYS_Types.h>

#if defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable:4799)
#endif

#define CPU_HAS_SIMD_INSTR 1
#define VM_SSE_STYLE 1

#include <emmintrin.h>
typedef __m128 v4sf;
typedef __m128i v4si;

#if defined(__SSE4_1__)
#define VM_SSE41_STYLE 1
#include <smmintrin.h>
#endif

#if defined(_MSC_VER)
    #pragma warning(pop)
#endif

// Plain casting (no conversion)
// MSVC has problems casting between __m128 and __m128i, so we implement a
// custom casting routine specifically for Windows.
#if defined(_MSC_VER)

static SYS_FORCE_INLINE v4sf
vm_v4sf(const v4si &a)
{
    union {
        v4si ival;
        v4sf fval;
    };
    ival = a;
    return fval;
}

static SYS_FORCE_INLINE v4si
vm_v4si(const v4sf &a)
{
    union {
        v4si ival;
        v4sf fval;
    };
    fval = a;
    return ival;
}

#define V4SF(A) vm_v4sf(A)
#define V4SI(A) vm_v4si(A)

#else

#define V4SF(A) (v4sf)A
#define V4SI(A) (v4si)A

#endif
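
// Illustrative usage sketch (not part of the original header): V4SF/V4SI
// reinterpret the 128-bit register; the lane bits are untouched.
//
//     v4si bits = _mm_set1_epi32(0x3F800000); // bit pattern of 1.0f
//     v4sf ones = V4SF(bits);                 // reads as {1, 1, 1, 1}
//     v4si back = V4SI(ones);                 // round-trips unchanged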

#define VM_SHUFFLE_MASK(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))

template <int mask>
static SYS_FORCE_INLINE v4sf
vm_shuffle(const v4sf &a, const v4sf &b)
{
    return _mm_shuffle_ps(a, b, mask);
}

template <int mask>
static SYS_FORCE_INLINE v4si
vm_shuffle(const v4si &a, const v4si &b)
{
    return V4SI(_mm_shuffle_ps(V4SF(a), V4SF(b), mask));
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a, const T &b)
{
    return vm_shuffle<VM_SHUFFLE_MASK(A,B,C,D)>(a, b);
}

template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<A,B,C,D>(a, a);
}

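// Illustrative usage sketch (not part of the original header): the
// <A,B,C,D> form selects lanes A,B from the first operand and C,D from the
// second; the single-argument form shuffles a vector against itself.
//
//     v4sf v = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
//     v4sf r = vm_shuffle<3,2,1,0>(v);         // {4, 3, 2, 1}
//     v4sf m = vm_shuffle<0,1,0,1>(v, r);      // {1, 2, 4, 3}
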
#if defined(VM_SSE41_STYLE)

// SSE4.1 lane access. The insert/extract intrinsics require an immediate
// (compile-time) lane index, so a runtime index is dispatched with a switch.

static SYS_FORCE_INLINE v4si
vm_insert(const v4si v, int32 a, int n)
{
    switch (n)
    {
    case 0: return _mm_insert_epi32(v, a, 0);
    case 1: return _mm_insert_epi32(v, a, 1);
    case 2: return _mm_insert_epi32(v, a, 2);
    case 3: return _mm_insert_epi32(v, a, 3);
    }
    return v;
}

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    switch (n)
    {
    case 0: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,0,0));
    case 1: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,1,0));
    case 2: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,2,0));
    case 3: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,3,0));
    }
    return v;
}

static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    switch (n)
    {
    case 0: return _mm_extract_epi32(v, 0);
    case 1: return _mm_extract_epi32(v, 1);
    case 2: return _mm_extract_epi32(v, 2);
    case 3: return _mm_extract_epi32(v, 3);
    }
    return 0;
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    SYS_FPRealUnionF tmp;
    switch (n)
    {
    case 0: tmp.ival = _mm_extract_ps(v, 0); break;
    case 1: tmp.ival = _mm_extract_ps(v, 1); break;
    case 2: tmp.ival = _mm_extract_ps(v, 2); break;
    case 3: tmp.ival = _mm_extract_ps(v, 3); break;
    }
    return tmp.fval;
}

#else

// Pre-SSE4.1 fallback: lane access round-trips the vector through memory
// via an anonymous union.

static SYS_FORCE_INLINE v4si
vm_insert(const v4si v, int32 a, int n)
{
    union { v4si vector; int32 comp[4]; };
    vector = v;
    comp[n] = a;
    return vector;
}

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    comp[n] = a;
    return vector;
}

static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    union { v4si vector; int32 comp[4]; };
    vector = v;
    return comp[n];
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    return comp[n];
}

#endif
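
// Illustrative usage sketch (not part of the original header): both paths
// implement the same lane read/write by runtime index.
//
//     v4sf v = vm_splats(0.0f);            // defined below: {0, 0, 0, 0}
//     v = vm_insert(v, 5.0f, 2);           // {0, 0, 5, 0}
//     float x = vm_extract(v, 2);          // 5.0f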

static SYS_FORCE_INLINE v4sf
vm_splats(float a)
{
    return _mm_set1_ps(a);
}

static SYS_FORCE_INLINE v4si
vm_splats(uint32 a)
{
    SYS_FPRealUnionF tmp;
    tmp.uval = a;
    return V4SI(vm_splats(tmp.fval));
}

static SYS_FORCE_INLINE v4si
vm_splats(int32 a)
{
    SYS_FPRealUnionF tmp;
    tmp.ival = a;
    return V4SI(vm_splats(tmp.fval));
}

static SYS_FORCE_INLINE v4sf
vm_splats(float a, float b, float c, float d)
{
    return vm_shuffle<0,2,0,2>(
            vm_shuffle<0>(_mm_set_ss(a), _mm_set_ss(b)),
            vm_shuffle<0>(_mm_set_ss(c), _mm_set_ss(d)));
}

static SYS_FORCE_INLINE v4si
vm_splats(uint32 a, uint32 b, uint32 c, uint32 d)
{
    SYS_FPRealUnionF af, bf, cf, df;
    af.uval = a;
    bf.uval = b;
    cf.uval = c;
    df.uval = d;
    return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
}

static SYS_FORCE_INLINE v4si
vm_splats(int32 a, int32 b, int32 c, int32 d)
{
    SYS_FPRealUnionF af, bf, cf, df;
    af.ival = a;
    bf.ival = b;
    cf.ival = c;
    df.ival = d;
    return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
}

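// Illustrative usage sketch (not part of the original header): vm_splats
// broadcasts one scalar to every lane, or packs four. The integer overloads
// route raw bit patterns through the float path via SYS_FPRealUnionF, so no
// value conversion takes place.
//
//     v4sf f = vm_splats(2.0f);            // {2, 2, 2, 2}
//     v4sf g = vm_splats(1.f, 2.f, 3.f, 4.f);
//     v4si i = vm_splats(7);               // every lane holds int 7
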
static SYS_FORCE_INLINE v4si
vm_load(const int32 v[4])
{
    return V4SI(_mm_loadu_ps((const float *)v));
}

static SYS_FORCE_INLINE v4sf
vm_load(const float v[4])
{
    return _mm_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store(float dst[4], v4sf value)
{
    _mm_storeu_ps(dst, value);
}

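// Illustrative usage sketch (not part of the original header): loads and
// stores use the unaligned intrinsics, so 16-byte alignment is not required.
//
//     float src[4] = {1.f, 2.f, 3.f, 4.f};
//     float dst[4];
//     vm_store(dst, vm_load(src));         // dst now equals src
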
static SYS_FORCE_INLINE v4sf
vm_negate(v4sf a)
{
    return _mm_sub_ps(_mm_setzero_ps(), a);
}

static SYS_FORCE_INLINE v4sf
vm_abs(v4sf a)
{
    return _mm_max_ps(a, vm_negate(a));
}

// Fast approximate division: _mm_rcp_ps gives roughly 12 bits of precision.
static SYS_FORCE_INLINE v4sf
vm_fdiv(v4sf a, v4sf b)
{
    return _mm_mul_ps(a, _mm_rcp_ps(b));
}

// Fast approximate square root via the reciprocal intrinsics.
static SYS_FORCE_INLINE v4sf
vm_fsqrt(v4sf a)
{
    return _mm_rcp_ps(_mm_rsqrt_ps(a));
}

// Multiply-add: a*b + c (two instructions on plain SSE).
static SYS_FORCE_INLINE v4sf
vm_madd(v4sf a, v4sf b, v4sf c)
{
    return _mm_add_ps(_mm_mul_ps(a, b), c);
}

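// Illustrative usage sketch (not part of the original header): prefer
// VM_DIV/VM_SQRT below when full precision matters; the vm_f* variants
// trade accuracy for speed.
//
//     v4sf a = vm_splats(3.f), b = vm_splats(4.f), c = vm_splats(1.f);
//     v4sf r = vm_madd(a, b, c);           // a*b + c = {13, 13, 13, 13}
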
static const v4si theSSETrue = vm_splats(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits(const v4si &a)
{
    return _mm_movemask_ps(V4SF(_mm_cmpeq_epi32(a, theSSETrue))) == 0xF;
}

#define VM_EXTRACT vm_extract
#define VM_INSERT vm_insert
#define VM_SPLATS vm_splats
#define VM_LOAD vm_load
#define VM_STORE vm_store

#define VM_CMPLT(A,B) V4SI(_mm_cmplt_ps(A,B))
#define VM_CMPLE(A,B) V4SI(_mm_cmple_ps(A,B))
#define VM_CMPGT(A,B) V4SI(_mm_cmpgt_ps(A,B))
#define VM_CMPGE(A,B) V4SI(_mm_cmpge_ps(A,B))
#define VM_CMPEQ(A,B) V4SI(_mm_cmpeq_ps(A,B))
#define VM_CMPNE(A,B) V4SI(_mm_cmpneq_ps(A,B))

#define VM_ICMPLT _mm_cmplt_epi32
#define VM_ICMPGT _mm_cmpgt_epi32
#define VM_ICMPEQ _mm_cmpeq_epi32

#define VM_IADD _mm_add_epi32
#define VM_ISUB _mm_sub_epi32

#define VM_ADD _mm_add_ps
#define VM_SUB _mm_sub_ps
#define VM_MUL _mm_mul_ps
#define VM_DIV _mm_div_ps
#define VM_SQRT _mm_sqrt_ps
#define VM_ISQRT _mm_rsqrt_ps
#define VM_INVERT _mm_rcp_ps
#define VM_ABS vm_abs

#define VM_FDIV vm_fdiv
#define VM_NEG vm_negate
#define VM_FSQRT vm_fsqrt
#define VM_MADD vm_madd

#define VM_MIN _mm_min_ps
#define VM_MAX _mm_max_ps

#define VM_AND _mm_and_si128
#define VM_ANDNOT _mm_andnot_si128
#define VM_OR _mm_or_si128
#define VM_XOR _mm_xor_si128

#define VM_ALLBITS vm_allbits

#define VM_SHUFFLE vm_shuffle

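// Illustrative usage sketch (not part of the original header): comparisons
// produce all-ones or all-zero lanes, so VM_ALLBITS tests whether a
// predicate holds in every lane.
//
//     v4sf a = VM_SPLATS(1.0f), b = VM_SPLATS(2.0f);
//     v4si lt = VM_CMPLT(a, b);            // all-ones in each lane: 1 < 2
//     bool all = VM_ALLBITS(lt);           // true
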
// Float to integer conversions
#define VM_SSE_ROUND_MASK 0x6000
#define VM_SSE_ROUND_ZERO 0x6000
#define VM_SSE_ROUND_UP 0x4000
#define VM_SSE_ROUND_DOWN 0x2000
#define VM_SSE_ROUND_NEAR 0x0000

#define GETROUND() (_mm_getcsr()&VM_SSE_ROUND_MASK)
#define SETROUND(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK)))

// The P functions must be invoked before FLOOR, and the E functions invoked
// afterwards to reset the state.

#define VM_P_FLOOR() uint rounding = GETROUND(); \
                     SETROUND(VM_SSE_ROUND_DOWN);
#define VM_FLOOR _mm_cvtps_epi32
#define VM_INT _mm_cvttps_epi32
#define VM_E_FLOOR() SETROUND(rounding);

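// Illustrative usage sketch (not part of the original header): VM_FLOOR
// depends on the current MXCSR rounding mode, so bracket it with
// VM_P_FLOOR/VM_E_FLOOR; VM_INT truncates toward zero in any mode.
//
//     VM_P_FLOOR();                        // save mode, set round-down
//     v4si f = VM_FLOOR(VM_SPLATS(-1.5f)); // {-2, -2, -2, -2}
//     VM_E_FLOOR();                        // restore the saved mode
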
// Integer to float conversion
#define VM_IFLOAT _mm_cvtepi32_ps

#endif