HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
VM_SIMD.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: VM_SIMD.C ( VM Library, C++)
7  *
8  * COMMENTS:
9  */
10 
11 #ifndef __VM_SIMD__
12 #define __VM_SIMD__
13 
14 #include <SYS/SYS_Inline.h>
15 #include <SYS/SYS_Math.h>
16 #include <SYS/SYS_StaticAssert.h>
17 #include <SYS/SYS_Types.h>
18 #include <SYS/SYS_TypeTraits.h>
19 #include <SYS/SYS_Visibility.h>
20 #include <cstdint>
21 
22 //#define FORCE_NON_SIMD
23 
24 // Some older versions of glibc don't correctly align the stack for extra
25 // threads (this problem should have been fixed in 2.3.4 - see
26 // http://sources.redhat.com/bugzilla/show_bug.cgi?id=723). Instead of
27 // checking the glibc version, check the compiler version and assume newer
28 // compilers are used with newer glibc.
29 
30 #if defined(FORCE_NON_SIMD)
31  #include "VM_BasicFunc.h"
32 #else
33  #if defined(CELLRSX) || defined(PPC)
34  #include "VM_AltivecFunc.h"
35  #elif defined(LINUX) && SYS_IS_GCC_GE(3, 4) && defined(__SSE2__)
36  #include "VM_SSEFunc.h"
37  #elif defined(WIN32)
38  #include "VM_SSEFunc.h"
39  #elif defined(MBSD_INTEL)
40  #include "VM_SSEFunc.h"
41  #else
42  #include "VM_BasicFunc.h"
43  #endif
44 #endif
45 
46 class v4uf;
47 
48 class v4uu {
49 public:
50  // NOTE: For unknown reasons, BVH construction is significantly
51  // slower on GCC 5.4 if v4uu and v4uf are POD types, so I put
52  // back the previous code.
53 #if 0
54  SYS_FORCE_INLINE v4uu() noexcept = default;
55  SYS_FORCE_INLINE v4uu(const v4si &v) : vector(v) {
56  SYS_STATIC_ASSERT_MSG(SYSisPOD<v4uu>(), "v4uu should be POD, for better performance in UT_Array, etc.");
57  }
58  SYS_FORCE_INLINE v4uu(const v4uu &v) noexcept = default;
59  SYS_FORCE_INLINE v4uu &operator=(const v4uu &v) noexcept = default;
60 #else
62  SYS_FORCE_INLINE v4uu(const v4si &v) : vector(v) {}
63  SYS_FORCE_INLINE v4uu(const v4uu &v) : vector(v.vector) {}
64  SYS_FORCE_INLINE v4uu operator=(const v4uu &v)
65  { vector = v.vector; return *this; }
66 #endif
67  explicit SYS_FORCE_INLINE v4uu(int32 v) { vector = VM_SPLATS(v); }
68  explicit SYS_FORCE_INLINE v4uu(const int32 v[4])
69  { vector = VM_LOAD(v); }
71  { vector = VM_SPLATS(a, b, c, d); }
72 
73  // Assignment
75  { vector = v4uu(v).vector; return *this; }
77  { vector = v; return *this; }
78 
79  SYS_FORCE_INLINE void condAssign(const v4uu &val, const v4uu &c)
80  { *this = (c & val) | ((!c) & *this); }
81 
82  // Comparison
83  SYS_FORCE_INLINE v4uu operator == (const v4uu &v) const
84  { return v4uu(VM_ICMPEQ(vector, v.vector)); }
85  SYS_FORCE_INLINE v4uu operator != (const v4uu &v) const
86  { return ~(*this == v); }
87  SYS_FORCE_INLINE v4uu operator > (const v4uu &v) const
88  { return v4uu(VM_ICMPGT(vector, v.vector)); }
89  SYS_FORCE_INLINE v4uu operator < (const v4uu &v) const
90  { return v4uu(VM_ICMPLT(vector, v.vector)); }
91  SYS_FORCE_INLINE v4uu operator >= (const v4uu &v) const
92  { return ~(*this < v); }
93  SYS_FORCE_INLINE v4uu operator <= (const v4uu &v) const
94  { return ~(*this > v); }
95 
96  SYS_FORCE_INLINE v4uu operator == (int32 v) const { return *this == v4uu(v); }
97  SYS_FORCE_INLINE v4uu operator != (int32 v) const { return *this != v4uu(v); }
98  SYS_FORCE_INLINE v4uu operator > (int32 v) const { return *this > v4uu(v); }
99  SYS_FORCE_INLINE v4uu operator < (int32 v) const { return *this < v4uu(v); }
100  SYS_FORCE_INLINE v4uu operator >= (int32 v) const { return *this >= v4uu(v); }
101  SYS_FORCE_INLINE v4uu operator <= (int32 v) const { return *this <= v4uu(v); }
102 
103  // Basic math
104  SYS_FORCE_INLINE v4uu operator+(const v4uu &r) const
105  { return v4uu(VM_IADD(vector, r.vector)); }
106  SYS_FORCE_INLINE v4uu operator-(const v4uu &r) const
107  { return v4uu(VM_ISUB(vector, r.vector)); }
108 #if defined(VM_IMUL)
109  SYS_FORCE_INLINE v4uu operator*(const v4uu &r) const
110  { return v4uu(VM_IMUL(vector, r.vector)); }
111 #else
112  SYS_FORCE_INLINE v4uu operator*(const v4uu &r) const
113  {
114  return v4uu((*this)[0] * r[0],
115  (*this)[1] * r[1],
116  (*this)[2] * r[2],
117  (*this)[3] * r[3]);
118  }
119 #endif
120  SYS_FORCE_INLINE v4uu operator+=(const v4uu &r) { return (*this = *this + r); }
121  SYS_FORCE_INLINE v4uu operator-=(const v4uu &r) { return (*this = *this - r); }
122  SYS_FORCE_INLINE v4uu operator*=(const v4uu &r) { return (*this = *this * r); }
123  SYS_FORCE_INLINE v4uu operator+(int32 r) const { return *this + v4uu(r); }
124  SYS_FORCE_INLINE v4uu operator-(int32 r) const { return *this - v4uu(r); }
125  SYS_FORCE_INLINE v4uu operator*(int32 r) const { return *this * v4uu(r); }
126  SYS_FORCE_INLINE v4uu operator+=(int32 r) { return (*this = *this + r); }
127  SYS_FORCE_INLINE v4uu operator-=(int32 r) { return (*this = *this - r); }
128  SYS_FORCE_INLINE v4uu operator*=(int32 r) { return (*this = *this * r); }
129 
130  // Modulo
131  SYS_FORCE_INLINE v4uu operator%(const v4uu &r) const
132  {
133  return v4uu((*this)[0] % r[0],
134  (*this)[1] % r[1],
135  (*this)[2] % r[2],
136  (*this)[3] % r[3]);
137  }
138  SYS_FORCE_INLINE v4uu operator%(int r) const
139  {
140  return v4uu((*this)[0] % r,
141  (*this)[1] % r,
142  (*this)[2] % r,
143  (*this)[3] % r);
144  }
145  SYS_FORCE_INLINE v4uu operator%=(const v4uu &r) { return (*this = *this % r); }
146  SYS_FORCE_INLINE v4uu operator%=(int r) { return (*this = *this % r); }
147 
148 
149  // logical/bitwise
150 
151  SYS_FORCE_INLINE v4uu operator||(const v4uu &r) const
152  { return v4uu(VM_OR(vector, r.vector)); }
153  SYS_FORCE_INLINE v4uu operator&&(const v4uu &r) const
154  { return v4uu(VM_AND(vector, r.vector)); }
155  SYS_FORCE_INLINE v4uu operator^(const v4uu &r) const
156  { return v4uu(VM_XOR(vector, r.vector)); }
158  { return *this == v4uu(0); }
159 
160  SYS_FORCE_INLINE v4uu operator|(const v4uu &r) const { return *this || r; }
161  SYS_FORCE_INLINE v4uu operator&(const v4uu &r) const { return *this && r; }
163  { return *this ^ v4uu(0xFFFFFFFF); }
164  SYS_FORCE_INLINE void operator|=(const v4uu &r) { vector = VM_OR(vector, r.vector); }
165  SYS_FORCE_INLINE void operator&=(const v4uu &r) { vector = VM_AND(vector, r.vector); }
166  SYS_FORCE_INLINE void operator^=(const v4uu &r) { vector = VM_XOR(vector, r.vector); }
167 
168  // component
169  SYS_FORCE_INLINE int32 operator[](int idx) const { return VM_EXTRACT(vector, idx); }
170  SYS_FORCE_INLINE void setComp(int idx, int32 v) { vector = VM_INSERT(vector, v, idx); }
171 
172  v4uf toFloat() const;
173 
174  operator v4uf() const;
175 
176 public:
178 };
179 
180 class v4uf {
181 public:
182  SYS_FORCE_INLINE v4uf() noexcept = default;
183  // NOTE: For unknown reasons, BVH construction is significantly
184  // slower on GCC 5.4 if v4uu and v4uf are POD types, so I put
185  // back the previous code.
186 #if 0
187  SYS_FORCE_INLINE v4uf(const v4uf &v) noexcept = default;
188  SYS_FORCE_INLINE v4uf(const v4sf &v) noexcept : vector(v) {
189  SYS_STATIC_ASSERT_MSG(SYSisPOD<v4uf>(), "v4uf should be POD, for better performance in UT_Array, etc.");
190  }
191  SYS_FORCE_INLINE v4uf &operator=(const v4uf &v) noexcept = default;
192 #else
193  SYS_FORCE_INLINE v4uf(const v4sf &v) noexcept : vector(v) {}
194  SYS_FORCE_INLINE v4uf(const v4uf &v) noexcept : vector(v.vector) {}
195  SYS_FORCE_INLINE v4uf operator=(const v4uf &v) noexcept
196  { vector = v.vector; return *this; }
197 #endif
198  explicit SYS_FORCE_INLINE v4uf(float v) { vector = VM_SPLATS(v); }
199  explicit SYS_FORCE_INLINE v4uf(const float v[4])
200  { vector = VM_LOAD(v); }
201  SYS_FORCE_INLINE v4uf(float a, float b, float c, float d)
202  { vector = VM_SPLATS(a, b, c, d); }
203 
204  // Assignment
206  { vector = v4uf(v).vector; return *this; }
208  { vector = v; return *this; }
209 
210  SYS_FORCE_INLINE void condAssign(const v4uf &val, const v4uu &c)
211  { *this = (val & c) | (*this & ~c); }
212 
213  // Comparison
214  SYS_FORCE_INLINE v4uu operator == (const v4uf &v) const
215  { return v4uu(VM_CMPEQ(vector, v.vector)); }
216  SYS_FORCE_INLINE v4uu operator != (const v4uf &v) const
217  { return v4uu(VM_CMPNE(vector, v.vector)); }
218  SYS_FORCE_INLINE v4uu operator > (const v4uf &v) const
219  { return v4uu(VM_CMPGT(vector, v.vector)); }
220  SYS_FORCE_INLINE v4uu operator < (const v4uf &v) const
221  { return v4uu(VM_CMPLT(vector, v.vector)); }
222  SYS_FORCE_INLINE v4uu operator >= (const v4uf &v) const
223  { return v4uu(VM_CMPGE(vector, v.vector)); }
224  SYS_FORCE_INLINE v4uu operator <= (const v4uf &v) const
225  { return v4uu(VM_CMPLE(vector, v.vector)); }
226 
227  SYS_FORCE_INLINE v4uu operator == (float v) const { return *this == v4uf(v); }
228  SYS_FORCE_INLINE v4uu operator != (float v) const { return *this != v4uf(v); }
229  SYS_FORCE_INLINE v4uu operator > (float v) const { return *this > v4uf(v); }
230  SYS_FORCE_INLINE v4uu operator < (float v) const { return *this < v4uf(v); }
231  SYS_FORCE_INLINE v4uu operator >= (float v) const { return *this >= v4uf(v); }
232  SYS_FORCE_INLINE v4uu operator <= (float v) const { return *this <= v4uf(v); }
233 
234 
235  // Basic math
237  { return v4uf(VM_ADD(vector, r.vector)); }
239  { return v4uf(VM_SUB(vector, r.vector)); }
241  { return v4uf(VM_NEG(vector)); }
243  { return v4uf(VM_MUL(vector, r.vector)); }
245  { return v4uf(VM_DIV(vector, r.vector)); }
246 
247  SYS_FORCE_INLINE v4uf operator+=(const v4uf &r) { return (*this = *this + r); }
248  SYS_FORCE_INLINE v4uf operator-=(const v4uf &r) { return (*this = *this - r); }
249  SYS_FORCE_INLINE v4uf operator*=(const v4uf &r) { return (*this = *this * r); }
250  SYS_FORCE_INLINE v4uf operator/=(const v4uf &r) { return (*this = *this / r); }
251 
252  SYS_FORCE_INLINE v4uf operator+(float r) const { return *this + v4uf(r); }
253  SYS_FORCE_INLINE v4uf operator-(float r) const { return *this - v4uf(r); }
254  SYS_FORCE_INLINE v4uf operator*(float r) const { return *this * v4uf(r); }
255  SYS_FORCE_INLINE v4uf operator/(float r) const { return *this / v4uf(r); }
256  SYS_FORCE_INLINE v4uf operator+=(float r) { return (*this = *this + r); }
257  SYS_FORCE_INLINE v4uf operator-=(float r) { return (*this = *this - r); }
258  SYS_FORCE_INLINE v4uf operator*=(float r) { return (*this = *this * r); }
259  SYS_FORCE_INLINE v4uf operator/=(float r) { return (*this = *this / r); }
260 
261  // logical/bitwise
262 
263  SYS_FORCE_INLINE v4uf operator||(const v4uu &r) const
264  { return v4uf(V4SF(VM_OR(V4SI(vector), r.vector))); }
265  SYS_FORCE_INLINE v4uf operator&&(const v4uu &r) const
266  { return v4uf(V4SF(VM_AND(V4SI(vector), r.vector))); }
267  SYS_FORCE_INLINE v4uf operator^(const v4uu &r) const
268  { return v4uf(V4SF(VM_XOR(V4SI(vector), r.vector))); }
270  { return v4uf(V4SF((*this == v4uf(0.0F)).vector)); }
271 
273  { return v4uf(V4SF(VM_OR(V4SI(vector), V4SI(r.vector)))); }
275  { return v4uf(V4SF(VM_AND(V4SI(vector), V4SI(r.vector)))); }
277  { return v4uf(V4SF(VM_XOR(V4SI(vector), V4SI(r.vector)))); }
278 
279  SYS_FORCE_INLINE v4uf operator|(const v4uu &r) const { return *this || r; }
280  SYS_FORCE_INLINE v4uf operator&(const v4uu &r) const { return *this && r; }
282  { return *this ^ v4uu(0xFFFFFFFF); }
283 
284  SYS_FORCE_INLINE v4uf operator|(const v4uf &r) const { return *this || r; }
285  SYS_FORCE_INLINE v4uf operator&(const v4uf &r) const { return *this && r; }
286 
287  // component
288  SYS_FORCE_INLINE float operator[](int idx) const { return VM_EXTRACT(vector, idx); }
289  SYS_FORCE_INLINE void setComp(int idx, float v) { vector = VM_INSERT(vector, v, idx); }
290 
291  // more math
292  SYS_FORCE_INLINE v4uf abs() const { return v4uf(VM_ABS(vector)); }
293  SYS_FORCE_INLINE v4uf clamp(const v4uf &low, const v4uf &high) const
294  { return v4uf(
295  VM_MIN(VM_MAX(vector, low.vector), high.vector)); }
296  SYS_FORCE_INLINE v4uf clamp(float low, float high) const
297  { return v4uf(VM_MIN(VM_MAX(vector,
298  v4uf(low).vector), v4uf(high).vector)); }
300 
301  /// This is a lie, it is a signed int.
302  SYS_FORCE_INLINE v4uu toUnsignedInt() const { return VM_INT(vector); }
303  SYS_FORCE_INLINE v4uu toSignedInt() const { return VM_INT(vector); }
304 
305  v4uu floor() const
306  {
307  VM_P_FLOOR();
308  v4uu result = VM_FLOOR(vector);
309  VM_E_FLOOR();
310  return result;
311  }
312 
313  /// Returns the integer part of this float, this becomes the
314  /// 0..1 fractional component.
315  v4uu splitFloat()
316  {
317  v4uu base = toSignedInt();
318  *this -= base.toFloat();
319  return base;
320  }
321 
322  template <int A, int B, int C, int D>
324  {
325  return VM_SHUFFLE<A,B,C,D>(vector);
326  }
327 
329  {
330  // If the exponent is the maximum value, it's either infinite or NaN.
331  const v4si mask = VM_SPLATS(0x7F800000);
332  return ~v4uu(VM_ICMPEQ(VM_AND(V4SI(vector), mask), mask));
333  }
334 
335 public:
337 };
338 
341 {
342  return v4uf(VM_IFLOAT(vector));
343 }
345 v4uu::operator v4uf() const
346 {
347  return toFloat();
348 }
349 
350 static SYS_FORCE_INLINE v4uf
351 operator*(float r, const v4uf &v) noexcept
352 {
353  return v * v4uf(r);
354 }
355 
356 //
357 // Custom vector operations
358 //
359 
360 static SYS_FORCE_INLINE v4uf
361 sqrt(const v4uf &a)
362 {
363  return v4uf(VM_SQRT(a.vector));
364 }
365 
366 static SYS_FORCE_INLINE v4uf
367 fabs(const v4uf &a)
368 {
369  return a.abs();
370 }
371 
372 // Use this operation to mask disabled values to 0
373 // rval = !a ? b : 0;
374 
375 static SYS_FORCE_INLINE v4uf
376 andn(const v4uu &a, const v4uf &b)
377 {
378  return v4uf(V4SF(VM_ANDNOT(a.vector, V4SI(b.vector))));
379 }
380 
381 static SYS_FORCE_INLINE v4uu
382 andn(const v4uu &a, const v4uu &b)
383 {
384  return v4uu(VM_ANDNOT(a.vector, b.vector));
385 }
386 
387 // rval = a ? b : c;
388 static SYS_FORCE_INLINE v4uf
389 ternary(const v4uu &a, const v4uf &b, const v4uf &c)
390 {
391  return (b & a) | andn(a, c);
392 }
393 
394 static SYS_FORCE_INLINE v4uu
395 ternary(const v4uu &a, const v4uu &b, const v4uu &c)
396 {
397  return (b & a) | andn(a, c);
398 }
399 
400 // rval = !(a && b)
401 static SYS_FORCE_INLINE v4uu
402 nand(const v4uu &a, const v4uu &b)
403 {
404  return !v4uu(VM_AND(a.vector, b.vector));
405 }
406 
407 static SYS_FORCE_INLINE v4uf
408 vmin(const v4uf &a, const v4uf &b)
409 {
410  return v4uf(VM_MIN(a.vector, b.vector));
411 }
412 
413 static SYS_FORCE_INLINE v4uf
414 vmax(const v4uf &a, const v4uf &b)
415 {
416  return v4uf(VM_MAX(a.vector, b.vector));
417 }
418 
419 static SYS_FORCE_INLINE v4uf
420 clamp(const v4uf &a, const v4uf &b, const v4uf &c)
421 {
422  return vmax(vmin(a, c), b);
423 }
424 
425 static SYS_FORCE_INLINE v4uf
426 clamp(const v4uf &a, float b, float c)
427 {
428  return vmax(vmin(a, v4uf(c)), v4uf(b));
429 }
430 
431 // Returns an integer with the lower 4 bits set
432 // if the corresponding sign bit in a is set.
433 // Useful after comparisons as comparisons will
434 // be -1 or 0; so no need for an allbits test.
435 // 1 is a[0], 2 is a[1], and 8 is a[3].
436 static SYS_FORCE_INLINE int
437 signbits(const v4uu &a)
438 {
439  return vm_signbits(a.vector);
440 }
441 
442 static SYS_FORCE_INLINE int
443 signbits(const v4uf &a)
444 {
445  return vm_signbits(a.vector);
446 }
447 
448 static SYS_FORCE_INLINE bool
449 allbits(const v4uu &a)
450 {
451  return vm_allbits(a.vector);
452 }
453 
454 static SYS_FORCE_INLINE bool
455 anybits(const v4uu &a)
456 {
457  return !allbits(~a);
458 }
459 
460 static SYS_FORCE_INLINE v4uf
461 madd(const v4uf &v, const v4uf &f, const v4uf &a)
462 {
463  return v4uf(VM_MADD(v.vector, f.vector, a.vector));
464 }
465 
466 static SYS_FORCE_INLINE v4uf
467 madd(const v4uf &v, float f, float a)
468 {
469  return v4uf(VM_MADD(v.vector, v4uf(f).vector, v4uf(a).vector));
470 }
471 
472 static SYS_FORCE_INLINE v4uf
473 madd(const v4uf &v, float f, const v4uf &a)
474 {
475  return v4uf(VM_MADD(v.vector, v4uf(f).vector, a.vector));
476 }
477 
478 static SYS_FORCE_INLINE v4uf
479 msub(const v4uf &v, const v4uf &f, const v4uf &s)
480 {
481  return madd(v, f, -s);
482 }
483 
484 static SYS_FORCE_INLINE v4uf
485 msub(const v4uf &v, float f, float s)
486 {
487  return madd(v, f, -s);
488 }
489 
490 static SYS_FORCE_INLINE v4uf
491 lerp(const v4uf &a, const v4uf &b, const v4uf &w)
492 {
493  v4uf w1 = v4uf(1.0F) - w;
494  return madd(a, w1, b*w);
495 }
496 
497 static SYS_FORCE_INLINE v4uf
498 luminance(const v4uf &r, const v4uf &g, const v4uf &b,
499  float rw, float gw, float bw)
500 {
501  return v4uf(madd(r, v4uf(rw), madd(g, v4uf(gw), b * bw)));
502 }
503 
504 static SYS_FORCE_INLINE float
505 dot3(const v4uf &a, const v4uf &b)
506 {
507  v4uf res = a*b;
508  return res[0] + res[1] + res[2];
509 }
510 
511 static SYS_FORCE_INLINE float
512 dot4(const v4uf &a, const v4uf &b)
513 {
514  v4uf res = a*b;
515  return res[0] + res[1] + res[2] + res[3];
516 }
517 
518 static SYS_FORCE_INLINE float
519 length(const v4uf &a)
520 {
521  return SYSsqrt(dot3(a, a));
522 }
523 
524 static SYS_FORCE_INLINE v4uf
525 normalize(const v4uf &a)
526 {
527  return a / length(a);
528 }
529 
530 static SYS_FORCE_INLINE v4uf
531 cross(const v4uf &a, const v4uf &b)
532 {
533  return v4uf(a[1]*b[2] - a[2]*b[1],
534  a[2]*b[0] - a[0]*b[2],
535  a[0]*b[1] - a[1]*b[0], 0);
536 }
537 
538 // Convert 4 fpreal32 to 4 fpreal16's, returned as an int64
539 #if defined(AMD64) && (defined(__SSE4_1__) || defined(_MSC_VER))
540 #include <smmintrin.h> // SSE 4.1
541 static SYS_FORCE_INLINE int64
542 VMconvert4F32ToF16(v4si input)
543 {
544  // Separate sign bit and the rest
545  __m128i sign_mask = _mm_set1_epi32(0x80000000);
546  __m128i sign = _mm_and_si128(sign_mask, input);
547  __m128i positive = _mm_andnot_si128(sign_mask, input); // order matters here
548 
549  // Move the sign bit to its new location.
550  sign = _mm_srli_epi32(sign,16);
551 
552  // Clamp the value to avoid problems below.
553  __m128i min_value = _mm_set1_epi32((127-15)<<23);
554  __m128i max_value = _mm_set1_epi32((127+16)<<23);
555  positive = _mm_max_epi32(positive, min_value); // SSE4.1
556  positive = _mm_min_epi32(positive, max_value); // SSE4.1
557 
558  // Isolate the bit that's going to determine whether we round up or down.
559  __m128i bit0_mask = _mm_set1_epi32(1);
560  __m128i round_bit = _mm_srli_epi32(positive, (23-10-1));
561  round_bit = _mm_and_si128(round_bit, bit0_mask);
562 
563  // Isolate the exponent, change it from excess-127 to excess-15,
564  // then shift it into its new location.
565  __m128i exponent_diff = _mm_set1_epi32(127-15);
566  __m128i exponent = _mm_srli_epi32(positive, 23);
567  exponent = _mm_sub_epi32(exponent, exponent_diff);
568  exponent = _mm_slli_epi32(exponent, 10);
569 
570  // Isolate the mantissa bits that we'll be keeping.
571  __m128i ten_bits_mask = _mm_set1_epi32(0x3FF);
572  positive = _mm_srli_epi32(positive, 23-10);
573  positive = _mm_and_si128(positive, ten_bits_mask);
574 
575  // Combine the exponent, mantissa, and sign
576  __m128i f16s = _mm_or_si128(exponent, positive);
577  f16s = _mm_or_si128(f16s, sign);
578 
579  // Add the rounding bit
580  f16s = _mm_add_epi32(f16s, round_bit);
581 
582  // Pack the 4 low 16 bits for each 32-bit integer into
583  // first 4 16-bit integers, (and last 4 16-bit integers,
584  // but we don't care about those.)
585  f16s = _mm_packus_epi32(f16s,f16s); // SSE4.1
586 
587  // Just extract out the first 4 16-bit integers.
588  return _mm_cvtsi128_si64(f16s);
589 }
590 #else
591 static SYS_FORCE_INLINE int64
592 VMconvert4F32ToF16(v4si input)
593 {
594  int64 result;
595  const fpreal32 * src = reinterpret_cast<const fpreal32 *>(&input);
596  fpreal16 * dst = reinterpret_cast<fpreal16 *>(&result);
597 
598  for (int i = 0; i < 4; ++i)
599  dst[i] = fpreal16(src[i]);
600 
601  return result;
602 }
603 #endif
604 static SYS_FORCE_INLINE int64
605 VMconvert4F32ToF16(v4sf input)
606 {
607  return VMconvert4F32ToF16(V4SI(input));
608 }
609 
610 // SYS version of everything
611 
612 static SYS_FORCE_INLINE v4uu
613 SYSmin(const v4uu &a, const v4uu &b)
614 {
615  return ternary(a < b, a, b);
616 }
617 static SYS_FORCE_INLINE v4uu
618 SYSmax(const v4uu &a, const v4uu &b)
619 {
620  return ternary(a > b, a, b);
621 }
622 static SYS_FORCE_INLINE v4uu
623 SYSclamp(const v4uu &a, const v4uu &b, const v4uu &c)
624 {
625  return SYSmax(SYSmin(a, c), b);
626 }
627 static SYS_FORCE_INLINE v4uu
628 SYSclamp(const v4uu &a, int b, int c)
629 {
630  return SYSmax(SYSmin(a, v4uu(c)), v4uu(b));
631 }
632 
633 static SYS_FORCE_INLINE v4uf
634 SYSmin(const v4uf &a, const v4uf &b)
635 {
636  return v4uf(VM_MIN(a.vector, b.vector));
637 }
638 
639 static SYS_FORCE_INLINE v4uf
640 SYSmax(const v4uf &a, const v4uf &b)
641 {
642  return v4uf(VM_MAX(a.vector, b.vector));
643 }
644 
645 static SYS_FORCE_INLINE v4uf
646 SYSlerp(const v4uf &a, const v4uf &b, const v4uf &t)
647 {
648  v4uf diff = b-a;
649  return madd(diff, t, a);
650 }
651 
652 static SYS_FORCE_INLINE v4uf
653 SYSlerp(const v4uf &a, const v4uf &b, const float t)
654 {
655  v4uf diff = b-a;
656  return madd(diff, t, a);
657 }
658 static SYS_FORCE_INLINE v4uf
659 SYSclamp(const v4uf &a, const v4uf &b, const v4uf &c)
660 {
661  return vmax(vmin(a, c), b);
662 }
663 
664 static SYS_FORCE_INLINE v4uf
665 SYSclamp(const v4uf &a, float b, float c)
666 {
667  return vmax(vmin(a, v4uf(c)), v4uf(b));
668 }
669 
670 static SYS_FORCE_INLINE v4uu
671 SYSfastFloor(const v4uf &a)
672 {
673  // The following are two versions of floor.
674  //
675  // 1.
676  // This floor is a bit problematic around 0. For example if x is a
677  // very small (in magnitude) denormal number, then floor(x) = -1
678  // and x - floor(x) = 1, which could be unexpected.
679  return ternary(a < v4uf(0.f), a.toSignedInt() - 1, a.toSignedInt());
680  //
681  // 2.
682  //
683  // This version is broken on some platforms. On thoth,
684  // SYSfastFloor(v4uf(-0.716626)) == v4ui(0)
685  //
686  //return a.floor();
687 
688 }
689 static SYS_FORCE_INLINE v4uu
690 SYSfloor(const v4uf &a)
691 {
692  return a.floor();
693 }
694 
695 // Currently there is no specific support for signed integers
696 typedef v4uu v4ui;
697 
// Assuming that ptr is an array of elements of type STYPE, this operation
// will return the index of the first element that is aligned to (1<<ASIZE)
// bytes. (Macro arguments are fully parenthesized so expressions like
// `base + off` or `SHIFT + 1` expand correctly.)
#define VM_ALIGN(ptr, ASIZE, STYPE) \
        ((((1<<(ASIZE))-(intptr_t)(ptr))&((1<<(ASIZE))-1))/sizeof(STYPE))
703 
704 #endif
SYS_FORCE_INLINE v4uf operator/(float r) const
Definition: VM_SIMD.h:255
SYS_FORCE_INLINE v4uu operator=(const v4uu &v)
Definition: VM_SIMD.h:64
#define SYSmax(a, b)
Definition: SYS_Math.h:1367
SYS_FORCE_INLINE v4uf operator+=(float r)
Definition: VM_SIMD.h:256
Mat3< typename promote< S, T >::type > operator*(S scalar, const Mat3< T > &m)
Multiply each element of the given matrix by scalar and return the result.
Definition: Mat3.h:609
SYS_FORCE_INLINE v4uf operator&&(const v4uf &r) const
Definition: VM_SIMD.h:274
v4uu v4ui
Definition: VM_SIMD.h:696
#define VM_FLOOR
#define VM_IMUL
Definition: VM_BasicFunc.h:359
SYS_FORCE_INLINE v4uu operator>=(const v4uu &v) const
Definition: VM_SIMD.h:91
SYS_FORCE_INLINE v4uu operator=(v4si v)
Definition: VM_SIMD.h:76
SYS_FORCE_INLINE v4uf operator^(const v4uu &r) const
Definition: VM_SIMD.h:267
#define VM_NEG
#define VM_CMPGT
SYS_FORCE_INLINE v4uf(const v4uf &v) noexcept
Definition: VM_SIMD.h:194
#define V4SI(A)
#define SYS_STATIC_ASSERT_MSG(expr, msg)
SYS_FORCE_INLINE v4uu operator||(const v4uu &r) const
Definition: VM_SIMD.h:151
SYS_FORCE_INLINE v4uf operator-() const
Definition: VM_SIMD.h:240
#define VM_ISUB
const GLdouble * v
Definition: glcorearb.h:836
SYS_FORCE_INLINE v4uu operator&&(const v4uu &r) const
Definition: VM_SIMD.h:153
SYS_FORCE_INLINE v4uu operator-(const v4uu &r) const
Definition: VM_SIMD.h:106
#define VM_CMPLE
SYS_FORCE_INLINE v4uu operator*(int32 r) const
Definition: VM_SIMD.h:125
virtual bool lerp(GA_AttributeOperand &d, GA_AttributeOperand &a, GA_AttributeOperand &b, GA_AttributeOperand &t) const
d = SYSlerp(a, b, t);
SYS_FORCE_INLINE v4uu toUnsignedInt() const
This is a lie, it is a signed int.
Definition: VM_SIMD.h:302
GLboolean GLboolean g
Definition: glcorearb.h:1221
#define VM_CMPGE
SYS_FORCE_INLINE v4uu(const int32 v[4])
Definition: VM_SIMD.h:68
SYS_FORCE_INLINE v4uu operator%=(const v4uu &r)
Definition: VM_SIMD.h:145
SYS_FORCE_INLINE v4uf operator=(v4sf v) noexcept
Definition: VM_SIMD.h:207
SYS_FORCE_INLINE v4uu operator*=(int32 r)
Definition: VM_SIMD.h:128
SYS_FORCE_INLINE v4uu operator+(const v4uu &r) const
Definition: VM_SIMD.h:104
#define VM_INSERT
Definition: VM_BasicFunc.h:341
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1221
GLint GLuint mask
Definition: glcorearb.h:123
SYS_FORCE_INLINE v4uf operator=(float v)
Definition: VM_SIMD.h:205
SYS_FORCE_INLINE v4uu operator==(const v4uu &v) const
Definition: VM_SIMD.h:83
SYS_FORCE_INLINE v4uu operator<(const v4uu &v) const
Definition: VM_SIMD.h:89
SYS_FORCE_INLINE v4uu(int32 v)
Definition: VM_SIMD.h:67
#define VM_ICMPGT
SYS_FORCE_INLINE v4uu operator%(const v4uu &r) const
Definition: VM_SIMD.h:131
SYS_FORCE_INLINE void operator|=(const v4uu &r)
Definition: VM_SIMD.h:164
SYS_FORCE_INLINE void condAssign(const v4uu &val, const v4uu &c)
Definition: VM_SIMD.h:79
UT_Matrix2T< T > SYSlerp(const UT_Matrix2T< T > &v1, const UT_Matrix2T< T > &v2, S t)
Definition: UT_Matrix2.h:595
SYS_FORCE_INLINE v4uu operator!() const
Definition: VM_SIMD.h:157
SYS_FORCE_INLINE v4uf operator-(float r) const
Definition: VM_SIMD.h:253
#define VM_SQRT
png_uint_32 i
Definition: png.h:2877
SYS_FORCE_INLINE v4uu operator+=(int32 r)
Definition: VM_SIMD.h:126
SYS_FORCE_INLINE v4uf abs() const
Definition: VM_SIMD.h:292
#define VM_ICMPLT
v4uu floor() const
Definition: VM_SIMD.h:305
SYS_FORCE_INLINE v4uf operator|(const v4uf &r) const
Definition: VM_SIMD.h:284
#define VM_EXTRACT
Definition: VM_BasicFunc.h:340
SYS_FORCE_INLINE v4uf operator*=(float r)
Definition: VM_SIMD.h:258
#define VM_MIN
long long int64
Definition: SYS_Types.h:107
GLfloat f
Definition: glcorearb.h:1925
SYS_FORCE_INLINE v4uf operator/(const v4uf &r) const
Definition: VM_SIMD.h:244
SYS_FORCE_INLINE v4uf(float a, float b, float c, float d)
Definition: VM_SIMD.h:201
SYS_FORCE_INLINE v4uf recip() const
Definition: VM_SIMD.h:299
SYS_FORCE_INLINE v4uu operator&(const v4uu &r) const
Definition: VM_SIMD.h:161
SYS_FORCE_INLINE v4uu operator+=(const v4uu &r)
Definition: VM_SIMD.h:120
SYS_FORCE_INLINE v4uu operator^(const v4uu &r) const
Definition: VM_SIMD.h:155
SYS_FORCE_INLINE v4uu operator=(int32 v)
Definition: VM_SIMD.h:74
SYS_FORCE_INLINE v4uf() noexcept=default
SYS_FORCE_INLINE v4uu(const v4si &v)
Definition: VM_SIMD.h:62
Definition: VM_SIMD.h:48
SYS_FORCE_INLINE v4uu toSignedInt() const
Definition: VM_SIMD.h:303
SYS_FORCE_INLINE v4uf operator&(const v4uf &r) const
Definition: VM_SIMD.h:285
#define VM_CMPLT
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
SYS_FORCE_INLINE v4uu operator<=(const v4uf &v) const
Definition: VM_SIMD.h:224
SYS_FORCE_INLINE v4uf operator=(const v4uf &v) noexcept
Definition: VM_SIMD.h:195
SYS_FORCE_INLINE v4uf operator!() const
Definition: VM_SIMD.h:269
#define VM_MUL
SYS_FORCE_INLINE v4uf operator||(const v4uf &r) const
Definition: VM_SIMD.h:272
Definition: VM_SIMD.h:180
SYS_FORCE_INLINE v4uu operator>(const v4uf &v) const
Definition: VM_SIMD.h:218
#define V4SF(A)
SYS_FORCE_INLINE void setComp(int idx, float v)
Definition: VM_SIMD.h:289
SYS_FORCE_INLINE v4uf(const float v[4])
Definition: VM_SIMD.h:199
#define VM_SPLATS
SYS_FORCE_INLINE v4uf clamp(float low, float high) const
Definition: VM_SIMD.h:296
SYS_FORCE_INLINE v4uu operator==(const v4uf &v) const
Definition: VM_SIMD.h:214
SYS_FORCE_INLINE v4uu operator*(const v4uu &r) const
Definition: VM_SIMD.h:109
SYS_FORCE_INLINE v4uu operator%(int r) const
Definition: VM_SIMD.h:138
#define VM_ABS
SYS_FORCE_INLINE v4uu operator-=(int32 r)
Definition: VM_SIMD.h:127
SYS_API fpreal32 SYSfloor(fpreal32 val)
SYS_FORCE_INLINE v4uf operator^(const v4uf &r) const
Definition: VM_SIMD.h:276
SYS_FORCE_INLINE v4uu operator-(int32 r) const
Definition: VM_SIMD.h:124
SYS_FORCE_INLINE float operator[](int idx) const
Definition: VM_SIMD.h:288
int sign(T a)
Definition: ImathFun.h:63
SYS_FORCE_INLINE void operator^=(const v4uu &r)
Definition: VM_SIMD.h:166
SYS_FORCE_INLINE v4uu()
Definition: VM_SIMD.h:61
int int32
Definition: SYS_Types.h:35
SYS_FORCE_INLINE void condAssign(const v4uf &val, const v4uu &c)
Definition: VM_SIMD.h:210
GridType::Ptr normalize(const GridType &grid, bool threaded, InterruptT *interrupt)
Normalize the vectors of the given vector-valued grid.
SYS_FORCE_INLINE v4uu operator<(const v4uf &v) const
Definition: VM_SIMD.h:220
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1221
SYS_FORCE_INLINE v4uu operator!=(const v4uu &v) const
Definition: VM_SIMD.h:85
#define VM_CMPNE
SYS_FORCE_INLINE v4uf operator*(float r) const
Definition: VM_SIMD.h:254
#define VM_SUB
SYS_FORCE_INLINE v4uu operator>=(const v4uf &v) const
Definition: VM_SIMD.h:222
unsigned short fpreal16
Definition: SYS_Types.h:198
SYS_FORCE_INLINE v4uu operator|(const v4uu &r) const
Definition: VM_SIMD.h:160
#define VM_IFLOAT
SYS_FORCE_INLINE v4uf operator*(const v4uf &r) const
Definition: VM_SIMD.h:242
SYS_FORCE_INLINE v4uf operator-(const v4uf &r) const
Definition: VM_SIMD.h:238
SYS_FORCE_INLINE void setComp(int idx, int32 v)
Definition: VM_SIMD.h:170
SYS_FORCE_INLINE v4uf operator&(const v4uu &r) const
Definition: VM_SIMD.h:280
SYS_FORCE_INLINE v4uu operator%=(int r)
Definition: VM_SIMD.h:146
GLenum GLenum dst
Definition: glcorearb.h:1792
SYS_FORCE_INLINE v4uu operator-=(const v4uu &r)
Definition: VM_SIMD.h:121
#define VM_MADD
#define VM_P_FLOOR()
SYS_FORCE_INLINE v4uu operator<=(const v4uu &v) const
Definition: VM_SIMD.h:93
SYS_FORCE_INLINE v4uu operator>(const v4uu &v) const
Definition: VM_SIMD.h:87
SYS_FORCE_INLINE v4uf operator&&(const v4uu &r) const
Definition: VM_SIMD.h:265
#define VM_INT
v4si vector
Definition: VM_SIMD.h:177
v4uu splitFloat()
Definition: VM_SIMD.h:315
GLuint GLfloat * val
Definition: glcorearb.h:1607
SYS_FORCE_INLINE v4uf clamp(const v4uf &low, const v4uf &high) const
Definition: VM_SIMD.h:293
SYS_FORCE_INLINE v4uu(int32 a, int32 b, int32 c, int32 d)
Definition: VM_SIMD.h:70
#define VM_ICMPEQ
#define VM_OR
#define VM_DIV
#define VM_ANDNOT
SYS_FORCE_INLINE v4uf swizzle() const
Definition: VM_SIMD.h:323
#define VM_MAX
#define VM_IADD
#define VM_CMPEQ
GLubyte GLubyte GLubyte GLubyte w
Definition: glcorearb.h:856
SYS_FORCE_INLINE int32 operator[](int idx) const
Definition: VM_SIMD.h:169
SYS_FORCE_INLINE v4uu operator+(int32 r) const
Definition: VM_SIMD.h:123
SYS_FORCE_INLINE v4uf operator/=(float r)
Definition: VM_SIMD.h:259
#define VM_ADD
SYS_FORCE_INLINE v4uf(float v)
Definition: VM_SIMD.h:198
SYS_FORCE_INLINE v4uu operator!=(const v4uf &v) const
Definition: VM_SIMD.h:216
GLboolean r
Definition: glcorearb.h:1221
SYS_FORCE_INLINE v4uf operator-=(const v4uf &r)
Definition: VM_SIMD.h:248
SYS_FORCE_INLINE v4uf operator~() const
Definition: VM_SIMD.h:281
#define VM_LOAD
Definition: VM_BasicFunc.h:343
SYS_FORCE_INLINE void operator&=(const v4uu &r)
Definition: VM_SIMD.h:165
SYS_FORCE_INLINE v4uf operator||(const v4uu &r) const
Definition: VM_SIMD.h:263
SYS_FORCE_INLINE v4uf operator+(float r) const
Definition: VM_SIMD.h:252
SYS_FORCE_INLINE v4uf operator*=(const v4uf &r)
Definition: VM_SIMD.h:249
SYS_FORCE_INLINE v4uf operator+=(const v4uf &r)
Definition: VM_SIMD.h:247
#define SYSmin(a, b)
Definition: SYS_Math.h:1368
SYS_FORCE_INLINE v4uf operator/=(const v4uf &r)
Definition: VM_SIMD.h:250
#define VM_AND
SYS_FORCE_INLINE v4uf operator-=(float r)
Definition: VM_SIMD.h:257
SYS_FORCE_INLINE v4uu(const v4uu &v)
Definition: VM_SIMD.h:63
float fpreal32
Definition: SYS_Types.h:191
SYS_FORCE_INLINE v4uf operator+(const v4uf &r) const
Definition: VM_SIMD.h:236
SYS_FORCE_INLINE v4uu operator~() const
Definition: VM_SIMD.h:162
SIM_DerVector3 cross(const SIM_DerVector3 &lhs, const SIM_DerVector3 &rhs)
SYS_FORCE_INLINE v4uf(const v4sf &v) noexcept
Definition: VM_SIMD.h:193
v4sf vector
Definition: VM_SIMD.h:336
v4uf toFloat() const
Definition: VM_SIMD.h:340
SYS_FORCE_INLINE v4uu isFinite() const
Definition: VM_SIMD.h:328
#define VM_E_FLOOR()
GLenum clamp
Definition: glcorearb.h:1233
#define VM_XOR
GLuint GLsizei GLsizei * length
Definition: glcorearb.h:794
GLenum src
Definition: glcorearb.h:1792
SYS_FORCE_INLINE v4uf operator|(const v4uu &r) const
Definition: VM_SIMD.h:279
SYS_FORCE_INLINE v4uu operator*=(const v4uu &r)
Definition: VM_SIMD.h:122
#define VM_INVERT