HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
VM_SIMD.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: VM_SIMD.C ( VM Library, C++)
7  *
8  * COMMENTS:
9  */
10 
11 #ifndef __VM_SIMD__
12 #define __VM_SIMD__
13 
14 #include <SYS/SYS_Inline.h>
15 #include <SYS/SYS_Math.h>
16 #include <SYS/SYS_StaticAssert.h>
17 #include <SYS/SYS_Types.h>
18 #include <SYS/SYS_TypeTraits.h>
19 #include <SYS/SYS_Visibility.h>
20 #include <cstdint>
21 
22 //#define FORCE_NON_SIMD
23 
24 // Some older versions of glibc don't correctly align the stack for extra
25 // threads (this problem should have been fixed in 2.3.4 - see
26 // http://sources.redhat.com/bugzilla/show_bug.cgi?id=723). Instead of
27 // checking the glibc version, check the compiler version and assume newer
28 // compilers are used with newer glibc.
29 
30 #if defined(FORCE_NON_SIMD)
31  #include "VM_BasicFunc.h"
32 #else
33  #if defined(CELLRSX) || defined(PPC)
34  #include "VM_AltivecFunc.h"
35  #elif defined(LINUX) && SYS_IS_GCC_GE(3, 4) && defined(__SSE2__)
36  #include "VM_SSEFunc.h"
37  #elif defined(WIN32)
38  #include "VM_SSEFunc.h"
39  #elif defined(MBSD_INTEL)
40  #include "VM_SSEFunc.h"
41  #else
42  #include "VM_BasicFunc.h"
43  #endif
44 #endif
45 
46 class v4uf;
47 
/// 4-lane SIMD vector of int32, also used as the lane-mask type returned
/// by comparisons (each lane is all-bits-set or all-bits-clear).
/// All operations forward to the VM_* macros selected by the platform
/// includes at the top of this file.
class v4uu {
public:
    // NOTE: For unknown reasons, BVH construction is significantly
    // slower on GCC 5.4 if v4uu and v4uf are POD types, so I put
    // back the previous code.
#if 0
    SYS_FORCE_INLINE v4uu() noexcept = default;
    SYS_FORCE_INLINE v4uu(const v4si &v) : vector(v) {
    SYS_STATIC_ASSERT_MSG(SYSisPOD<v4uu>(), "v4uu should be POD, for better performance in UT_Array, etc.");
    }
    SYS_FORCE_INLINE v4uu(const v4uu &v) noexcept = default;
    SYS_FORCE_INLINE v4uu &operator=(const v4uu &v) noexcept = default;
#else
    // NOTE(review): the default-constructor line (original line 61) was
    // lost in extraction here.
    SYS_FORCE_INLINE v4uu(const v4si &v) : vector(v) {}
    SYS_FORCE_INLINE v4uu(const v4uu &v) : vector(v.vector) {}
    SYS_FORCE_INLINE v4uu operator=(const v4uu &v)
    { vector = v.vector; return *this; }
#endif
    // Broadcast one scalar into all four lanes.
    explicit SYS_FORCE_INLINE v4uu(int32 v) { vector = VM_SPLATS(v); }
    // Load four lanes from memory.
    explicit SYS_FORCE_INLINE v4uu(const int32 v[4])
    { vector = VM_LOAD(v); }
    // NOTE(review): the signature of the per-lane constructor (original
    // line 70, "v4uu(int32 a, int32 b, int32 c, int32 d)") was lost in
    // extraction; the body below belongs to it.
    { vector = VM_SPLATS(a, b, c, d); }

    // store (readback)
    SYS_FORCE_INLINE void store(int32 v[4]) const
    { VM_STORE(v, vector); }

    // Assignment
    // NOTE(review): the signatures for operator=(int32) (original line 78)
    // and operator=(v4si) (original line 80) were lost in extraction;
    // the two bodies below belong to them.
    { vector = v4uu(v).vector; return *this; }
    { vector = v; return *this; }

    // Lane-wise select: where mask c is set take val, otherwise keep *this.
    SYS_FORCE_INLINE void condAssign(const v4uu &val, const v4uu &c)
    { *this = (c & val) | ((!c) & *this); }

    // Comparison: each operator returns a per-lane mask
    // (all bits set in lanes where the comparison holds).
    SYS_FORCE_INLINE v4uu operator == (const v4uu &v) const
    { return v4uu(VM_ICMPEQ(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator != (const v4uu &v) const
    { return ~(*this == v); }
    SYS_FORCE_INLINE v4uu operator > (const v4uu &v) const
    { return v4uu(VM_ICMPGT(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator < (const v4uu &v) const
    { return v4uu(VM_ICMPLT(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator >= (const v4uu &v) const
    { return ~(*this < v); }
    SYS_FORCE_INLINE v4uu operator <= (const v4uu &v) const
    { return ~(*this > v); }

    // Scalar comparisons broadcast the scalar, then compare lane-wise.
    SYS_FORCE_INLINE v4uu operator == (int32 v) const { return *this == v4uu(v); }
    SYS_FORCE_INLINE v4uu operator != (int32 v) const { return *this != v4uu(v); }
    SYS_FORCE_INLINE v4uu operator > (int32 v) const { return *this > v4uu(v); }
    SYS_FORCE_INLINE v4uu operator < (int32 v) const { return *this < v4uu(v); }
    SYS_FORCE_INLINE v4uu operator >= (int32 v) const { return *this >= v4uu(v); }
    SYS_FORCE_INLINE v4uu operator <= (int32 v) const { return *this <= v4uu(v); }

    // BitShifting
    // NOTE(review): the shift operators (original lines 108-109) were
    // lost in extraction here.

    // Basic math (per-lane).
    SYS_FORCE_INLINE v4uu operator+(const v4uu &r) const
    { return v4uu(VM_IADD(vector, r.vector)); }
    SYS_FORCE_INLINE v4uu operator-(const v4uu &r) const
    { return v4uu(VM_ISUB(vector, r.vector)); }
#if defined(VM_IMUL)
    SYS_FORCE_INLINE v4uu operator*(const v4uu &r) const
    { return v4uu(VM_IMUL(vector, r.vector)); }
#else
    // Fallback when the platform has no SIMD integer multiply:
    // multiply lane by lane through extract/rebuild.
    SYS_FORCE_INLINE v4uu operator*(const v4uu &r) const
    {
    return v4uu((*this)[0] * r[0],
    (*this)[1] * r[1],
    (*this)[2] * r[2],
    (*this)[3] * r[3]);
    }
#endif
    SYS_FORCE_INLINE v4uu operator+=(const v4uu &r) { return (*this = *this + r); }
    SYS_FORCE_INLINE v4uu operator-=(const v4uu &r) { return (*this = *this - r); }
    SYS_FORCE_INLINE v4uu operator*=(const v4uu &r) { return (*this = *this * r); }
    SYS_FORCE_INLINE v4uu operator+(int32 r) const { return *this + v4uu(r); }
    SYS_FORCE_INLINE v4uu operator-(int32 r) const { return *this - v4uu(r); }
    SYS_FORCE_INLINE v4uu operator*(int32 r) const { return *this * v4uu(r); }
    SYS_FORCE_INLINE v4uu operator+=(int32 r) { return (*this = *this + r); }
    SYS_FORCE_INLINE v4uu operator-=(int32 r) { return (*this = *this - r); }
    SYS_FORCE_INLINE v4uu operator*=(int32 r) { return (*this = *this * r); }

    // Modulo: done lane by lane (no SIMD integer division available).
    SYS_FORCE_INLINE v4uu operator%(const v4uu &r) const
    {
    return v4uu((*this)[0] % r[0],
    (*this)[1] % r[1],
    (*this)[2] % r[2],
    (*this)[3] % r[3]);
    }
    SYS_FORCE_INLINE v4uu operator%(int r) const
    {
    return v4uu((*this)[0] % r,
    (*this)[1] % r,
    (*this)[2] % r,
    (*this)[3] % r);
    }
    SYS_FORCE_INLINE v4uu operator%=(const v4uu &r) { return (*this = *this % r); }
    SYS_FORCE_INLINE v4uu operator%=(int r) { return (*this = *this % r); }


    // logical/bitwise
    // NOTE: || and && are plain bitwise OR/AND on the lane masks;
    // there is no short-circuiting.

    SYS_FORCE_INLINE v4uu operator||(const v4uu &r) const
    { return v4uu(VM_OR(vector, r.vector)); }
    SYS_FORCE_INLINE v4uu operator&&(const v4uu &r) const
    { return v4uu(VM_AND(vector, r.vector)); }
    SYS_FORCE_INLINE v4uu operator^(const v4uu &r) const
    { return v4uu(VM_XOR(vector, r.vector)); }
    // NOTE(review): the signature of operator! (original line 165) was
    // lost in extraction; the body below belongs to it. It yields an
    // all-bits mask in lanes that are zero.
    { return *this == v4uu(0); }

    SYS_FORCE_INLINE v4uu operator|(const v4uu &r) const { return *this || r; }
    SYS_FORCE_INLINE v4uu operator&(const v4uu &r) const { return *this && r; }
    // NOTE(review): the signature of operator~ (original line 170) was
    // lost in extraction; the body below belongs to it (bit complement
    // via XOR with all ones).
    { return *this ^ v4uu(0xFFFFFFFF); }
    SYS_FORCE_INLINE void operator|=(const v4uu &r) { vector = VM_OR(vector, r.vector); }
    SYS_FORCE_INLINE void operator&=(const v4uu &r) { vector = VM_AND(vector, r.vector); }
    SYS_FORCE_INLINE void operator^=(const v4uu &r) { vector = VM_XOR(vector, r.vector); }

    // component access: extract / insert a single lane
    SYS_FORCE_INLINE int32 operator[](int idx) const { return VM_EXTRACT(vector, idx); }
    SYS_FORCE_INLINE void setComp(int idx, int32 v) { vector = VM_INSERT(vector, v, idx); }

    // Lane-wise int32 -> float conversion (defined after v4uf below).
    v4uf toFloat() const;

    operator v4uf() const;

public:
    // NOTE(review): the data member (original line 185, "v4si vector;")
    // was lost in extraction here.
};
187 
/// 4-lane SIMD vector of float. Comparisons return v4uu lane masks.
/// All operations forward to the VM_* macros selected by the platform
/// includes at the top of this file.
class v4uf {
public:
    SYS_FORCE_INLINE v4uf() noexcept = default;
    // NOTE: For unknown reasons, BVH construction is significantly
    // slower on GCC 5.4 if v4uu and v4uf are POD types, so I put
    // back the previous code.
#if 0
    SYS_FORCE_INLINE v4uf(const v4uf &v) noexcept = default;
    SYS_FORCE_INLINE v4uf(const v4sf &v) noexcept : vector(v) {
    SYS_STATIC_ASSERT_MSG(SYSisPOD<v4uf>(), "v4uf should be POD, for better performance in UT_Array, etc.");
    }
    SYS_FORCE_INLINE v4uf &operator=(const v4uf &v) noexcept = default;
#else
    SYS_FORCE_INLINE v4uf(const v4sf &v) noexcept : vector(v) {}
    SYS_FORCE_INLINE v4uf(const v4uf &v) noexcept : vector(v.vector) {}
    SYS_FORCE_INLINE v4uf operator=(const v4uf &v) noexcept
    { vector = v.vector; return *this; }
#endif
    // Broadcast one scalar into all four lanes.
    explicit SYS_FORCE_INLINE v4uf(float v) { vector = VM_SPLATS(v); }
    // Load four lanes from memory.
    explicit SYS_FORCE_INLINE v4uf(const float v[4])
    { vector = VM_LOAD(v); }
    SYS_FORCE_INLINE v4uf(float a, float b, float c, float d)
    { vector = VM_SPLATS(a, b, c, d); }

    // store (readback)
    SYS_FORCE_INLINE void store(float v[4]) const
    { VM_STORE(v, vector); }

    // Assignment
    // NOTE(review): the signatures for operator=(float) (original line
    // 217) and operator=(v4sf) (original line 219) were lost in
    // extraction; the two bodies below belong to them.
    { vector = v4uf(v).vector; return *this; }
    { vector = v; return *this; }

    // Lane-wise select: where mask c is set take val, otherwise keep *this.
    SYS_FORCE_INLINE void condAssign(const v4uf &val, const v4uu &c)
    { *this = (val & c) | (*this & ~c); }

    // Comparison: each operator returns a per-lane v4uu mask.
    SYS_FORCE_INLINE v4uu operator == (const v4uf &v) const
    { return v4uu(VM_CMPEQ(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator != (const v4uf &v) const
    { return v4uu(VM_CMPNE(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator > (const v4uf &v) const
    { return v4uu(VM_CMPGT(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator < (const v4uf &v) const
    { return v4uu(VM_CMPLT(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator >= (const v4uf &v) const
    { return v4uu(VM_CMPGE(vector, v.vector)); }
    SYS_FORCE_INLINE v4uu operator <= (const v4uf &v) const
    { return v4uu(VM_CMPLE(vector, v.vector)); }

    // Scalar comparisons broadcast the scalar, then compare lane-wise.
    SYS_FORCE_INLINE v4uu operator == (float v) const { return *this == v4uf(v); }
    SYS_FORCE_INLINE v4uu operator != (float v) const { return *this != v4uf(v); }
    SYS_FORCE_INLINE v4uu operator > (float v) const { return *this > v4uf(v); }
    SYS_FORCE_INLINE v4uu operator < (float v) const { return *this < v4uf(v); }
    SYS_FORCE_INLINE v4uu operator >= (float v) const { return *this >= v4uf(v); }
    SYS_FORCE_INLINE v4uu operator <= (float v) const { return *this <= v4uf(v); }


    // Basic math
    // NOTE(review): the signatures for binary operator+, operator-,
    // unary operator-, operator* and operator/ (original lines 248-256)
    // were lost in extraction; the bodies below belong to them
    // (ADD, SUB, NEG, MUL, DIV respectively).
    { return v4uf(VM_ADD(vector, r.vector)); }
    { return v4uf(VM_SUB(vector, r.vector)); }
    { return v4uf(VM_NEG(vector)); }
    { return v4uf(VM_MUL(vector, r.vector)); }
    { return v4uf(VM_DIV(vector, r.vector)); }

    SYS_FORCE_INLINE v4uf operator+=(const v4uf &r) { return (*this = *this + r); }
    SYS_FORCE_INLINE v4uf operator-=(const v4uf &r) { return (*this = *this - r); }
    SYS_FORCE_INLINE v4uf operator*=(const v4uf &r) { return (*this = *this * r); }
    SYS_FORCE_INLINE v4uf operator/=(const v4uf &r) { return (*this = *this / r); }

    SYS_FORCE_INLINE v4uf operator+(float r) const { return *this + v4uf(r); }
    SYS_FORCE_INLINE v4uf operator-(float r) const { return *this - v4uf(r); }
    SYS_FORCE_INLINE v4uf operator*(float r) const { return *this * v4uf(r); }
    SYS_FORCE_INLINE v4uf operator/(float r) const { return *this / v4uf(r); }
    SYS_FORCE_INLINE v4uf operator+=(float r) { return (*this = *this + r); }
    SYS_FORCE_INLINE v4uf operator-=(float r) { return (*this = *this - r); }
    SYS_FORCE_INLINE v4uf operator*=(float r) { return (*this = *this * r); }
    SYS_FORCE_INLINE v4uf operator/=(float r) { return (*this = *this / r); }

    // logical/bitwise: plain bitwise ops on the float bit patterns
    // (no short-circuiting).

    SYS_FORCE_INLINE v4uf operator||(const v4uu &r) const
    { return v4uf(V4SF(VM_OR(V4SI(vector), r.vector))); }
    SYS_FORCE_INLINE v4uf operator&&(const v4uu &r) const
    { return v4uf(V4SF(VM_AND(V4SI(vector), r.vector))); }
    SYS_FORCE_INLINE v4uf operator^(const v4uu &r) const
    { return v4uf(V4SF(VM_XOR(V4SI(vector), r.vector))); }
    // NOTE(review): the signature of operator! (original line 281) was
    // lost in extraction; the body below belongs to it.
    { return v4uf(V4SF((*this == v4uf(0.0F)).vector)); }

    // NOTE(review): the signatures of operator||, operator&& and
    // operator^ taking a v4uf (original lines 284, 286, 288) were lost
    // in extraction; the bodies below belong to them.
    { return v4uf(V4SF(VM_OR(V4SI(vector), V4SI(r.vector)))); }
    { return v4uf(V4SF(VM_AND(V4SI(vector), V4SI(r.vector)))); }
    { return v4uf(V4SF(VM_XOR(V4SI(vector), V4SI(r.vector)))); }

    SYS_FORCE_INLINE v4uf operator|(const v4uu &r) const { return *this || r; }
    SYS_FORCE_INLINE v4uf operator&(const v4uu &r) const { return *this && r; }
    // NOTE(review): the signature of operator~ (original line 293) was
    // lost in extraction; the body below belongs to it.
    { return *this ^ v4uu(0xFFFFFFFF); }

    SYS_FORCE_INLINE v4uf operator|(const v4uf &r) const { return *this || r; }
    SYS_FORCE_INLINE v4uf operator&(const v4uf &r) const { return *this && r; }

    // component access: extract / insert a single lane
    SYS_FORCE_INLINE float operator[](int idx) const { return VM_EXTRACT(vector, idx); }
    SYS_FORCE_INLINE void setComp(int idx, float v) { vector = VM_INSERT(vector, v, idx); }

    // more math
    SYS_FORCE_INLINE v4uf abs() const { return v4uf(VM_ABS(vector)); }
    SYS_FORCE_INLINE v4uf clamp(const v4uf &low, const v4uf &high) const
    { return v4uf(
    VM_MIN(VM_MAX(vector, low.vector), high.vector)); }
    SYS_FORCE_INLINE v4uf clamp(float low, float high) const
    { return v4uf(VM_MIN(VM_MAX(vector,
    v4uf(low).vector), v4uf(high).vector)); }
    // NOTE(review): recip() (original line 311) was lost in extraction
    // here.

    /// This is a lie, it is a signed int.
    SYS_FORCE_INLINE v4uu toUnsignedInt() const { return VM_INT(vector); }
    SYS_FORCE_INLINE v4uu toSignedInt() const { return VM_INT(vector); }

    // Lane-wise floor, bracketed by the platform's rounding-mode
    // save/restore macros (VM_P_FLOOR / VM_E_FLOOR).
    v4uu floor() const
    {
    VM_P_FLOOR();
    v4uu result = VM_FLOOR(vector);
    VM_E_FLOOR();
    return result;
    }

    /// Returns the integer part of this float, this becomes the
    /// 0..1 fractional component.
    v4uu splitFloat()
    {
    v4uu base = toSignedInt();
    *this -= base.toFloat();
    return base;
    }

    // Compile-time lane shuffle.
    // NOTE(review): the signature line (original line 335,
    // "SYS_FORCE_INLINE v4uf swizzle() const") was lost in extraction;
    // the body below belongs to it.
    template <int A, int B, int C, int D>
    {
    return VM_SHUFFLE<A,B,C,D>(vector);
    }

    // NOTE(review): the signature line (original line 340, a
    // finiteness test returning a per-lane v4uu mask) was lost in
    // extraction; the body below belongs to it.
    {
    // If the exponent is the maximum value, it's either infinite or NaN.
    const v4si mask = VM_SPLATS(0x7F800000);
    return ~v4uu(VM_ICMPEQ(VM_AND(V4SI(vector), mask), mask));
    }

public:
    // NOTE(review): the data member (original line 348, "v4sf vector;")
    // was lost in extraction here.
};
350 
// NOTE(review): the signature lines for v4uu::toFloat() (original lines
// 351-352) were lost in extraction; this is its body — lane-wise
// int32 -> float conversion via VM_IFLOAT.
{
    return v4uf(VM_IFLOAT(vector));
}
// Implicit conversion to a float vector, forwarding to toFloat().
// NOTE(review): the specifier line before this definition (original
// line 356) was lost in extraction.
v4uu::operator v4uf() const
{
    return toFloat();
}
361 
362 static SYS_FORCE_INLINE v4uf
363 operator+(float r, const v4uf &v) noexcept
364 {
365  return v4uf(r) + v;
366 }
367 
368 static SYS_FORCE_INLINE v4uf
369 operator-(float r, const v4uf &v) noexcept
370 {
371  return v4uf(r) - v;
372 }
373 
374 static SYS_FORCE_INLINE v4uf
375 operator*(float r, const v4uf &v) noexcept
376 {
377  return v4uf(r) * v;
378 }
379 static SYS_FORCE_INLINE v4uf
380 operator/(float r, const v4uf &v) noexcept
381 {
382  return v4uf(r) / v;
383 }
384 
385 //
386 // Bitcast
387 //
388 static SYS_FORCE_INLINE v4uf
389 bitCastIntToFloat(const v4uu &v) { return V4SF(v.vector); }
390 
391 static SYS_FORCE_INLINE v4uu
392 bitCastFloatToInt(const v4uf &v) { return V4SI(v.vector); }
393 
394 //
395 // Custom vector operations
396 //
397 
398 static SYS_FORCE_INLINE v4uf
399 sqrt(const v4uf &a)
400 {
401  return v4uf(VM_SQRT(a.vector));
402 }
403 
404 static SYS_FORCE_INLINE v4uf
405 fabs(const v4uf &a)
406 {
407  return a.abs();
408 }
409 
410 // Use this operation to mask disabled values to 0
411 // rval = !a ? b : 0;
412 
413 static SYS_FORCE_INLINE v4uf
414 andn(const v4uu &a, const v4uf &b)
415 {
416  return v4uf(V4SF(VM_ANDNOT(a.vector, V4SI(b.vector))));
417 }
418 
419 static SYS_FORCE_INLINE v4uu
420 andn(const v4uu &a, const v4uu &b)
421 {
422  return v4uu(VM_ANDNOT(a.vector, b.vector));
423 }
424 
425 // rval = a ? b : c;
426 static SYS_FORCE_INLINE v4uf
427 ternary(const v4uu &a, const v4uf &b, const v4uf &c)
428 {
429  return (b & a) | andn(a, c);
430 }
431 
432 static SYS_FORCE_INLINE v4uu
433 ternary(const v4uu &a, const v4uu &b, const v4uu &c)
434 {
435  return (b & a) | andn(a, c);
436 }
437 
438 // rval = !(a && b)
439 static SYS_FORCE_INLINE v4uu
440 nand(const v4uu &a, const v4uu &b)
441 {
442  return !v4uu(VM_AND(a.vector, b.vector));
443 }
444 
445 static SYS_FORCE_INLINE v4uf
446 vmin(const v4uf &a, const v4uf &b)
447 {
448  return v4uf(VM_MIN(a.vector, b.vector));
449 }
450 
451 static SYS_FORCE_INLINE v4uf
452 vmax(const v4uf &a, const v4uf &b)
453 {
454  return v4uf(VM_MAX(a.vector, b.vector));
455 }
456 
457 static SYS_FORCE_INLINE v4uf
458 clamp(const v4uf &a, const v4uf &b, const v4uf &c)
459 {
460  return vmax(vmin(a, c), b);
461 }
462 
463 static SYS_FORCE_INLINE v4uf
464 clamp(const v4uf &a, float b, float c)
465 {
466  return vmax(vmin(a, v4uf(c)), v4uf(b));
467 }
468 
469 // Returns an integer with the lower 4 bits set
470 // if the corresponding sign bit in a is set.
471 // Useful after comparisons as comparisons will
472 // be -1 or 0; so no need for an allbits test.
473 // 1 is a[0], 2 is a[1], and 8 is a[3].
474 static SYS_FORCE_INLINE int
475 signbits(const v4uu &a)
476 {
477  return vm_signbits(a.vector);
478 }
479 
480 static SYS_FORCE_INLINE int
481 signbits(const v4uf &a)
482 {
483  return vm_signbits(a.vector);
484 }
485 
486 static SYS_FORCE_INLINE bool
487 allbits(const v4uu &a)
488 {
489  return vm_allbits(a.vector);
490 }
491 
492 static SYS_FORCE_INLINE bool
493 anybits(const v4uu &a)
494 {
495  return !allbits(~a);
496 }
497 
498 static SYS_FORCE_INLINE v4uf
499 madd(const v4uf &v, const v4uf &f, const v4uf &a)
500 {
501  return v4uf(VM_MADD(v.vector, f.vector, a.vector));
502 }
503 
504 static SYS_FORCE_INLINE v4uf
505 madd(const v4uf &v, float f, float a)
506 {
507  return v4uf(VM_MADD(v.vector, v4uf(f).vector, v4uf(a).vector));
508 }
509 
510 static SYS_FORCE_INLINE v4uf
511 madd(const v4uf &v, float f, const v4uf &a)
512 {
513  return v4uf(VM_MADD(v.vector, v4uf(f).vector, a.vector));
514 }
515 
516 static SYS_FORCE_INLINE v4uf
517 msub(const v4uf &v, const v4uf &f, const v4uf &s)
518 {
519  return madd(v, f, -s);
520 }
521 
522 static SYS_FORCE_INLINE v4uf
523 msub(const v4uf &v, float f, float s)
524 {
525  return madd(v, f, -s);
526 }
527 
528 static SYS_FORCE_INLINE v4uf
529 lerp(const v4uf &a, const v4uf &b, const v4uf &w)
530 {
531  v4uf w1 = v4uf(1.0F) - w;
532  return madd(a, w1, b*w);
533 }
534 
535 static SYS_FORCE_INLINE v4uf
536 luminance(const v4uf &r, const v4uf &g, const v4uf &b,
537  float rw, float gw, float bw)
538 {
539  return v4uf(madd(r, v4uf(rw), madd(g, v4uf(gw), b * bw)));
540 }
541 
542 static SYS_FORCE_INLINE float
543 dot3(const v4uf &a, const v4uf &b)
544 {
545  v4uf res = a*b;
546  return res[0] + res[1] + res[2];
547 }
548 
549 static SYS_FORCE_INLINE float
550 dot4(const v4uf &a, const v4uf &b)
551 {
552  v4uf res = a*b;
553  return res[0] + res[1] + res[2] + res[3];
554 }
555 
556 static SYS_FORCE_INLINE float
557 length(const v4uf &a)
558 {
559  return SYSsqrt(dot3(a, a));
560 }
561 
562 static SYS_FORCE_INLINE v4uf
563 normalize(const v4uf &a)
564 {
565  return a / length(a);
566 }
567 
568 static SYS_FORCE_INLINE v4uf
569 cross(const v4uf &a, const v4uf &b)
570 {
571  return v4uf(a[1]*b[2] - a[2]*b[1],
572  a[2]*b[0] - a[0]*b[2],
573  a[0]*b[1] - a[1]*b[0], 0);
574 }
575 
// Convert 4 fpreal32 to 4 fpreal16's, returned as an int64
#if defined(AMD64) && (defined(__SSE4_1__) || defined(_MSC_VER))
#include <smmintrin.h> // SSE 4.1
// Branch-free SIMD float->half conversion: split off the sign, rebias the
// exponent from excess-127 to excess-15, round-to-nearest on the dropped
// mantissa bit, and pack the four 16-bit results into one int64.
static SYS_FORCE_INLINE int64
VMconvert4F32ToF16(v4si input)
{
    // Separate sign bit and the rest
    __m128i sign_mask = _mm_set1_epi32(0x80000000);
    __m128i sign = _mm_and_si128(sign_mask, input);
    __m128i positive = _mm_andnot_si128(sign_mask, input); // order matters here

    // Move the sign bit to its new location.
    sign = _mm_srli_epi32(sign,16);

    // Clamp the value to avoid problems below.
    // [min_value, max_value] is the float range representable once the
    // exponent is rebiased; clamping first keeps the later arithmetic
    // from under/overflowing the 5-bit half exponent.
    __m128i min_value = _mm_set1_epi32((127-15)<<23);
    __m128i max_value = _mm_set1_epi32((127+16)<<23);
    positive = _mm_max_epi32(positive, min_value); // SSE4.1
    positive = _mm_min_epi32(positive, max_value); // SSE4.1

    // Isolate the bit that's going to determine whether we round up or down.
    __m128i bit0_mask = _mm_set1_epi32(1);
    __m128i round_bit = _mm_srli_epi32(positive, (23-10-1));
    round_bit = _mm_and_si128(round_bit, bit0_mask);

    // Isolate the exponent, change it from excess-127 to excess-15,
    // then shift it into its new location.
    __m128i exponent_diff = _mm_set1_epi32(127-15);
    __m128i exponent = _mm_srli_epi32(positive, 23);
    exponent = _mm_sub_epi32(exponent, exponent_diff);
    exponent = _mm_slli_epi32(exponent, 10);

    // Isolate the mantissa bits that we'll be keeping.
    __m128i ten_bits_mask = _mm_set1_epi32(0x3FF);
    positive = _mm_srli_epi32(positive, 23-10);
    positive = _mm_and_si128(positive, ten_bits_mask);

    // Combine the exponent, mantissa, and sign
    __m128i f16s = _mm_or_si128(exponent, positive);
    f16s = _mm_or_si128(f16s, sign);

    // Add the rounding bit
    f16s = _mm_add_epi32(f16s, round_bit);

    // Pack the 4 low 16 bits for each 32-bit integer into
    // first 4 16-bit integers, (and last 4 16-bit integers,
    // but we don't care about those.)
    f16s = _mm_packus_epi32(f16s,f16s); // SSE4.1

    // Just extract out the first 4 16-bit integers.
    return _mm_cvtsi128_si64(f16s);
}
#else
// Portable fallback: convert one lane at a time through the scalar
// fpreal32 -> fpreal16 constructor.
// NOTE(review): this reads the vector and writes the result through
// reinterpret_cast type punning, which relies on implementation-defined
// behavior; kept as-is to match the original.
static SYS_FORCE_INLINE int64
VMconvert4F32ToF16(v4si input)
{
    int64 result;
    const fpreal32 * src = reinterpret_cast<const fpreal32 *>(&input);
    fpreal16 * dst = reinterpret_cast<fpreal16 *>(&result);

    for (int i = 0; i < 4; ++i)
        dst[i] = fpreal16(src[i]);

    return result;
}
#endif
642 static SYS_FORCE_INLINE int64
643 VMconvert4F32ToF16(v4sf input)
644 {
645  return VMconvert4F32ToF16(V4SI(input));
646 }
647 
648 // SYS version of everything
649 
650 static SYS_FORCE_INLINE v4uu
651 SYSmin(const v4uu &a, const v4uu &b)
652 {
653  return ternary(a < b, a, b);
654 }
655 static SYS_FORCE_INLINE v4uu
656 SYSmax(const v4uu &a, const v4uu &b)
657 {
658  return ternary(a > b, a, b);
659 }
660 static SYS_FORCE_INLINE v4uu
661 SYSclamp(const v4uu &a, const v4uu &b, const v4uu &c)
662 {
663  return SYSmax(SYSmin(a, c), b);
664 }
665 static SYS_FORCE_INLINE v4uu
666 SYSclamp(const v4uu &a, int b, int c)
667 {
668  return SYSmax(SYSmin(a, v4uu(c)), v4uu(b));
669 }
670 
671 static SYS_FORCE_INLINE v4uf
672 SYSmin(const v4uf &a, const v4uf &b)
673 {
674  return v4uf(VM_MIN(a.vector, b.vector));
675 }
676 
677 static SYS_FORCE_INLINE v4uf
678 SYSmax(const v4uf &a, const v4uf &b)
679 {
680  return v4uf(VM_MAX(a.vector, b.vector));
681 }
682 
683 static SYS_FORCE_INLINE v4uf
684 SYSlerp(const v4uf &a, const v4uf &b, const v4uf &t)
685 {
686  v4uf diff = b-a;
687  return madd(diff, t, a);
688 }
689 
690 static SYS_FORCE_INLINE v4uf
691 SYSlerp(const v4uf &a, const v4uf &b, const float t)
692 {
693  v4uf diff = b-a;
694  return madd(diff, t, a);
695 }
696 static SYS_FORCE_INLINE v4uf
697 SYSclamp(const v4uf &a, const v4uf &b, const v4uf &c)
698 {
699  return vmax(vmin(a, c), b);
700 }
701 
702 static SYS_FORCE_INLINE v4uf
703 SYSclamp(const v4uf &a, float b, float c)
704 {
705  return vmax(vmin(a, v4uf(c)), v4uf(b));
706 }
707 
708 static SYS_FORCE_INLINE v4uf
709 sin(const v4uf &x)
710 {
711  return (v4uf)VM_SIN(x.vector);
712 }
713 
714 static SYS_FORCE_INLINE v4uf
715 cos(const v4uf &x)
716 {
717  return (v4uf)VM_COS(x.vector);
718 }
719 
720 static SYS_FORCE_INLINE v4uf
721 tan(const v4uf &x)
722 {
723  return (v4uf)VM_TAN(x.vector);
724 }
725 
726 static SYS_FORCE_INLINE void
727 sincos(const v4uf &x, v4uf *s, v4uf *c)
728 {
729  VM_SINCOS(x.vector, &(s->vector), &(c->vector));
730 }
731 
732 static SYS_FORCE_INLINE v4uu
733 SYSfastFloor(const v4uf &a)
734 {
735  // The following are two versions of floor.
736  //
737  // 1.
738  // This floor is a bit problematic around 0. For example if x is a
739  // very small (in magnitude) denormal number, then floor(x) = -1
740  // and x - floor(x) = 1, which could be unexpected.
741  return ternary(a < v4uf(0.f), a.toSignedInt() - 1, a.toSignedInt());
742  //
743  // 2.
744  //
745  // This version is broken on some platforms. On thoth,
746  // SYSfastFloor(v4uf(-0.716626)) == v4ui(0)
747  //
748  //return a.floor();
749 
750 }
751 static SYS_FORCE_INLINE v4uu
752 SYSfloor(const v4uf &a)
753 {
754  return a.floor();
755 }
756 
// Currently there is no specific support for signed integers
// NOTE(review): v4ui is a plain alias of v4uu (whose lanes are int32),
// not a distinct unsigned type.
typedef v4uu v4ui;
759 
// Assuming that ptr is an array of elements of type STYPE, this operation
// will return the index of the first element that is aligned to (1<<ASIZE)
// bytes.
// All macro parameters are fully parenthesized so that callers may pass
// expressions (e.g. "base + offset" for ptr): the previous expansion
// applied the (intptr_t) cast only to the first operand of such an
// expression, because a cast binds tighter than binary '+'/'-'.
#define VM_ALIGN(ptr, ASIZE, STYPE) \
	((((1<<(ASIZE))-(intptr_t)(ptr))&((1<<(ASIZE))-1))/sizeof(STYPE))
765 
766 #endif
GLdouble s
Definition: glew.h:1390
SYS_FORCE_INLINE v4uf operator/(float r) const
Definition: VM_SIMD.h:267
SYS_FORCE_INLINE v4uu operator=(const v4uu &v)
Definition: VM_SIMD.h:64
#define SYSmax(a, b)
Definition: SYS_Math.h:1447
#define VM_SIN
SYS_FORCE_INLINE v4uf operator+=(float r)
Definition: VM_SIMD.h:268
Mat3< typename promote< S, T >::type > operator*(S scalar, const Mat3< T > &m)
Multiply each element of the given matrix by scalar and return the result.
Definition: Mat3.h:609
SYS_API double cos(double x)
Definition: SYS_FPUMath.h:69
GLenum GLenum GLenum input
Definition: glew.h:13879
SYS_FORCE_INLINE v4uf operator&&(const v4uf &r) const
Definition: VM_SIMD.h:286
v4uu v4ui
Definition: VM_SIMD.h:758
GLenum src
Definition: glew.h:2410
#define VM_FLOOR
#define VM_IMUL
Definition: VM_BasicFunc.h:405
SYS_FORCE_INLINE v4uu operator>=(const v4uu &v) const
Definition: VM_SIMD.h:95
SYS_FORCE_INLINE v4uu operator=(v4si v)
Definition: VM_SIMD.h:80
#define VM_STORE
Definition: VM_BasicFunc.h:390
SYS_FORCE_INLINE v4uf operator^(const v4uu &r) const
Definition: VM_SIMD.h:279
#define VM_NEG
#define VM_CMPGT
SYS_FORCE_INLINE v4uf(const v4uf &v) noexcept
Definition: VM_SIMD.h:202
#define V4SI(A)
#define SYS_STATIC_ASSERT_MSG(expr, msg)
SYS_FORCE_INLINE v4uu operator||(const v4uu &r) const
Definition: VM_SIMD.h:159
SYS_FORCE_INLINE v4uf operator-() const
Definition: VM_SIMD.h:252
#define VM_ISUB
SYS_FORCE_INLINE v4uu operator&&(const v4uu &r) const
Definition: VM_SIMD.h:161
GLuint const GLfloat * val
Definition: glew.h:2794
Mat3< typename promote< T0, T1 >::type > operator+(const Mat3< T0 > &m0, const Mat3< T1 > &m1)
Add corresponding elements of m0 and m1 and return the result.
Definition: Mat3.h:625
SYS_FORCE_INLINE v4uu operator-(const v4uu &r) const
Definition: VM_SIMD.h:114
#define VM_CMPLE
SYS_FORCE_INLINE v4uu operator*(int32 r) const
Definition: VM_SIMD.h:133
GLboolean GLboolean GLboolean GLboolean a
Definition: glew.h:9477
virtual bool lerp(GA_AttributeOperand &d, GA_AttributeOperand &a, GA_AttributeOperand &b, GA_AttributeOperand &t) const
d = SYSlerp(a, b, t);
SYS_FORCE_INLINE v4uu toUnsignedInt() const
This is a lie, it is a signed int.
Definition: VM_SIMD.h:314
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7231
#define VM_CMPGE
SYS_FORCE_INLINE v4uu(const int32 v[4])
Definition: VM_SIMD.h:68
SYS_FORCE_INLINE v4uu operator%=(const v4uu &r)
Definition: VM_SIMD.h:153
SYS_FORCE_INLINE v4uf operator=(v4sf v) noexcept
Definition: VM_SIMD.h:219
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
Definition: fmath.h:534
SYS_FORCE_INLINE v4uu operator*=(int32 r)
Definition: VM_SIMD.h:136
SYS_FORCE_INLINE v4uu operator+(const v4uu &r) const
Definition: VM_SIMD.h:112
#define VM_INSERT
Definition: VM_BasicFunc.h:387
SYS_FORCE_INLINE v4uf operator=(float v)
Definition: VM_SIMD.h:217
SYS_FORCE_INLINE v4uu operator==(const v4uu &v) const
Definition: VM_SIMD.h:87
SYS_FORCE_INLINE v4uu operator<(const v4uu &v) const
Definition: VM_SIMD.h:93
SYS_FORCE_INLINE v4uu(int32 v)
Definition: VM_SIMD.h:67
#define VM_ICMPGT
SYS_FORCE_INLINE v4uu operator%(const v4uu &r) const
Definition: VM_SIMD.h:139
SYS_FORCE_INLINE void operator|=(const v4uu &r)
Definition: VM_SIMD.h:172
const GLdouble * v
Definition: glew.h:1391
SYS_FORCE_INLINE void condAssign(const v4uu &val, const v4uu &c)
Definition: VM_SIMD.h:83
GLenum GLint GLuint mask
Definition: glew.h:1845
UT_Matrix2T< T > SYSlerp(const UT_Matrix2T< T > &v1, const UT_Matrix2T< T > &v2, S t)
Definition: UT_Matrix2.h:604
SYS_FORCE_INLINE v4uu operator!() const
Definition: VM_SIMD.h:165
SYS_FORCE_INLINE v4uf operator-(float r) const
Definition: VM_SIMD.h:265
#define VM_SQRT
SYS_FORCE_INLINE v4uu operator+=(int32 r)
Definition: VM_SIMD.h:134
float dot3(const vfloat4 &a, const vfloat4 &b)
Return the float 3-component dot (inner) product of a and b.
Definition: simd.h:7067
SYS_FORCE_INLINE v4uf abs() const
Definition: VM_SIMD.h:304
#define VM_ICMPLT
OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
Definition: fmath.h:267
v4uu floor() const
Definition: VM_SIMD.h:317
SYS_FORCE_INLINE v4uf operator|(const v4uf &r) const
Definition: VM_SIMD.h:296
#define VM_EXTRACT
Definition: VM_BasicFunc.h:386
OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: -(a*b - c)
Definition: fmath.h:281
SYS_FORCE_INLINE v4uf operator*=(float r)
Definition: VM_SIMD.h:270
SYS_FORCE_INLINE void store(float v[4]) const
Definition: VM_SIMD.h:213
#define VM_MIN
long long int64
Definition: SYS_Types.h:111
SYS_FORCE_INLINE v4uf operator/(const v4uf &r) const
Definition: VM_SIMD.h:256
SYS_FORCE_INLINE v4uf(float a, float b, float c, float d)
Definition: VM_SIMD.h:209
GLclampf f
Definition: glew.h:3499
GLint GLint GLint GLint GLint x
Definition: glew.h:1252
GLenum clamp
Definition: glew.h:2166
SYS_FORCE_INLINE v4uf recip() const
Definition: VM_SIMD.h:311
SYS_FORCE_INLINE v4uu operator&(const v4uu &r) const
Definition: VM_SIMD.h:169
Mat3< typename promote< T0, T1 >::type > operator-(const Mat3< T0 > &m0, const Mat3< T1 > &m1)
Subtract corresponding elements of m0 and m1 and return the result.
Definition: Mat3.h:635
SYS_FORCE_INLINE v4uu operator+=(const v4uu &r)
Definition: VM_SIMD.h:128
SYS_FORCE_INLINE v4uu operator^(const v4uu &r) const
Definition: VM_SIMD.h:163
SYS_FORCE_INLINE v4uu operator=(int32 v)
Definition: VM_SIMD.h:78
SYS_FORCE_INLINE v4uf() noexcept=default
SYS_FORCE_INLINE v4uu(const v4si &v)
Definition: VM_SIMD.h:62
Definition: VM_SIMD.h:48
SYS_FORCE_INLINE v4uu toSignedInt() const
Definition: VM_SIMD.h:315
SYS_FORCE_INLINE v4uf operator&(const v4uf &r) const
Definition: VM_SIMD.h:297
#define VM_CMPLT
#define VM_COS
#define SYS_FORCE_INLINE
Definition: SYS_Inline.h:45
SYS_FORCE_INLINE v4uu operator<=(const v4uf &v) const
Definition: VM_SIMD.h:236
SYS_FORCE_INLINE v4uf operator=(const v4uf &v) noexcept
Definition: VM_SIMD.h:203
SYS_FORCE_INLINE v4uf operator!() const
Definition: VM_SIMD.h:281
GLubyte GLubyte GLubyte GLubyte w
Definition: glew.h:1890
UT_Vector3T< T > SYSclamp(const UT_Vector3T< T > &v, const UT_Vector3T< T > &min, const UT_Vector3T< T > &max)
Definition: UT_Vector3.h:820
#define VM_MUL
SYS_FORCE_INLINE v4uf operator||(const v4uf &r) const
Definition: VM_SIMD.h:284
Definition: VM_SIMD.h:188
#define VM_TAN
SYS_FORCE_INLINE v4uu operator>(const v4uf &v) const
Definition: VM_SIMD.h:230
const GLfloat * c
Definition: glew.h:16296
GLuint GLsizei GLsizei * length
Definition: glew.h:1825
GLenum GLenum dst
Definition: glew.h:2410
#define V4SF(A)
SYS_FORCE_INLINE void setComp(int idx, float v)
Definition: VM_SIMD.h:301
SYS_FORCE_INLINE void store(int32 v[4]) const
Definition: VM_SIMD.h:74
SYS_FORCE_INLINE v4uf(const float v[4])
Definition: VM_SIMD.h:207
#define VM_SPLATS
SYS_FORCE_INLINE v4uf clamp(float low, float high) const
Definition: VM_SIMD.h:308
SYS_FORCE_INLINE v4uu operator==(const v4uf &v) const
Definition: VM_SIMD.h:226
SYS_FORCE_INLINE v4uu operator*(const v4uu &r) const
Definition: VM_SIMD.h:117
SYS_FORCE_INLINE v4uu operator%(int r) const
Definition: VM_SIMD.h:146
Quat< T > operator/(const Quat< T > &q1, const Quat< T > &q2)
Definition: ImathQuat.h:887
#define VM_ABS
SYS_FORCE_INLINE v4uu operator-=(int32 r)
Definition: VM_SIMD.h:135
SYS_API fpreal32 SYSfloor(fpreal32 val)
SYS_FORCE_INLINE v4uf operator^(const v4uf &r) const
Definition: VM_SIMD.h:288
SYS_FORCE_INLINE v4uu operator-(int32 r) const
Definition: VM_SIMD.h:132
SYS_FORCE_INLINE float operator[](int idx) const
Definition: VM_SIMD.h:300
int sign(T a)
Definition: ImathFun.h:63
SYS_FORCE_INLINE void operator^=(const v4uu &r)
Definition: VM_SIMD.h:174
SYS_FORCE_INLINE v4uu()
Definition: VM_SIMD.h:61
int int32
Definition: SYS_Types.h:39
SYS_FORCE_INLINE void condAssign(const v4uf &val, const v4uu &c)
Definition: VM_SIMD.h:222
GridType::Ptr normalize(const GridType &grid, bool threaded, InterruptT *interrupt)
Normalize the vectors of the given vector-valued grid.
SYS_FORCE_INLINE v4uu operator<(const v4uf &v) const
Definition: VM_SIMD.h:232
SYS_FORCE_INLINE v4uu operator!=(const v4uu &v) const
Definition: VM_SIMD.h:89
#define VM_CMPNE
SYS_FORCE_INLINE v4uf operator*(float r) const
Definition: VM_SIMD.h:266
#define VM_SUB
SYS_FORCE_INLINE v4uu operator>=(const v4uf &v) const
Definition: VM_SIMD.h:234
unsigned short fpreal16
Definition: SYS_Types.h:202
SYS_FORCE_INLINE v4uu operator|(const v4uu &r) const
Definition: VM_SIMD.h:168
#define VM_IFLOAT
SYS_FORCE_INLINE v4uf operator*(const v4uf &r) const
Definition: VM_SIMD.h:254
SYS_FORCE_INLINE v4uf operator-(const v4uf &r) const
Definition: VM_SIMD.h:250
SYS_FORCE_INLINE void setComp(int idx, int32 v)
Definition: VM_SIMD.h:178
SYS_FORCE_INLINE v4uf operator&(const v4uu &r) const
Definition: VM_SIMD.h:292
GLdouble GLdouble GLdouble b
Definition: glew.h:9122
SYS_API double tan(double x)
Definition: SYS_FPUMath.h:75
SYS_FORCE_INLINE v4uu operator%=(int r)
Definition: VM_SIMD.h:154
SYS_FORCE_INLINE v4uu operator-=(const v4uu &r)
Definition: VM_SIMD.h:129
#define VM_MADD
#define VM_P_FLOOR()
SYS_FORCE_INLINE v4uu operator<=(const v4uu &v) const
Definition: VM_SIMD.h:97
SYS_FORCE_INLINE v4uu operator>(const v4uu &v) const
Definition: VM_SIMD.h:91
SYS_FORCE_INLINE v4uf operator&&(const v4uu &r) const
Definition: VM_SIMD.h:277
#define VM_INT
v4si vector
Definition: VM_SIMD.h:185
GLdouble GLdouble GLdouble r
Definition: glew.h:1406
v4uu splitFloat()
Definition: VM_SIMD.h:327
SYS_FORCE_INLINE v4uf clamp(const v4uf &low, const v4uf &high) const
Definition: VM_SIMD.h:305
SYS_FORCE_INLINE v4uu(int32 a, int32 b, int32 c, int32 d)
Definition: VM_SIMD.h:70
#define VM_ICMPEQ
#define VM_OR
#define VM_DIV
#define VM_ANDNOT
SYS_FORCE_INLINE v4uf swizzle() const
Definition: VM_SIMD.h:335
#define VM_MAX
GLuint64EXT * result
Definition: glew.h:14007
#define VM_IADD
#define VM_CMPEQ
#define VM_SHIFTLEFT(A, C)
SYS_FORCE_INLINE v4uu operator<<(int32 c) const
Definition: VM_SIMD.h:108
SYS_FORCE_INLINE int32 operator[](int idx) const
Definition: VM_SIMD.h:177
SYS_FORCE_INLINE v4uu operator+(int32 r) const
Definition: VM_SIMD.h:131
SYS_FORCE_INLINE v4uf operator/=(float r)
Definition: VM_SIMD.h:271
#define VM_SINCOS
#define VM_ADD
SYS_FORCE_INLINE v4uf(float v)
Definition: VM_SIMD.h:206
SYS_FORCE_INLINE v4uu operator!=(const v4uf &v) const
Definition: VM_SIMD.h:228
SYS_FORCE_INLINE v4uf operator-=(const v4uf &r)
Definition: VM_SIMD.h:260
SYS_FORCE_INLINE v4uf operator~() const
Definition: VM_SIMD.h:293
#define VM_LOAD
Definition: VM_BasicFunc.h:389
SYS_FORCE_INLINE void operator&=(const v4uu &r)
Definition: VM_SIMD.h:173
SYS_FORCE_INLINE v4uf operator||(const v4uu &r) const
Definition: VM_SIMD.h:275
SYS_FORCE_INLINE v4uf operator+(float r) const
Definition: VM_SIMD.h:264
SYS_FORCE_INLINE v4uf operator*=(const v4uf &r)
Definition: VM_SIMD.h:261
SYS_FORCE_INLINE v4uf operator+=(const v4uf &r)
Definition: VM_SIMD.h:259
SYS_FORCE_INLINE v4uu operator>>(int32 c) const
Definition: VM_SIMD.h:109
#define SYSmin(a, b)
Definition: SYS_Math.h:1448
SYS_FORCE_INLINE v4uf operator/=(const v4uf &r)
Definition: VM_SIMD.h:262
#define VM_AND
SYS_FORCE_INLINE v4uf operator-=(float r)
Definition: VM_SIMD.h:269
SYS_FORCE_INLINE v4uu(const v4uu &v)
Definition: VM_SIMD.h:63
float fpreal32
Definition: SYS_Types.h:195
#define VM_SHIFTRIGHT(A, C)
SYS_FORCE_INLINE v4uf operator+(const v4uf &r) const
Definition: VM_SIMD.h:248
SYS_FORCE_INLINE v4uu operator~() const
Definition: VM_SIMD.h:170
SIM_DerVector3 cross(const SIM_DerVector3 &lhs, const SIM_DerVector3 &rhs)
GLdouble GLdouble t
Definition: glew.h:1398
SYS_FORCE_INLINE v4uf(const v4sf &v) noexcept
Definition: VM_SIMD.h:201
SYS_API double sin(double x)
Definition: SYS_FPUMath.h:71
v4sf vector
Definition: VM_SIMD.h:348
v4uf toFloat() const
Definition: VM_SIMD.h:352
SYS_FORCE_INLINE v4uu isFinite() const
Definition: VM_SIMD.h:340
GLuint res
Definition: glew.h:11507
#define VM_E_FLOOR()
#define VM_XOR
GLboolean GLboolean g
Definition: glew.h:9477
SYS_FORCE_INLINE v4uf operator|(const v4uu &r) const
Definition: VM_SIMD.h:291
SYS_FORCE_INLINE v4uu operator*=(const v4uu &r)
Definition: VM_SIMD.h:130
#define VM_INVERT