80 # define OIIO_NO_SSE 1
81 # define OIIO_NO_AVX 1
82 # define OIIO_NO_AVX2 1
83 # define OIIO_NO_NEON 1
86 #if defined(_M_ARM64) || defined(__aarch64__) || defined(__aarch64)
94 #if defined(_M_ARM64) || defined(__aarch64) || defined(__aarch64__) \
95 || defined(__CUDA_ARCH__)
97 # define OIIO_NO_SSE 1
100 # define OIIO_NO_AVX 1
102 # ifndef OIIO_NO_AVX2
103 # define OIIO_NO_AVX2 1
107 #if !(defined(_M_ARM64) || defined(__aarch64) || defined(__aarch64__)) || defined(__CUDA_ARCH__)
108 # ifndef OIIO_NO_NEON
109 # define OIIO_NO_NEON 1
113 #if defined(__CUDA_ARCH__)
115 #elif defined(_WIN32)
117 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
118 # include <x86intrin.h>
119 #elif defined(__GNUC__) && defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
120 # include <arm_neon.h>
126 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
127 #define OIIO_NO_SSE 1
130 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
131 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
132 # define OIIO_SIMD_SSE 4
138 # elif defined(__SSSE3__)
139 # define OIIO_SIMD_SSE 3
148 # define OIIO_SIMD_SSE 2
151 # define OIIO_SIMD_MAX_SIZE_BYTES 16
152 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
153 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
155 # define OIIO_SIMD_SSE 0
158 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
160 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
161 # define OIIO_SIMD_AVX 2
163 # define OIIO_SIMD_AVX 1
167 # undef OIIO_SIMD_MAX_SIZE_BYTES
168 # define OIIO_SIMD_MAX_SIZE_BYTES 32
169 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
170 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
171 # if defined(__AVX512F__)
172 # undef OIIO_SIMD_AVX
173 # define OIIO_SIMD_AVX 512
174 # undef OIIO_SIMD_MAX_SIZE_BYTES
175 # define OIIO_SIMD_MAX_SIZE_BYTES 64
177 # define OIIO_SIMD 16
178 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
179 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
180 # define OIIO_AVX512F_ENABLED 1
182 # if defined(__AVX512DQ__)
183 # define OIIO_AVX512DQ_ENABLED 1
185 # define OIIO_AVX512DQ_ENABLED 0
187 # if defined(__AVX512PF__)
188 # define OIIO_AVX512PF_ENABLED 1
190 # define OIIO_AVX512PF_ENABLED 0
192 # if defined(__AVX512ER__)
193 # define OIIO_AVX512ER_ENABLED 1
195 # define OIIO_AVX512ER_ENABLED 0
197 # if defined(__AVX512CD__)
198 # define OIIO_AVX512CD_ENABLED 1
200 # define OIIO_AVX512CD_ENABLED 0
202 # if defined(__AVX512BW__)
203 # define OIIO_AVX512BW_ENABLED 1
205 # define OIIO_AVX512BW_ENABLED 0
207 # if defined(__AVX512VL__)
208 # define OIIO_AVX512VL_ENABLED 1
210 # define OIIO_AVX512VL_ENABLED 0
213 # define OIIO_SIMD_AVX 0
214 # define OIIO_AVX512VL_ENABLED 0
215 # define OIIO_AVX512DQ_ENABLED 0
216 # define OIIO_AVX512PF_ENABLED 0
217 # define OIIO_AVX512ER_ENABLED 0
218 # define OIIO_AVX512CD_ENABLED 0
219 # define OIIO_AVX512BW_ENABLED 0
223 # define OIIO_FMA_ENABLED 1
225 # define OIIO_FMA_ENABLED 0
227 #if defined(__AVX512IFMA__)
228 # define OIIO_AVX512IFMA_ENABLED 1
230 # define OIIO_AVX512IFMA_ENABLED 0
233 #if defined(__F16C__)
234 # define OIIO_F16C_ENABLED 1
236 # define OIIO_F16C_ENABLED 0
239 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
241 # define OIIO_SIMD_NEON 1
242 # define OIIO_SIMD_MAX_SIZE_BYTES 16
243 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
244 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
246 # define OIIO_SIMD_NEON 0
252 # define OIIO_SIMD4_ALIGN
253 # define OIIO_SIMD_MAX_SIZE_BYTES 16
256 #ifndef OIIO_SIMD8_ALIGN
257 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
259 #ifndef OIIO_SIMD16_ALIGN
260 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
269 #define OIIO_SIMD_HAS_MATRIX4 1
270 #define OIIO_SIMD_HAS_FLOAT8 1
271 #define OIIO_SIMD_HAS_SIMD8 1
272 #define OIIO_SIMD_HAS_SIMD16 1
303 #if OIIO_DISABLE_DEPRECATED < OIIO_MAKE_VERSION(1,9,0) && !defined(OIIO_INTERNAL)
// SSE: the 4-lane boolean mask is carried in a float register (__m128);
// each lane holds all-0 or all-1 bits (see the -int(bool) splat tricks in
// the vbool4 load implementations below).
template<> struct simd_bool_t<4> { typedef __m128 type; };
// AVX: 8-lane int and float vectors use the 256-bit registers. The 8-lane
// boolean mask is also carried in a float register (__m256) with
// all-0/all-1 bit lanes, mirroring the SSE convention.
template<> struct simd_raw_t<int,8> { typedef __m256i type; };
template<> struct simd_raw_t<float,8> { typedef __m256 type; };
template<> struct simd_bool_t<8> { typedef __m256 type; };
348 #if OIIO_SIMD_AVX >= 512
// AVX-512: 16-lane int and float vectors use the 512-bit registers.
// Unlike SSE/AVX, the 16-lane boolean is a true opmask register type
// (__mmask16), one bit per lane rather than a full vector lane.
template<> struct simd_raw_t<int,16> { typedef __m512i type; };
template<> struct simd_raw_t<float,16> { typedef __m512 type; };
template<> struct simd_bool_t<16> { typedef __mmask16 type; };
// ARM NEON: 4-lane float vector, and its boolean mask as a uint32x4_t
// whose lanes are all-0 or all-1 bits (see vdupq_n_u32(a ? 0xffffffff : 0)
// in the vbool4 implementations below).
template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
/// Trait mapping an element type T and a lane count to the corresponding
/// SIMD vector class. The primary template is intentionally empty;
/// presumably each supported (T, elements) pair is specialized elsewhere
/// in this file -- only the primary template is visible here.
template<
typename T,
int elements>
struct VecType {};
/// Trait yielding a human-readable name for a SIMD type. The generic
/// (unspecialized) case reports "unknown"; the concrete vector classes
/// specialize it with their real names.
template<typename T>
struct SimdTypeName {
    static const char* name()
    {
        return "unknown";
    }
};
/// Type trait: is T one of this file's SIMD vector/mask classes?
/// Defaults to std::false_type; the SIMD classes are expected to
/// specialize it to true_type (specializations not visible in this view).
template<
typename T>
struct is_simd : std::false_type {};
// Declare a static, SIMD4-aligned (OIIO_SIMD4_ALIGN) constant array of 4
// floats / ints / uint32s, either splatting one value into every lane
// (..._CONST) or taking four explicit lane values (..._CONST4).
# define OIIO_SIMD_FLOAT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_INT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_UINT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
// 8-wide versions of the constant-array macros: static, SIMD8-aligned
// (OIIO_SIMD8_ALIGN) arrays of 8 floats / ints / uint32s, splatted
// (..._CONST) or with eight explicit lane values (..._CONST8).
# define OIIO_SIMD_FLOAT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
                                                    (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
                                                    (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_INT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
                                                  (val), (val), (val), (val) }
# define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
                                                  (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_UINT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
                                                       (val), (val), (val), (val) }
# define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
                                                       (v4), (v5), (v6), (v7) }
// 16-wide constant-array macros: static, SIMD16-aligned
// (OIIO_SIMD16_ALIGN) arrays of 16 floats / ints. Note the "VFLOAT16"
// spelling for the float variants, unlike the 4/8-wide "FLOAT" macros.
# define OIIO_SIMD_VFLOAT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_INT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// 16-entry uint32_t constant with one value splatted into every lane.
# define OIIO_SIMD_UINT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
// 16-entry uint32_t constant with sixteen explicit per-lane values.
// Bug fix: the body previously expanded sixteen copies of `(val)`, which
// is not a parameter of this macro -- the v0..v15 arguments were ignored
// and every expansion referenced an undeclared identifier. Now written
// to match the sibling VFLOAT16_CONST16 / INT16_CONST16 macros.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// Helper macros for the portable (non-intrinsic) fallback code paths:
//   SIMD_DO            -- execute statement x once per lane index i.
//   SIMD_CONSTRUCT     -- fill each lane of m_val with expression x.
//   SIMD_CONSTRUCT_PAD -- fill lanes with x, then zero the padding lanes
//                         from `elements` up to `paddedelements`.
//   SIMD_RETURN        -- build a T lane-by-lane from x and return it.
//   SIMD_RETURN_REDUCE -- reduce v's lanes into r via statement op.
// NOTE(review): these expand to bare multi-statement sequences (not
// do{}while(0)), so they are unsafe as the body of an unbraced if/else;
// presumably no call site does that -- verify before adding new uses.
#define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
#define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
#define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
                              for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
#define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
#define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
538 explicit vbool4 (
const bool *
a);
548 load (
bool(a),
bool(b),
bool(c),
bool(d));
597 void load (
bool a,
bool b,
bool c,
bool d);
634 template<
int i0,
int i1,
int i2,
int i3>
652 bool all (
const vbool4&
v);
653 bool any (
const vbool4&
v);
654 bool none (
const vbool4&
v);
/// Scalar convenience overload: treat a plain bool as a 1-lane mask, so
/// generic code can call all() uniformly; the answer is the value itself.
inline bool all (bool v)
{
    return v;
}
683 vbool8 (
bool a,
bool b,
bool c,
bool d,
bool e,
bool f,
bool g,
bool h);
689 vbool8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
746 void load (
bool a,
bool b,
bool c,
bool d,
747 bool e,
bool f,
bool g,
bool h);
785 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
803 bool all (
const vbool8&
v);
804 bool any (
const vbool8&
v);
805 bool none (
const vbool8&
v);
834 vbool16 (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
835 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
841 vbool16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
842 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
897 void load (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
898 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
949 bool all (
const vbool16&
v);
950 bool any (
const vbool16&
v);
951 bool none (
const vbool16&
v);
988 vint4 (
const int *vals);
991 explicit vint4 (
const unsigned short *vals);
994 explicit vint4 (
const short *vals);
997 explicit vint4 (
const unsigned char *vals);
1000 explicit vint4 (
const char *vals);
1067 void load (
int a,
int b,
int c,
int d);
1111 template<
int scale=4>
1114 template<
int scale=4>
1116 template<
int scale=4>
1120 template<
int scale=4>
1123 template<
int scale=4>
1125 template<
int scale=4>
1176 vint4
srl (
const vint4&
val,
const unsigned int bits);
1180 template<
int i0,
int i1,
int i2,
int i3>
1200 vint4
blend (
const vint4&
a,
const vint4&
b,
const vbool4&
mask);
1205 vint4
blend0 (
const vint4&
a,
const vbool4&
mask);
1215 vint4
select (
const vbool4&
mask,
const vint4&
a,
const vint4&
b);
1218 vint4
abs (
const vint4&
a);
1219 vint4
min (
const vint4&
a,
const vint4&
b);
1220 vint4
max (
const vint4&
a,
const vint4&
b);
1223 vint4
rotl (
const vint4&
x,
const int s);
1225 vint4
rotl32 (
const vint4&
x,
const unsigned int k);
1228 vint4
andnot (
const vint4&
a,
const vint4&
b);
1235 void transpose (vint4 &
a, vint4 &
b, vint4 &
c, vint4 &d);
1236 void transpose (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d,
1237 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1239 vint4
AxBxCxDx (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d);
1242 vint4
safe_mod (
const vint4&
a,
const vint4&
b);
1276 vint8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1279 vint8 (
const int *vals);
1282 explicit vint8 (
const unsigned short *vals);
1285 explicit vint8 (
const short *vals);
1288 explicit vint8 (
const unsigned char *vals);
1291 explicit vint8 (
const char *vals);
1367 void load (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1411 template<
int scale=4>
1414 template<
int scale=4>
1416 template<
int scale=4>
1420 template<
int scale=4>
1423 template<
int scale=4>
1425 template<
int scale=4>
1477 vint8
srl (
const vint8&
val,
const unsigned int bits);
1481 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
1504 vint8
blend (
const vint8&
a,
const vint8&
b,
const vbool8&
mask);
1509 vint8
blend0 (
const vint8&
a,
const vbool8&
mask);
1519 vint8
select (
const vbool8&
mask,
const vint8&
a,
const vint8&
b);
1522 vint8
abs (
const vint8&
a);
1523 vint8
min (
const vint8&
a,
const vint8&
b);
1524 vint8
max (
const vint8&
a,
const vint8&
b);
1527 vint8
rotl (
const vint8&
x,
const int s);
1529 vint8
rotl32 (
const vint8&
x,
const unsigned int k);
1532 vint8
andnot (
const vint8&
a,
const vint8&
b);
1540 vint8
safe_mod (
const vint8&
a,
const vint8&
b);
1572 vint16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1573 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1576 vint16 (
const int *vals);
1579 explicit vint16 (
const unsigned short *vals);
1582 explicit vint16 (
const short *vals);
1585 explicit vint16 (
const unsigned char *vals);
1588 explicit vint16 (
const char *vals);
1667 void load (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1668 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1712 template<
int scale=4>
1715 template<
int scale=4>
1717 template<
int scale=4>
1719 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
1723 template<
int scale=4>
1726 template<
int scale=4>
1728 template<
int scale=4>
1730 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
1782 vint16
srl (
const vint16&
val,
const unsigned int bits);
1785 template<
int i0,
int i1,
int i2,
int i3>
1789 template<
int i> vint16
shuffle4 (
const vint16&
a);
1792 template<
int i0,
int i1,
int i2,
int i3>
1796 template<
int i> vint16
shuffle (
const vint16&
a);
1815 vint16
blend (
const vint16&
a,
const vint16&
b,
const vbool16&
mask);
1820 vint16
blend0 (
const vint16&
a,
const vbool16&
mask);
1830 vint16
select (
const vbool16&
mask,
const vint16&
a,
const vint16&
b);
1833 vint16
abs (
const vint16&
a);
1834 vint16
min (
const vint16&
a,
const vint16&
b);
1835 vint16
max (
const vint16&
a,
const vint16&
b);
1838 vint16
rotl (
const vint16&
x,
const int s);
1840 vint16
rotl32 (
const vint16&
x,
const unsigned int k);
1843 vint16
andnot (
const vint16&
a,
const vint16&
b);
1851 vint16
safe_mod (
const vint16&
a,
const vint16&
b);
1919 #ifdef INCLUDED_IMATHVEC_H
1939 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1972 load ((
const float *)&v);
1981 load (v[0], v[1], v[2], 0.0
f);
2006 void load (
float a,
float b,
float c,
float d=0.0
f);
2027 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2034 void load_pairs(
const float* lo,
const float* hi);
2041 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2056 template<
int scale=4>
2059 template<
int scale=4>
2061 template<
int scale=4>
2065 template<
int scale=4>
2068 template<
int scale=4>
2070 template<
int scale=4>
2126 template<
int i0,
int i1,
int i2,
int i3>
2136 shuffle(
const vfloat4&
a,
const vfloat4&
b);
2152 vfloat4
vdot (
const vfloat4 &
a,
const vfloat4 &
b);
2155 float dot (
const vfloat4 &
a,
const vfloat4 &
b);
2159 vfloat4
vdot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2162 float dot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2166 vfloat4
blend (
const vfloat4&
a,
const vfloat4&
b,
const vbool4&
mask);
2171 vfloat4
blend0 (
const vfloat4&
a,
const vbool4&
mask);
2180 vfloat4
safe_div (
const vfloat4 &
a,
const vfloat4 &
b);
2183 vfloat3
hdiv (
const vfloat4 &
a);
2188 vfloat4
select (
const vbool4&
mask,
const vfloat4&
a,
const vfloat4&
b);
2191 vfloat4
abs (
const vfloat4&
a);
2192 vfloat4
sign (
const vfloat4&
a);
2193 vfloat4
ceil (
const vfloat4&
a);
2194 vfloat4
floor (
const vfloat4&
a);
2195 vint4
ifloor (
const vfloat4&
a);
2206 vfloat4
round (
const vfloat4&
a);
2213 vint4
rint (
const vfloat4&
a);
2216 vfloat4
sqrt (
const vfloat4 &
a);
2217 vfloat4
rsqrt (
const vfloat4 &
a);
2219 vfloat4
min (
const vfloat4&
a,
const vfloat4&
b);
2220 vfloat4
max (
const vfloat4&
a,
const vfloat4&
b);
2225 vfloat4
andnot (
const vfloat4&
a,
const vfloat4&
b);
2228 vfloat4
madd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2229 vfloat4
msub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2230 vfloat4
nmadd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2231 vfloat4
nmsub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2236 void transpose (vfloat4 &
a, vfloat4 &
b, vfloat4 &
c, vfloat4 &d);
2237 void transpose (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c,
const vfloat4& d,
2238 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2241 vfloat4
AxBxCxDx (
const vfloat4&
a,
const vfloat4&
b,
2242 const vfloat4&
c,
const vfloat4& d);
2297 #ifdef INCLUDED_IMATHVEC_H
2314 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2353 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2362 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2408 vfloat3
abs (
const vfloat3&
a);
2409 vfloat3
sign (
const vfloat3&
a);
2410 vfloat3
ceil (
const vfloat3&
a);
2411 vfloat3
floor (
const vfloat3&
a);
2412 vfloat3
round (
const vfloat3&
a);
2457 const float *
c,
const float *d) {
2466 float f10,
float f11,
float f12,
float f13,
2467 float f20,
float f21,
float f22,
float f23,
2468 float f30,
float f31,
float f32,
float f33)
2476 #ifdef INCLUDED_IMATHMATRIX_H
2527 vfloat3
transformp (
const matrix44 &M,
const vfloat3 &V);
2530 vfloat3
transformv (
const matrix44 &M,
const vfloat3 &V);
2533 vfloat3
transformvT (
const matrix44 &M,
const vfloat3 &V);
2568 float e,
float f,
float g,
float h) {
load(a,b,c,d,e,f,g,h); }
2606 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2660 void load (
float a,
float b,
float c,
float d,
2661 float e,
float f,
float g,
float h);
2682 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2692 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2707 template<
int scale=4>
2709 template<
int scale=4>
2712 template<
int scale=4>
2716 template<
int scale=4>
2719 template<
int scale=4>
2721 template<
int scale=4>
2763 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
2783 vfloat8
vdot (
const vfloat8 &
a,
const vfloat8 &
b);
2786 float dot (
const vfloat8 &
a,
const vfloat8 &
b);
2790 vfloat8
vdot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2793 float dot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2797 vfloat8
blend (
const vfloat8&
a,
const vfloat8&
b,
const vbool8&
mask);
2802 vfloat8
blend0 (
const vfloat8&
a,
const vbool8&
mask);
2811 vfloat8
safe_div (
const vfloat8 &
a,
const vfloat8 &
b);
2816 vfloat8
select (
const vbool8&
mask,
const vfloat8&
a,
const vfloat8&
b);
2819 vfloat8
abs (
const vfloat8&
a);
2820 vfloat8
sign (
const vfloat8&
a);
2821 vfloat8
ceil (
const vfloat8&
a);
2822 vfloat8
floor (
const vfloat8&
a);
2823 vint8
ifloor (
const vfloat8&
a);
2833 vfloat8
round (
const vfloat8&
a);
2840 vint8
rint (
const vfloat8&
a);
2843 vfloat8
sqrt (
const vfloat8 &
a);
2844 vfloat8
rsqrt (
const vfloat8 &
a);
2846 vfloat8
min (
const vfloat8&
a,
const vfloat8&
b);
2847 vfloat8
max (
const vfloat8&
a,
const vfloat8&
b);
2852 vfloat8
andnot (
const vfloat8&
a,
const vfloat8&
b);
2855 vfloat8
madd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2856 vfloat8
msub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2857 vfloat8
nmadd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2858 vfloat8
nmsub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2889 float v4,
float v5,
float v6,
float v7,
2890 float v8,
float v9,
float v10,
float v11,
2891 float v12,
float v13,
float v14,
float v15);
2932 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2987 float v4,
float v5,
float v6,
float v7,
2988 float v8,
float v9,
float v10,
float v11,
2989 float v12,
float v13,
float v14,
float v15);
3010 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
3020 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
3035 template<
int scale=4>
3038 template<
int scale=4>
3040 template<
int scale=4>
3042 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
3046 template<
int scale=4>
3049 template<
int scale=4>
3051 template<
int scale=4>
3053 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
3094 template<
int i0,
int i1,
int i2,
int i3>
3101 template<
int i0,
int i1,
int i2,
int i3>
3105 template<
int i> vfloat16
shuffle (
const vfloat16&
a);
3122 vfloat16
blend (
const vfloat16&
a,
const vfloat16&
b,
const vbool4&
mask);
3127 vfloat16
blend0 (
const vfloat16&
a,
const vbool4&
mask);
3136 vfloat16
safe_div (
const vfloat16 &
a,
const vfloat16 &
b);
3141 vfloat16
select (
const vbool16&
mask,
const vfloat16&
a,
const vfloat16&
b);
3144 vfloat16
abs (
const vfloat16&
a);
3145 vfloat16
sign (
const vfloat16&
a);
3146 vfloat16
ceil (
const vfloat16&
a);
3147 vfloat16
floor (
const vfloat16&
a);
3148 vint16
ifloor (
const vfloat16&
a);
3159 vfloat16
round (
const vfloat16&
a);
3166 vint16
rint (
const vfloat16&
a);
3169 vfloat16
sqrt (
const vfloat16 &
a);
3170 vfloat16
rsqrt (
const vfloat16 &
a);
3172 vfloat16
min (
const vfloat16&
a,
const vfloat16&
b);
3173 vfloat16
max (
const vfloat16&
a,
const vfloat16&
b);
3178 vfloat16
andnot (
const vfloat16&
a,
const vfloat16&
b);
3181 vfloat16
madd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3182 vfloat16
msub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3183 vfloat16
nmadd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3184 vfloat16
nmsub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3193 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3194 _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
3203 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3204 _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
3212 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3213 return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
3220 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3221 return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
3259 return ((_mm_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3273 m_val[i] = value ? -1 : 0;
3279 for (
int i = 1; i < a.elements; ++i)
3280 cout <<
' ' << a[i];
3287 m_simd = _mm_castsi128_ps(_mm_set1_epi32(-
int(a)));
3288 #elif OIIO_SIMD_NEON
3289 m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3301 m_simd = _mm_castsi128_ps(_mm_set_epi32(-
int(d), -
int(c), -
int(b), -
int(a)));
3302 #elif OIIO_SIMD_NEON
3304 m_simd = vld1q_u32((
const uint32_t*)values);
3316 load (a[0], a[1], a[2], a[3]);
3327 return _mm_movemask_ps(
m_simd);
3328 #elif OIIO_SIMD_NEON && defined(__aarch64__)
3329 const int shifts[4] { 0, 1, 2, 3 };
3330 const int32x4_t shift = vld1q_s32(shifts);
3331 uint32x4_t
t = vshrq_n_u32(
m_simd, 31);
3332 return vaddvq_u32(vshlq_u32(t, shift));
3352 m_simd = _mm_setzero_ps();
3361 return _mm_setzero_ps();
3370 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3371 __m128i anyval = _mm_undefined_si128();
3373 __m128i anyval = _mm_setzero_si128();
3375 return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3387 for (
int i = 0; i <
n; ++i)
3388 values[i] =
m_val[i] ?
true :
false;
3396 #elif OIIO_SIMD_NEON
3397 return vmvnq_u32(a.
simd());
3405 return _mm_and_ps (a.
simd(), b.
simd());
3406 #elif OIIO_SIMD_NEON
3407 return vandq_u32(a.
simd(), b.
simd());
3415 return _mm_or_ps (a.
simd(), b.
simd());
3416 #elif OIIO_SIMD_NEON
3417 return vorrq_u32(a.
simd(), b.
simd());
3425 return _mm_xor_ps (a.
simd(), b.
simd());
3426 #elif OIIO_SIMD_NEON
3427 return veorq_u32(a.
simd(), b.
simd());
3450 #elif OIIO_SIMD_NEON
3451 return vmvnq_u32(a.
m_simd);
3459 return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3460 #elif OIIO_SIMD_NEON
3469 return _mm_xor_ps (a, b);
3470 #elif OIIO_SIMD_NEON
3482 template<
int i0,
int i1,
int i2,
int i3>
3484 return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3,
i2,
i1, i0));
3488 #if OIIO_SIMD_SSE >= 3
3491 return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(
a)));
3494 return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(
a)));
3497 return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(
a)));
3502 template<
int i0,
int i1,
int i2,
int i3>
3504 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3,
i2,
i1, i0)));
3508 #if OIIO_SIMD_SSE >= 3
3511 return _mm_moveldup_ps(a);
3514 return _mm_movehdup_ps(a);
3517 return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3524 template<
int i0,
int i1,
int i2,
int i3>
3527 return shuffle_sse<i0,i1,i2,i3> (a.
simd());
3535 return shuffle<i,i,i,i>(
a);
3543 #if OIIO_SIMD_SSE >= 4
3544 return _mm_extract_epi32(_mm_castps_si128(a.
simd()), i);
3545 #elif OIIO_SIMD_NEON
3546 return vgetq_lane_u32(a, i);
3557 #if OIIO_SIMD_SSE >= 4
3558 int ival = -
int(val);
3559 return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3560 #elif OIIO_SIMD_NEON
3561 uint32_t ival = uint32_t(val ? -1 : 0);
3562 return vld1q_lane_u32(&ival, a, i);
3575 return _mm_testc_ps (v,
vbool4(
true)) != 0;
3577 return _mm_movemask_ps(v.
simd()) == 0xf;
3578 #elif OIIO_SIMD_NEON && defined(__aarch64__)
3579 uint32x4_t
t = vshrq_n_u32(v.
simd(), 31);
3580 return vaddvq_u32(t) == 4;
3588 return ! _mm_testz_ps (v, v);
3590 return _mm_movemask_ps(v) != 0;
3591 #elif OIIO_SIMD_NEON && defined(__aarch64__)
3592 uint32x4_t
t = vshrq_n_u32(v.
simd(), 31);
3593 return vaddvq_u32(t) != 0;
3612 return ((_mm256_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3620 m_val[i] = value ? -1 : 0;
3631 for (
int i = 1; i < a.elements; ++i)
3632 cout <<
' ' << a[i];
3639 m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-
int(a)));
3640 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3651 bool e,
bool f,
bool g,
bool h) {
3655 m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-
int(h), -
int(g), -
int(f), -
int(e),
3656 -
int(d), -
int(c), -
int(b), -
int(a)));
3657 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3673 bool e,
bool f,
bool g,
bool h) {
3674 load (a, b, c, d, e, f, g, h);
3678 int e,
int f,
int g,
int h) {
3679 load (
bool(a),
bool(b),
bool(c),
bool(d),
3680 bool(e),
bool(f),
bool(g),
bool(h));
3684 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3700 return _mm256_movemask_ps(
m_simd);
3716 m_simd = _mm256_setzero_ps();
3724 return _mm256_setzero_ps();
3733 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3735 __m256i anyval = _mm256_undefined_si256();
3736 return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3738 return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3752 for (
int i = 0; i <
n; ++i)
3753 values[i] =
m_val[i] ?
true :
false;
3759 return _mm256_castps256_ps128 (
simd());
3767 return _mm256_extractf128_ps (
simd(), 1);
3776 __m256
r = _mm256_castps128_ps256 (lo);
3777 m_simd = _mm256_insertf128_ps (r, hi, 1);
3796 return _mm256_and_ps (a.
simd(), b.
simd());
3804 return _mm256_or_ps (a.
simd(), b.
simd());
3812 return _mm256_xor_ps (a.
simd(), b.
simd());
3843 #if OIIO_SIMD_AVX >= 2
3844 return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3846 return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3854 return _mm256_xor_ps (a, b);
3861 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
3863 #if OIIO_SIMD_AVX >= 2
3865 return _mm256_permutevar8x32_ps (a.
simd(), index.
simd());
3867 return vbool8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3872 return shuffle<i,i,i,i,i,i,i,i>(
a);
3878 #if OIIO_SIMD_AVX && !_WIN32
3879 return _mm256_extract_epi32(_mm256_castps_si256(a.
simd()), i);
3887 #if OIIO_SIMD_AVX && !_WIN32
3888 int ival = -
int(val);
3889 return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.
simd()), ival, i));
3900 return _mm256_testc_ps (v,
vbool8(
true)) != 0;
3909 return ! _mm256_testz_ps (v, v);
3929 #if OIIO_SIMD_AVX >= 512
3930 return (
int(
m_simd) >> i) & 1;
3932 return (
m_bits >> i) & 1;
3939 bits &= (0xffff ^ (1<<i));
3940 bits |= (
int(value)<<i);
3947 for (
int i = 1; i < a.elements; ++i)
3948 cout <<
' ' << a[i];
3964 bool v4,
bool v5,
bool v6,
bool v7,
3965 bool v8,
bool v9,
bool v10,
bool v11,
3966 bool v12,
bool v13,
bool v14,
bool v15) {
3986 bool v4,
bool v5,
bool v6,
bool v7,
3987 bool v8,
bool v9,
bool v10,
bool v11,
3988 bool v12,
bool v13,
bool v14,
bool v15) {
3989 load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3993 int v4,
int v5,
int v6,
int v7,
3994 int v8,
int v9,
int v10,
int v11,
3995 int v12,
int v13,
int v14,
int v15) {
3996 load (
bool(v0),
bool(v1),
bool(v2),
bool(v3),
3997 bool(v4),
bool(v5),
bool(v6),
bool(v7),
3998 bool(v8),
bool(v9),
bool(v10),
bool(v11),
3999 bool(v12),
bool(v13),
bool(v14),
bool(v15));
4007 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
4008 a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
4024 #if OIIO_SIMD_AVX >= 512
4052 for (
int i = 0; i <
n; ++i)
4053 values[i] =
m_bits & (1<<i);
4059 #if OIIO_SIMD_AVX >= 512
4060 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()&0xff, -1));
4067 #if OIIO_SIMD_AVX >= 512
4068 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()>>8, -1));
4076 #if OIIO_SIMD_AVX >= 512
4077 return _mm512_knot (a.
simd());
4084 #if OIIO_SIMD_AVX >= 512
4085 return _mm512_kand (a.
simd(), b.
simd());
4092 #if OIIO_SIMD_AVX >= 512
4093 return _mm512_kor (a.
simd(), b.
simd());
4100 #if OIIO_SIMD_AVX >= 512
4101 return _mm512_kxor (a.
simd(), b.
simd());
4127 #if OIIO_SIMD_AVX >= 512
4128 return _mm512_kxnor (a.
simd(), b.
simd());
4135 #if OIIO_SIMD_AVX >= 512
4136 return _mm512_kxor (a.
simd(), b.
simd());
4200 m_simd = _mm_set1_epi32 (a);
4201 #elif OIIO_SIMD_NEON
4202 m_simd = vdupq_n_s32 (a);
4212 m_simd = _mm_set_epi32 (d, c, b, a);
4213 #elif OIIO_SIMD_NEON
4215 m_simd = vld1q_s32 (values);
4235 #elif OIIO_SIMD_NEON
4236 m_simd = vld1q_s32 (values);
4246 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4247 m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values);
4251 m_simd = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4255 m_simd = _mm_castpd_si128 (_mm_load_sd ((
const double*)values));
4260 m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((
const double*)values)),
4261 _mm_load_ss ((
const float *)values + 2)));
4271 for (
int i = 0; i <
n; ++i)
4272 m_val[i] = values[i];
4280 #if OIIO_SIMD_SSE >= 4
4282 simd_t a = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
4283 m_simd = _mm_cvtepu16_epi32 (a);
4291 #if OIIO_SIMD_SSE >= 4
4293 simd_t a = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
4294 m_simd = _mm_cvtepi16_epi32 (a);
4302 #if OIIO_SIMD_SSE >= 4
4304 simd_t a = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4305 m_simd = _mm_cvtepu8_epi32 (a);
4306 #elif OIIO_SIMD_SSE >= 2
4308 simd_t a = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4309 a = _mm_unpacklo_epi8(a, _mm_setzero_si128());
4310 m_simd = _mm_unpacklo_epi16(a, _mm_setzero_si128());
4318 #if OIIO_SIMD_SSE >= 4
4320 simd_t a = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4321 m_simd = _mm_cvtepi8_epi32 (a);
4354 #elif OIIO_SIMD_NEON
4355 vst1q_s32(values,
m_simd);
4363 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4364 m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask), (
const simd_t *)values);
4365 #elif OIIO_SIMD_AVX >= 2
4374 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4376 #elif OIIO_SIMD_AVX >= 2
4377 m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(mask));
4385 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4386 _mm_mask_storeu_epi32 (values, __mmask8(mask),
m_simd);
4387 #elif OIIO_SIMD_AVX >= 2
4390 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
4396 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4398 #elif OIIO_SIMD_AVX >= 2
4399 _mm_maskstore_epi32 (values, _mm_castps_si128(mask),
m_simd);
4401 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
4406 template <
int scale>
4410 #if OIIO_SIMD_AVX >= 2
4411 m_simd = _mm_i32gather_epi32 (baseptr, vindex,
scale);
4421 #if OIIO_SIMD_AVX >= 2
4422 m_simd = _mm_mask_i32gather_epi32 (
m_simd, baseptr, vindex, _mm_cvtps_epi32(mask),
scale);
4432 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4434 _mm_i32scatter_epi32 (baseptr, vindex,
m_simd,
scale);
4443 const vint_t& vindex)
const
4445 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4456 m_simd = _mm_setzero_si128();
4466 return _mm_setzero_si128();
4479 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
4480 __m128i anyval = _mm_undefined_si128();
4482 __m128i anyval = _mm_setzero_si128();
4484 return _mm_cmpeq_epi8 (anyval, anyval);
4493 return vint4 (start+0*step, start+1*step, start+2*step, start+3*step);
4498 return vint4 (1<<0, 1<<1, 1<<2, 1<<3);
4504 return _mm_add_epi32 (a.
simd(), b.
simd());
4505 #elif OIIO_SIMD_NEON
4519 return _mm_sub_epi32 (_mm_setzero_si128(), a);
4520 #elif OIIO_SIMD_NEON
4521 return vnegq_s32(a.
m_simd);
4530 return _mm_sub_epi32 (a.
simd(), b.
simd());
4531 #elif OIIO_SIMD_NEON
4548 #if OIIO_SIMD_SSE >= 4
4549 return _mm_mullo_epi32(a, b);
4555 t0 = _mm_mul_epu32 (a, b);
4556 t1 = _mm_mul_epu32 (_mm_shuffle_epi32 (a, 0xB1),
4557 _mm_shuffle_epi32 (b, 0xB1));
4558 t0 = _mm_shuffle_epi32 (t0, 0xD8);
4559 t1 = _mm_shuffle_epi32 (t1, 0xD8);
4560 return _mm_unpacklo_epi32 (t0, t1);
4568 return mul_epi32 (a.
simd(), b.
simd());
4569 #elif OIIO_SIMD_NEON
4610 return _mm_and_si128 (a.
simd(), b.
simd());
4611 #elif OIIO_SIMD_NEON
4612 return vandq_s32(a.
simd(), b.
simd());
4625 return _mm_or_si128 (a.
simd(), b.
simd());
4626 #elif OIIO_SIMD_NEON
4627 return vorrq_s32(a.
simd(), b.
simd());
4638 return _mm_xor_si128 (a.
simd(), b.
simd());
4639 #elif OIIO_SIMD_NEON
4640 return veorq_s32(a.
simd(), b.
simd());
4653 #elif OIIO_SIMD_NEON
4654 return vmvnq_s32(a.
m_simd);
4662 return _mm_slli_epi32 (a, bits);
4663 #elif OIIO_SIMD_NEON
4664 return vshlq_s32(a.
m_simd, vdupq_n_s32(bits));
4671 return a = a << bits;
4677 return _mm_srai_epi32 (a, bits);
4678 #elif OIIO_SIMD_NEON
4679 return vshlq_s32(a.
m_simd, vdupq_n_s32(-(
int)bits));
4686 return a = a >> bits;
4692 return _mm_srli_epi32 (a, bits);
4693 #elif OIIO_SIMD_NEON
4694 uint32x4_t au = vreinterpretq_u32_s32(a);
4695 au = vshlq_u32(au, vdupq_n_s32(-(
int)bits));
4696 return vreinterpretq_s32_u32(au);
4705 return _mm_castsi128_ps(_mm_cmpeq_epi32 (a, b));
4706 #elif OIIO_SIMD_NEON
4707 return vceqq_s32 (a, b);
4720 return _mm_castsi128_ps(_mm_cmpgt_epi32 (a, b));
4721 #elif OIIO_SIMD_NEON
4722 return vcgtq_s32 (a, b);
4730 return _mm_castsi128_ps(_mm_cmplt_epi32 (a, b));
4731 #elif OIIO_SIMD_NEON
4732 return vcltq_s32 (a, b);
4739 return (b < a) | (a ==
b);
4743 return (b > a) | (a ==
b);
4748 for (
int i = 1; i < val.elements; ++i)
4749 cout <<
' ' << val[i];
4756 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4760 _mm_mask_storeu_epi32 (values, __mmask8(~(0xf << n)),
m_simd);
4766 for (
int i = 0; i <
n; ++i)
4767 values[i] =
m_val[i];
4769 for (
int i = 0; i <
n; ++i)
4770 values[i] =
m_val[i];
4777 #if OIIO_AVX512VL_ENABLED
4778 _mm_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xf),
m_simd);
4783 vint4 low = _mm_shufflelo_epi16 (clamped, (0<<0) | (2<<2) | (1<<4) | (1<<6));
4785 vint4 high = _mm_shufflehi_epi16 (clamped, (1<<0) | (1<<2) | (0<<4) | (2<<6));
4787 vint4 highswapped = shuffle_sse<2,3,0,1>(high);
4788 vint4
result = low | highswapped;
4789 _mm_storel_pd ((
double *)values, _mm_castsi128_pd(result));
4799 #if OIIO_AVX512VL_ENABLED
4800 _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf),
m_simd);
4803 simd_t val16 = _mm_packs_epi32(clamped, _mm_setzero_si128());
4804 simd_t val8 = _mm_packus_epi16(val16, _mm_setzero_si128());
4805 _mm_store_ss((
float*)values, _mm_castsi128_ps(val8));
4806 #elif OIIO_SIMD_NEON
4808 int16x8_t val16 = vcombine_s16(vqmovn_s32(clamped), vdup_n_s16(0));
4809 uint8x16_t val8 = vcombine_u8(vqmovun_s16(val16), vdup_n_u8(0));
4810 vst1q_lane_u32((uint32_t*)values, vreinterpretq_u32_u8(val8), 0);
4819 template<
int i0,
int i1,
int i2,
int i3>
4822 return shuffle_sse<i0,i1,i2,i3> (__m128i(a));
4824 return vint4(a[i0], a[
i1], a[
i2], a[i3]);
4833 #if OIIO_SIMD_SSE >= 4
4834 return _mm_extract_epi32(v.
simd(), i);
4835 #elif OIIO_SIMD_NEON
4836 return vgetq_lane_s32(v.
simd(), i);
4844 return _mm_cvtsi128_si32(v.simd());
4850 #if OIIO_SIMD_SSE >= 4
4851 return _mm_insert_epi32 (a.
simd(),
val, i);
4852 #elif OIIO_SIMD_NEON
4853 return vld1q_lane_s32(&val, a.
simd(), i);
4876 return _mm_castps_si128 (x.
simd());
4878 return *(
vint4 *)&x;
4888 #if OIIO_SIMD_SSE >= 3
4897 #elif OIIO_SIMD_SSE >= 2
4901 vint4 ab_ab_cd_cd = shuffle<1,0,3,2>(
v) + v;
4903 vint4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
4905 vint4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;
4907 #elif OIIO_SIMD_NEON && defined(__aarch64__)
4908 return vint4(vaddvq_s32(v));
4918 #elif OIIO_SIMD_NEON && defined(__aarch64__)
4919 return vaddvq_s32(v);
4928 vint4 ab = v & shuffle<1,1,3,3>(
v);
4929 vint4 abcd = ab & shuffle<2>(ab);
4930 return extract<0>(abcd);
4939 vint4 ab = v | shuffle<1,1,3,3>(
v);
4940 vint4 abcd = ab | shuffle<2>(ab);
4941 return extract<0>(abcd);
4950 #if OIIO_SIMD_SSE >= 4
4951 return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps(a.
simd()),
4952 _mm_castsi128_ps(b.
simd()), mask));
4954 return _mm_or_si128 (_mm_and_si128(_mm_castps_si128(mask.
simd()), b.
simd()),
4955 _mm_andnot_si128(_mm_castps_si128(mask.
simd()), a.
simd()));
4956 #elif OIIO_SIMD_NEON
4965 return _mm_and_si128(_mm_castps_si128(mask), a.
simd());
4966 #elif OIIO_SIMD_NEON
4976 return _mm_andnot_si128(_mm_castps_si128(mask), a.
simd());
4984 return blend (b, a, mask);
4990 #if OIIO_SIMD_SSE >= 3
4991 return _mm_abs_epi32(a.
simd());
4992 #elif OIIO_SIMD_NEON
4993 return vabsq_s32(a.
simd());
5002 #if OIIO_SIMD_SSE >= 4
5003 return _mm_min_epi32 (a, b);
5004 #elif OIIO_SIMD_NEON
5005 return vminq_s32(a, b);
5013 #if OIIO_SIMD_SSE >= 4
5014 return _mm_max_epi32 (a, b);
5015 #elif OIIO_SIMD_NEON
5016 return vmaxq_s32(a, b);
5024 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5029 return (x<<s) |
srl(x,32-s);
5031 return (x<<s) |
srl(x,32-s);
5043 return _mm_andnot_si128 (a.
simd(), b.
simd());
5095 m_simd = _mm256_set1_epi32 (a);
5096 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5106 int e,
int f,
int g,
int h) {
5108 m_simd = _mm256_set_epi32 (h, g, f, e, d, c, b, a);
5109 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5128 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5140 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5141 m_simd = _mm256_maskz_loadu_epi32 ((~(0xff << n)), values);
5142 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5146 hi.
load (values+4, n-4);
5151 lo.
load (values, n);
5157 for (
int i = 0; i <
n; ++i)
5158 m_val[i] = values[i];
5166 #if OIIO_SIMD_AVX >= 2
5167 m_simd = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)values));
5168 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5177 #if OIIO_SIMD_AVX >= 2
5178 m_simd = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)values));
5179 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5189 #if OIIO_SIMD_AVX >= 2
5190 __m128i
bytes = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
5191 m_simd = _mm256_cvtepi8_epi32 (bytes);
5192 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5201 #if OIIO_SIMD_AVX >= 2
5202 __m128i
bytes = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
5203 m_simd = _mm256_cvtepu8_epi32 (bytes);
5204 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5217 int e,
int f,
int g,
int h) {
5218 load(a,b,c,d,e,f,g,h);
5236 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5246 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5247 m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask), (
const simd_t *)values);
5248 #elif OIIO_SIMD_AVX >= 2
5257 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5259 #elif OIIO_SIMD_AVX >= 2
5260 m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(mask));
5268 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5269 _mm256_mask_storeu_epi32 (values, __mmask8(mask),
m_simd);
5270 #elif OIIO_SIMD_AVX >= 2
5273 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
5279 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5280 _mm256_mask_storeu_epi32 (values, __mmask8(mask.
bitmask()),
m_simd);
5281 #elif OIIO_SIMD_AVX >= 2
5282 _mm256_maskstore_epi32 (values, _mm256_castps_si256(mask),
m_simd);
5284 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
5289 template <
int scale>
5293 #if OIIO_SIMD_AVX >= 2
5294 m_simd = _mm256_i32gather_epi32 (baseptr, vindex,
scale);
5304 #if OIIO_SIMD_AVX >= 2
5305 m_simd = _mm256_mask_i32gather_epi32 (
m_simd, baseptr, vindex, _mm256_cvtps_epi32(mask),
scale);
5315 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5316 _mm256_i32scatter_epi32 (baseptr, vindex,
m_simd,
scale);
5325 const vint_t& vindex)
const
5327 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5337 m_simd = _mm256_setzero_si256();
5346 return _mm256_setzero_si256();
5358 return vint8 (start+0*step, start+1*step, start+2*step, start+3*step,
5359 start+4*step, start+5*step, start+6*step, start+7*step);
5364 return vint8 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7);
5370 return _mm256_castsi256_si128 (
simd());
5378 return _mm256_extractf128_si256 (
simd(), 1);
5387 __m256i
r = _mm256_castsi128_si256 (lo);
5388 m_simd = _mm256_insertf128_si256 (r, hi, 1);
5399 #if OIIO_SIMD_AVX >= 2
5400 return _mm256_add_epi32 (a.
simd(), b.
simd());
5413 #if OIIO_SIMD_AVX >= 2
5414 return _mm256_sub_epi32 (_mm256_setzero_si256(), a);
5422 #if OIIO_SIMD_AVX >= 2
5423 return _mm256_sub_epi32 (a.
simd(), b.
simd());
5436 #if OIIO_SIMD_AVX >= 2
5437 return _mm256_mullo_epi32 (a.
simd(), b.
simd());
5472 #if OIIO_SIMD_AVX >= 2
5473 return _mm256_and_si256 (a.
simd(), b.
simd());
5482 #if OIIO_SIMD_AVX >= 2
5483 return _mm256_or_si256 (a.
simd(), b.
simd());
5492 #if OIIO_SIMD_AVX >= 2
5493 return _mm256_xor_si256 (a.
simd(), b.
simd());
5503 #if OIIO_SIMD_AVX >= 2
5512 #if OIIO_SIMD_AVX >= 2
5513 return _mm256_slli_epi32 (a, bits);
5515 return vint8 (a.
lo() << bits, a.
hi() << bits);
5523 return a = a << bits;
5527 #if OIIO_SIMD_AVX >= 2
5528 return _mm256_srai_epi32 (a, bits);
5530 return vint8 (a.
lo() >> bits, a.
hi() >> bits);
5537 return a = a >> bits;
5542 #if OIIO_SIMD_AVX >= 2
5543 return _mm256_srli_epi32 (a, bits);
5552 #if OIIO_SIMD_AVX >= 2
5553 return _mm256_castsi256_ps(_mm256_cmpeq_epi32 (a.
m_simd, b.
m_simd));
5570 #if OIIO_SIMD_AVX >= 2
5571 return _mm256_castsi256_ps(_mm256_cmpgt_epi32 (a, b));
5582 #if OIIO_SIMD_AVX >= 2
5595 return (a > b) | (a ==
b);
5601 return (b > a) | (a ==
b);
5607 for (
int i = 1; i < val.elements; ++i)
5608 cout <<
' ' << val[i];
5615 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5619 _mm256_mask_storeu_epi32 (values, __mmask8(~(0xff << n)),
m_simd);
5630 for (
int i = 0; i <
n; ++i)
5631 values[i] =
m_val[i];
5639 #if OIIO_AVX512VL_ENABLED
5640 _mm256_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xff),
m_simd);
5651 #if OIIO_AVX512VL_ENABLED
5652 _mm256_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xff),
m_simd);
5662 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
5664 #if OIIO_SIMD_AVX >= 2
5666 return _mm256_castps_si256 (_mm256_permutevar8x32_ps (_mm256_castsi256_ps(a.
simd()), index.
simd()));
5668 return vint8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
5673 return shuffle<i,i,i,i,i,i,i,i>(
a);
5679 #if OIIO_SIMD_AVX && !_WIN32
5680 return _mm256_extract_epi32(v.
simd(), i);
5689 #if OIIO_SIMD_AVX && !_WIN32
5690 return _mm256_insert_epi32 (a.
simd(),
val, i);
5712 return _mm256_castps_si256 (x.
simd());
5720 #if OIIO_SIMD_AVX >= 2
5722 vint8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_epi32(v.
simd(), _mm256_setzero_si256());
5723 vint8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_epi32(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_si256());
5725 vint8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
5726 vint8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
5727 return shuffle<0>(final_sum);
5728 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5730 return vint8(hadd4, hadd4);
5738 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
5747 #if OIIO_SSE_AVX >= 2
5748 vint8 ab = v & shuffle<1,1,3,3,5,5,7,7>(
v);
5749 vint8 abcd = ab & shuffle<2,2,2,2,6,6,6,6>(ab);
5750 vint8 abcdefgh = abcd & shuffle<4>(abcdefgh);
5751 return extract<0> (abcdefgh);
5760 #if OIIO_SSE_AVX >= 2
5761 vint8 ab = v | shuffle<1,1,3,3,5,5,7,7>(
v);
5762 vint8 abcd = ab | shuffle<2,2,2,2,6,6,6,6>(ab);
5763 vint8 abcdefgh = abcd | shuffle<4>(abcdefgh);
5764 return extract<0> (abcdefgh);
5774 return _mm256_castps_si256 (_mm256_blendv_ps (_mm256_castsi256_ps(a.
simd()),
5775 _mm256_castsi256_ps(b.
simd()), mask));
5789 return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.
simd()), mask));
5803 return _mm256_castps_si256 (_mm256_andnot_ps (mask.
simd(), _mm256_castsi256_ps(a.
simd())));
5813 return blend (b, a, mask);
5818 #if OIIO_SIMD_AVX >= 2
5819 return _mm256_abs_epi32(a.
simd());
5820 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5829 #if OIIO_SIMD_AVX >= 2
5830 return _mm256_min_epi32 (a, b);
5838 #if OIIO_SIMD_AVX >= 2
5839 return _mm256_max_epi32 (a, b);
5847 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5852 return (x<<s) |
srl(x,32-s);
5854 return (x<<s) |
srl(x,32-s);
5865 #if OIIO_SIMD_AVX >= 2
5866 return _mm256_andnot_si256 (a.
simd(), b.
simd());
5867 #elif OIIO_SIMD_AVX >= 1
5868 return _mm256_castps_si256 (_mm256_andnot_ps (_mm256_castsi256_ps(a.
simd()), _mm256_castsi256_ps(b.
simd())));
5919 #if OIIO_SIMD_AVX >= 512
5920 m_simd = _mm512_set1_epi32 (a);
5929 int v4,
int v5,
int v6,
int v7,
5930 int v8,
int v9,
int v10,
int v11,
5931 int v12,
int v13,
int v14,
int v15) {
5932 #if OIIO_SIMD_AVX >= 512
5933 m_simd = _mm512_setr_epi32 (v0, v1, v2, v3, v4, v5, v6, v7,
5934 v8, v9, v10, v11, v12, v13, v14, v15);
5957 #if OIIO_SIMD_AVX >= 512
5968 #if OIIO_SIMD_AVX >= 512
5969 m_simd = _mm512_maskz_loadu_epi32 (__mmask16(~(0xffff << n)), values);
5983 #if OIIO_SIMD_AVX >= 512
5984 m_simd = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)values));
5992 #if OIIO_SIMD_AVX >= 512
5993 m_simd = _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)values));
6002 #if OIIO_SIMD_AVX >= 512
6003 m_simd = _mm512_cvtepi8_epi32(_mm_loadu_si128((__m128i*)values));
6011 #if OIIO_SIMD_AVX >= 512
6012 m_simd = _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)values));
6023 int v4,
int v5,
int v6,
int v7,
6024 int v8,
int v9,
int v10,
int v11,
6025 int v12,
int v13,
int v14,
int v15) {
6026 load (v0, v1, v2, v3, v4, v5, v6, v7,
6027 v8, v9, v10, v11, v12, v13, v14, v15);
6040 #if OIIO_SIMD_AVX >= 512
6041 m_simd = _mm512_maskz_loadu_epi32 (mask, (
const simd_t *)values);
6050 #if OIIO_SIMD_AVX >= 512
6059 template <
int scale>
6062 #if OIIO_SIMD_AVX >= 512
6063 m_simd = _mm512_i32gather_epi32 (vindex, baseptr,
scale);
6073 #if OIIO_SIMD_AVX >= 512
6084 #if OIIO_SIMD_AVX >= 512
6085 _mm512_i32scatter_epi32 (baseptr, vindex,
m_simd,
scale);
6095 const vint_t& vindex)
const {
6096 #if OIIO_SIMD_AVX >= 512
6097 _mm512_mask_i32scatter_epi32 (baseptr, mask, vindex,
m_simd,
scale);
6106 #if OIIO_SIMD_AVX >= 512
6119 #if OIIO_SIMD_AVX >= 512
6120 m_simd = _mm512_setzero_si512();
6128 #if OIIO_SIMD_AVX >= 512
6129 return _mm512_setzero_epi32();
6141 return vint16 (start+0*step, start+1*step, start+2*step, start+3*step,
6142 start+4*step, start+5*step, start+6*step, start+7*step,
6143 start+8*step, start+9*step, start+10*step, start+11*step,
6144 start+12*step, start+13*step, start+14*step, start+15*step);
6149 return vint16 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
6150 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15);
6155 #if OIIO_SIMD_AVX >= 512
6156 return _mm512_castsi512_si256 (
simd());
6163 #if OIIO_SIMD_AVX >= 512
6164 return _mm512_extracti64x4_epi64 (
simd(), 1);
6172 #if OIIO_SIMD_AVX >= 512
6173 __m512i
r = _mm512_castsi256_si512 (lo);
6174 m_simd = _mm512_inserti32x8 (r, hi, 1);
6183 #if OIIO_SIMD_AVX >= 512
6184 m_simd = _mm512_broadcast_i32x4(a);
6196 #if OIIO_SIMD_AVX >= 512
6197 return _mm512_add_epi32 (a.
simd(), b.
simd());
6210 #if OIIO_SIMD_AVX >= 512
6211 return _mm512_sub_epi32 (_mm512_setzero_si512(), a);
6219 #if OIIO_SIMD_AVX >= 512
6220 return _mm512_sub_epi32 (a.
simd(), b.
simd());
6233 #if OIIO_SIMD_AVX >= 512
6234 return _mm512_mullo_epi32 (a.
simd(), b.
simd());
6269 #if OIIO_SIMD_AVX >= 512
6270 return _mm512_and_si512 (a.
simd(), b.
simd());
6279 #if OIIO_SIMD_AVX >= 512
6280 return _mm512_or_si512 (a.
simd(), b.
simd());
6289 #if OIIO_SIMD_AVX >= 512
6290 return _mm512_xor_si512 (a.
simd(), b.
simd());
6300 #if OIIO_SIMD_AVX >= 512
6309 #if OIIO_SIMD_AVX >= 512
6310 return _mm512_sllv_epi32 (a,
vint16(
int(bits)));
6314 return vint16 (a.
lo() << bits, a.
hi() << bits);
6320 return a = a << bits;
6324 #if OIIO_SIMD_AVX >= 512
6325 return _mm512_srav_epi32 (a,
vint16(
int(bits)));
6328 return vint16 (a.
lo() >> bits, a.
hi() >> bits);
6333 return a = a >> bits;
6338 #if OIIO_SIMD_AVX >= 512
6339 return _mm512_srlv_epi32 (a,
vint16(
int(bits)));
6348 #if OIIO_SIMD_AVX >= 512
6349 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 0 );
6357 #if OIIO_SIMD_AVX >= 512
6358 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 4 );
6366 #if OIIO_SIMD_AVX >= 512
6367 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 6 );
6375 #if OIIO_SIMD_AVX >= 512
6376 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 1 );
6384 #if OIIO_SIMD_AVX >= 512
6385 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 5 );
6393 #if OIIO_SIMD_AVX >= 512
6394 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 2 );
6403 for (
int i = 1; i < val.elements; ++i)
6404 cout <<
' ' << val[i];
6412 #if 0 && OIIO_SIMD_AVX >= 512
6416 _mm512_mask_storeu_epi32 (values, __mmask16(~(0xffff << n)),
m_simd);
6429 #if OIIO_SIMD_AVX512
6430 _mm512_mask_cvtepi32_storeu_epi16 (values, __mmask16(0xff),
m_simd);
6431 #elif OIIO_SIMD_AVX >= 2
6441 #if OIIO_SIMD_AVX512
6442 _mm512_mask_cvtepi32_storeu_epi8 (values, __mmask16(0xff),
m_simd);
6443 #elif OIIO_SIMD_AVX >= 2
6454 template<
int i0,
int i1,
int i2,
int i3>
6456 #if OIIO_SIMD_AVX >= 512
6457 __m512
x = _mm512_castsi512_ps(a);
6458 return _mm512_castps_si512(_mm512_shuffle_f32x4(x,x,_MM_SHUFFLE(i3,
i2,
i1,i0)));
6467 return shuffle4<i,i,i,i> (
a);
6470 template<
int i0,
int i1,
int i2,
int i3>
6472 #if OIIO_SIMD_AVX >= 512
6473 __m512
x = _mm512_castsi512_ps(a);
6474 return _mm512_castps_si512(_mm512_permute_ps(x,_MM_SHUFFLE(i3,
i2,
i1,i0)));
6478 return vint16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
6479 shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
6484 return shuffle<i,i,i,i> (
a);
6503 #if OIIO_SIMD_AVX >= 512
6504 return _mm_cvtsi128_si32(_mm512_castsi512_si128(
m_simd));
6521 #if OIIO_SIMD_AVX >= 512
6522 return _mm512_maskz_set1_epi32 (x, -1);
6530 #if OIIO_SIMD_AVX >= 512
6533 vint16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(
v);
6534 vint16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
6536 vint16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(
w);
6537 return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
6540 return vint16 (sum, sum);
6546 #if OIIO_SIMD_AVX >= 512
6555 #if OIIO_SIMD_AVX >= 512
6558 vint16 AB_AB_CD_CD = v & shuffle4<1,0,3,2>(
v);
6559 vint16 w = AB_AB_CD_CD & shuffle4<2,3,0,1>(AB_AB_CD_CD);
6561 vint16 ab_ab_cd_cd = w & shuffle<1,0,3,2>(
w);
6562 vint16 r = ab_ab_cd_cd & shuffle<2,3,0,1>(ab_ab_cd_cd);
6571 #if OIIO_SIMD_AVX >= 512
6574 vint16 AB_AB_CD_CD = v | shuffle4<1,0,3,2>(
v);
6575 vint16 w = AB_AB_CD_CD | shuffle4<2,3,0,1>(AB_AB_CD_CD);
6577 vint16 ab_ab_cd_cd = w | shuffle<1,0,3,2>(
w);
6578 vint16 r = ab_ab_cd_cd | shuffle<2,3,0,1>(ab_ab_cd_cd);
6588 #if OIIO_SIMD_AVX >= 512
6589 return _mm512_mask_blend_epi32 (mask, a, b);
6598 #if OIIO_SIMD_AVX >= 512
6599 return _mm512_maskz_mov_epi32 (mask, a);
6608 #if OIIO_SIMD_AVX >= 512
6609 return _mm512_maskz_mov_epi32 (!mask, a);
6617 return blend (b, a, mask);
6622 #if OIIO_SIMD_AVX >= 512
6623 return _mm512_abs_epi32(a.
simd());
6631 #if OIIO_SIMD_AVX >= 512
6632 return _mm512_min_epi32 (a, b);
6640 #if OIIO_SIMD_AVX >= 512
6641 return _mm512_max_epi32 (a, b);
6649 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6654 return (x<<s) |
srl(x,32-s);
6656 return (x<<s) |
srl(x,32-s);
6667 #if OIIO_SIMD_AVX >= 512
6668 return _mm512_andnot_epi32 (a.
simd(), b.
simd());
6696 #elif OIIO_SIMD_NEON
6706 return _mm_setzero_ps();
6717 return vfloat4 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step, start+3.0
f*step);
6723 m_simd = _mm_setzero_ps();
6743 m_simd = _mm_set1_ps (val);
6744 #elif OIIO_SIMD_NEON
6745 m_simd = vdupq_n_f32 (val);
6753 m_simd = _mm_set_ps (d, c, b, a);
6754 #elif OIIO_SIMD_NEON
6756 m_simd = vld1q_f32 (values);
6768 m_simd = _mm_loadu_ps (values);
6769 #elif OIIO_SIMD_NEON
6770 m_simd = vld1q_f32 (values);
6779 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6780 m_simd = _mm_maskz_loadu_ps (__mmask8(~(0xf << n)), values);
6784 m_simd = _mm_load_ss (values);
6788 m_simd = _mm_castpd_ps (_mm_load_sd ((
const double*)values));
6791 m_simd = _mm_setr_ps (values[0], values[1], values[2], 0.0
f);
6801 m_simd = _mm_loadu_ps (values);
6807 #elif OIIO_SIMD_NEON
6814 m_simd = vld1q_f32(values);
6822 for (
int i = 0; i <
n; ++i)
6823 m_val[i] = values[i];
6831 #if OIIO_SIMD_SSE >= 2
6842 #if OIIO_SIMD_SSE >= 2
6851 #if OIIO_SIMD_SSE >= 2
6860 #if OIIO_SIMD_SSE >= 2
6867 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6869 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6871 __m128i a = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
6872 m_simd = _mm_cvtph_ps (a);
6873 #elif OIIO_SIMD_SSE >= 2
6876 vint4 h ((
const unsigned short *)values);
6877 # define CONSTI(name) *(const __m128i *)&name
6878 # define CONSTF(name) *(const __m128 *)&name
6883 __m128i mnosign = CONSTI(mask_nosign);
6884 __m128i expmant = _mm_and_si128(mnosign,
h);
6885 __m128i justsign = _mm_xor_si128(
h, expmant);
6886 __m128i expmant2 = expmant;
6887 __m128i shifted = _mm_slli_epi32(expmant, 13);
6888 __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(
const __m128 *)&magic);
6889 __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONSTI(was_infnan));
6890 __m128i
sign = _mm_slli_epi32(justsign, 16);
6891 __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
6892 __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
6893 __m128
final = _mm_or_ps(scaled, sign_inf);
6898 #elif OIIO_SIMD_NEON
6899 vint4 h ((
const unsigned short *)values);
6900 uint32x4_t u32 = vreinterpretq_u32_s32(
h);
6901 uint16x4_t u16 = vmovn_u32(u32);
6902 float16x4_t f16 = vreinterpret_f16_u16(u16);
6903 m_simd = vcvt_f32_f16(f16);
6914 m_simd = _mm_loadh_pi(_mm_loadl_pi(
Zero(), (__m64*)lo), (__m64*)hi);
6928 _mm_storeu_ps (values,
m_simd);
6929 #elif OIIO_SIMD_NEON
6930 vst1q_f32 (values,
m_simd);
6938 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6942 _mm_mask_storeu_ps (values, __mmask8(~(0xf << n)),
m_simd);
6946 _mm_store_ss (values,
m_simd);
6950 _mm_store_sd ((
double*)values, _mm_castps_pd(
m_simd));
6953 values[0] =
m_val[0];
6954 values[1] =
m_val[1];
6955 values[2] =
m_val[2];
6968 #elif OIIO_SIMD_NEON
6971 vst1q_lane_f32 (values,
m_simd, 0);
6974 vst1q_lane_f32 (values++,
m_simd, 0);
6975 vst1q_lane_f32 (values,
m_simd, 1);
6978 vst1q_lane_f32 (values++,
m_simd, 0);
6979 vst1q_lane_f32 (values++,
m_simd, 1);
6980 vst1q_lane_f32 (values,
m_simd, 2);
6983 vst1q_f32 (values,
m_simd);
break;
6988 for (
int i = 0; i <
n; ++i)
6989 values[i] =
m_val[i];
6993 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6995 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6996 __m128i
h = _mm_cvtps_ph (
m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
6997 _mm_store_sd ((
double *)values, _mm_castsi128_pd(h));
6998 #elif OIIO_SIMD_NEON
6999 float16x4_t f16 = vcvt_f16_f32(
m_simd);
7000 uint16x4_t u16 = vreinterpret_u16_f16(f16);
7001 vst1_u16((
unsigned short*)values, u16);
7010 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7011 m_simd = _mm_maskz_loadu_ps (__mmask8(mask), (
const simd_t *)values);
7021 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7024 m_simd = _mm_maskload_ps (values, _mm_castps_si128(mask));
7032 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7033 _mm_mask_storeu_ps (values, __mmask8(mask),
m_simd);
7037 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
7043 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7044 _mm_mask_storeu_ps (values, __mmask8(mask.
bitmask()),
m_simd);
7046 _mm_maskstore_ps (values, _mm_castps_si128(mask.
simd()),
m_simd);
7048 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
7053 template <
int scale>
7057 #if OIIO_SIMD_AVX >= 2
7058 m_simd = _mm_i32gather_ps (baseptr, vindex,
scale);
7068 #if OIIO_SIMD_AVX >= 2
7079 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7081 _mm_i32scatter_ps (baseptr, vindex,
m_simd,
scale);
7090 const vint_t& vindex)
const
7092 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7104 #elif OIIO_SIMD_NEON
7114 #elif OIIO_SIMD_NEON
7124 return _mm_sub_ps (_mm_setzero_ps(),
m_simd);
7125 #elif OIIO_SIMD_NEON
7135 #elif OIIO_SIMD_NEON
7145 #elif OIIO_SIMD_NEON
7155 return _mm_mul_ps (a.
m_simd, _mm_set1_ps(b));
7156 #elif OIIO_SIMD_NEON
7157 return vmulq_n_f32 (a.
m_simd, b);
7170 #elif OIIO_SIMD_NEON
7180 #elif OIIO_SIMD_NEON
7191 #elif OIIO_SIMD_NEON
7202 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7212 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7223 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7234 #elif OIIO_SIMD_NEON
7244 #elif OIIO_SIMD_NEON
7255 #elif OIIO_SIMD_NEON
7265 #elif OIIO_SIMD_NEON
7275 #elif OIIO_SIMD_NEON
7285 #elif OIIO_SIMD_NEON
7296 return vfloat4 (a[0], a[1], b[0], b[1]);
7304 return vfloat4 (a[0], b[0], a[1], b[1]);
7309 return insert<3>(*
this, 0.0f);
7313 return insert<3>(*
this, 1.0f);
7318 for (
int i = 1; i < val.elements; ++i)
7319 cout <<
' ' << val[i];
7329 #elif OIIO_SIMD_NEON
7337 template<
int i0,
int i1,
int i2,
int i3>
7340 return shuffle_sse<i0,i1,i2,i3> (__m128(a));
7350 float32x2_t
t = vget_low_f32(a.simd());
return vdupq_lane_f32(t,0);
7353 float32x2_t t = vget_low_f32(a.simd());
return vdupq_lane_f32(t,1);
7356 float32x2_t t = vget_high_f32(a.simd());
return vdupq_lane_f32(t,0);
7359 float32x2_t t = vget_high_f32(a.simd());
return vdupq_lane_f32(t,1);
7369 return vfloat4(_mm_shuffle_ps(a, b, i));
7371 return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)],
7372 b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]);
7382 return _mm_cvtss_f32(shuffle_sse<i,i,i,i>(a.
simd()));
7383 #elif OIIO_SIMD_NEON
7384 return vgetq_lane_f32(a.
simd(), i);
7392 return _mm_cvtss_f32(a.simd());
7400 #if OIIO_SIMD_SSE >= 4
7401 return _mm_insert_ps (a, _mm_set_ss(val), i<<4);
7402 #elif OIIO_SIMD_NEON
7403 return vld1q_lane_f32(&val, a.
simd(), i);
7414 return _mm_move_ss (a.simd(), _mm_set_ss(
val));
7432 return _mm_castps_si128 (x.
simd());
7433 #elif OIIO_SIMD_NEON
7434 return vreinterpretq_s32_f32 (x.
simd());
7436 return *(
vint4 *)&x;
7443 return _mm_castsi128_ps (x.
simd());
7444 #elif OIIO_SIMD_NEON
7445 return vreinterpretq_f32_s32 (x.
simd());
7459 #if OIIO_SIMD_SSE >= 3
7472 vfloat4 ab_ab_cd_cd = shuffle<1,0,3,2>(
v) + v;
7474 vfloat4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
7476 vfloat4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;
7478 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7479 return vfloat4(vaddvq_f32(v));
7481 return vfloat4 (v[0] + v[1] + v[2] + v[3]);
7489 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7490 return vaddvq_f32(v);
7492 return v[0] + v[1] + v[2] + v[3];
7497 #if OIIO_SIMD_SSE >= 4
7498 return _mm_dp_ps (a.
simd(), b.
simd(), 0xff);
7499 #elif OIIO_SIMD_NEON
7500 float32x4_t ab = vmulq_f32(a, b);
7501 float32x4_t sum1 = vaddq_f32(ab, vrev64q_f32(ab));
7502 return vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
7509 #if OIIO_SIMD_SSE >= 4
7510 return _mm_cvtss_f32 (_mm_dp_ps (a.
simd(), b.
simd(), 0xff));
7517 #if OIIO_SIMD_SSE >= 4
7518 return _mm_dp_ps (a.
simd(), b.
simd(), 0x7f);
7525 #if OIIO_SIMD_SSE >= 4
7526 return _mm_cvtss_f32 (_mm_dp_ps (a.
simd(), b.
simd(), 0x77));
7535 #if OIIO_SIMD_SSE >= 4
7537 return _mm_blendv_ps (a.
simd(), b.
simd(), mask.
simd());
7540 return _mm_or_ps (_mm_and_ps(mask.
simd(), b.
simd()),
7541 _mm_andnot_ps(mask.
simd(), a.
simd()));
7542 #elif OIIO_SIMD_NEON
7545 return vfloat4 (mask[0] ? b[0] : a[0],
7546 mask[1] ? b[1] : a[1],
7547 mask[2] ? b[2] : a[2],
7548 mask[3] ? b[3] : a[3]);
7556 return _mm_and_ps(mask.
simd(), a.
simd());
7558 return vfloat4 (mask[0] ? a[0] : 0.0
f,
7559 mask[1] ? a[1] : 0.0f,
7560 mask[2] ? a[2] : 0.0f,
7561 mask[3] ? a[3] : 0.0f);
7569 return _mm_andnot_ps(mask.
simd(), a.
simd());
7571 return vfloat4 (mask[0] ? 0.0
f : a[0],
7572 mask[1] ? 0.0
f : a[1],
7573 mask[2] ? 0.0
f : a[2],
7574 mask[3] ? 0.0
f : a[3]);
7583 return vfloat4 (b[0] == 0.0
f ? 0.0
f : a[0] / b[0],
7584 b[1] == 0.0
f ? 0.0
f : a[1] / b[1],
7585 b[2] == 0.0
f ? 0.0
f : a[2] / b[2],
7586 b[3] == 0.0
f ? 0.0
f : a[3] / b[3]);
7605 return blend (b, a, mask);
7613 return _mm_and_ps (a.
simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
7614 #elif OIIO_SIMD_NEON
7615 return vabsq_f32(a.
simd());
7631 #if OIIO_SIMD_SSE >= 4
7632 return _mm_ceil_ps (a);
7640 #if OIIO_SIMD_SSE >= 4
7641 return _mm_floor_ps (a);
7649 #if OIIO_SIMD_SSE >= 4
7650 return _mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
7651 #elif OIIO_SIMD_NEON
7652 return vrndnq_f32(a);
7661 #if OIIO_SIMD_SSE >= 4
7677 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
7681 #elif OIIO_SIMD_AVX512
7683 vfloat16 r = _mm512_rcp14_ps(_mm512_castps128_ps512(a));
7684 return _mm512_castps512_ps128(r);
7697 return _mm_sqrt_ps (a.
simd());
7698 #elif OIIO_SIMD_NEON
7699 return vsqrtq_f32 (a.
simd());
7709 return _mm_div_ps (_mm_set1_ps(1.0
f), _mm_sqrt_ps (a.
simd()));
7710 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7711 return vdivq_f32(vdupq_n_f32(1.0
f), vsqrtq_f32(a));
7720 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
7722 return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
7723 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7725 return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
7727 return _mm_rsqrt_ps (a.
simd());
7737 return _mm_min_ps (a, b);
7738 #elif OIIO_SIMD_NEON
7739 return vminq_f32(a, b);
7748 return _mm_max_ps (a, b);
7749 #elif OIIO_SIMD_NEON
7750 return vmaxq_f32(a, b);
7759 return _mm_andnot_ps (a.
simd(), b.
simd());
7774 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7776 return _mm_fmadd_ps (a, b, c);
7777 #elif OIIO_SIMD_NEON
7779 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7794 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7796 return _mm_fmsub_ps (a, b, c);
7797 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7813 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7815 return _mm_fnmadd_ps (a, b, c);
7816 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7832 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7834 return _mm_fnmsub_ps (a, b, c);
7835 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7842 return -(a *
b) - c;
7849 template<
typename T>
7857 typedef typename T::vint_t int_t;
7859 const float exp_hi (88.3762626647949
f);
7860 const float exp_lo (-88.3762626647949
f);
7861 const float cephes_LOG2EF (1.44269504088896341
f);
7862 const float cephes_exp_C1 (0.693359375
f);
7863 const float cephes_exp_C2 (-2.12194440e-4
f);
7864 const float cephes_exp_p0 (1.9875691500E-4
f);
7865 const float cephes_exp_p1 (1.3981999507E-3
f);
7866 const float cephes_exp_p2 (8.3334519073E-3
f);
7867 const float cephes_exp_p3 (4.1665795894E-2
f);
7868 const float cephes_exp_p4 (1.6666665459E-1
f);
7869 const float cephes_exp_p5 (5.0000001201E-1
f);
7872 x =
min (x,
T(exp_hi));
7873 x =
max (x,
T(exp_lo));
7874 T fx =
madd (x,
T(cephes_LOG2EF),
T(0.5
f));
7875 int_t emm0 = int_t(fx);
7879 tmp = fx * cephes_exp_C1;
7880 T z = fx * cephes_exp_C2;
7884 T y = cephes_exp_p0;
7885 y =
madd (y, x, cephes_exp_p1);
7886 y =
madd (y, x, cephes_exp_p2);
7887 y =
madd (y, x, cephes_exp_p3);
7888 y =
madd (y, x, cephes_exp_p4);
7889 y =
madd (y, x, cephes_exp_p5);
7892 emm0 = (int_t(fx) + int_t(0x7f)) << 23;
7904 template<
typename T>
7912 typedef typename T::vint_t int_t;
7913 typedef typename T::vbool_t bool_t;
7918 bool_t invalid_mask = (x <= zero);
7919 const int min_norm_pos ((
int)0x00800000);
7920 const int inv_mant_mask ((
int)~0x7f800000);
7926 emm0 = emm0 - int_t(0x7f);
7930 const float cephes_SQRTHF (0.707106781186547524
f);
7931 bool_t
mask = (x <
T(cephes_SQRTHF));
7937 const float cephes_log_p0 (7.0376836292E-2
f);
7938 const float cephes_log_p1 (- 1.1514610310E-1
f);
7939 const float cephes_log_p2 (1.1676998740E-1
f);
7940 const float cephes_log_p3 (- 1.2420140846E-1
f);
7941 const float cephes_log_p4 (+ 1.4249322787E-1
f);
7942 const float cephes_log_p5 (- 1.6668057665E-1
f);
7943 const float cephes_log_p6 (+ 2.0000714765E-1
f);
7944 const float cephes_log_p7 (- 2.4999993993E-1
f);
7945 const float cephes_log_p8 (+ 3.3333331174E-1
f);
7946 const float cephes_log_q1 (-2.12194440e-4
f);
7947 const float cephes_log_q2 (0.693359375
f);
7948 T y = cephes_log_p0;
7949 y =
madd (y, x,
T(cephes_log_p1));
7950 y =
madd (y, x,
T(cephes_log_p2));
7951 y =
madd (y, x,
T(cephes_log_p3));
7952 y =
madd (y, x,
T(cephes_log_p4));
7953 y =
madd (y, x,
T(cephes_log_p5));
7954 y =
madd (y, x,
T(cephes_log_p6));
7955 y =
madd (y, x,
T(cephes_log_p7));
7956 y =
madd (y, x,
T(cephes_log_p8));
7959 y =
madd(e,
T(cephes_log_q1), y);
7962 x =
madd (e,
T(cephes_log_q2), x);
7977 vfloat4 A (a[0], b[0], c[0], d[0]);
7978 vfloat4 B (a[1], b[1], c[1], d[1]);
7979 vfloat4 C (a[2], b[2], c[2], d[2]);
7980 vfloat4 D (a[3], b[3], c[3], d[3]);
7981 a =
A; b =
B; c = C; d = D;
7991 auto l02 = _mm_unpacklo_ps (a, c);
7992 auto h02 = _mm_unpackhi_ps (a, c);
7993 auto l13 = _mm_unpacklo_ps (b, d);
7994 auto h13 = _mm_unpackhi_ps (b, d);
7995 r0 =
vfloat4(_mm_unpacklo_ps (l02, l13));
7996 r1 =
vfloat4(_mm_unpackhi_ps (l02, l13));
7997 r2 =
vfloat4(_mm_unpacklo_ps (h02, h13));
7998 r3 =
vfloat4(_mm_unpackhi_ps (h02, h13));
8000 r0.
load (a[0], b[0], c[0], d[0]);
8001 r1.
load (a[1], b[1], c[1], d[1]);
8002 r2.
load (a[2], b[2], c[2], d[2]);
8003 r3.
load (a[3], b[3], c[3], d[3]);
8011 __m128
A = _mm_castsi128_ps (a);
8012 __m128
B = _mm_castsi128_ps (b);
8013 __m128 C = _mm_castsi128_ps (c);
8014 __m128 D = _mm_castsi128_ps (d);
8015 _MM_TRANSPOSE4_PS (A, B, C, D);
8016 a = _mm_castps_si128 (A);
8017 b = _mm_castps_si128 (B);
8018 c = _mm_castps_si128 (C);
8019 d = _mm_castps_si128 (D);
8021 vint4 A (a[0], b[0], c[0], d[0]);
8022 vint4 B (a[1], b[1], c[1], d[1]);
8023 vint4 C (a[2], b[2], c[2], d[2]);
8024 vint4 D (a[3], b[3], c[3], d[3]);
8025 a =
A; b =
B; c = C; d = D;
8034 __m128
A = _mm_castsi128_ps (a);
8035 __m128
B = _mm_castsi128_ps (b);
8036 __m128 C = _mm_castsi128_ps (c);
8037 __m128 D = _mm_castsi128_ps (d);
8038 _MM_TRANSPOSE4_PS (A, B, C, D);
8039 r0 = _mm_castps_si128 (A);
8040 r1 = _mm_castps_si128 (B);
8041 r2 = _mm_castps_si128 (C);
8042 r3 = _mm_castps_si128 (D);
8044 r0.
load (a[0], b[0], c[0], d[0]);
8045 r1.
load (a[1], b[1], c[1], d[1]);
8046 r2.
load (a[2], b[2], c[2], d[2]);
8047 r3.
load (a[3], b[3], c[3], d[3]);
8056 vfloat4 l02 = _mm_unpacklo_ps (a, c);
8057 vfloat4 l13 = _mm_unpacklo_ps (b, d);
8058 return _mm_unpacklo_ps (l02, l13);
8060 return vfloat4 (a[0], b[0], c[0], d[0]);
8069 vint4 l02 = _mm_unpacklo_epi32 (a, c);
8070 vint4 l13 = _mm_unpacklo_epi32 (b, d);
8071 return _mm_unpacklo_epi32 (l02, l13);
8073 return vint4 (a[0], b[0], c[0], d[0]);
8083 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
8091 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
8103 return vfloat3 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step);
8116 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
8120 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
8124 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
8128 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
8131 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8133 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
8145 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8157 *
this = *
this +
a;
return *
this;
8169 *
this = *
this -
a;
return *
this;
8185 *
this = *
this *
a;
return *
this;
8189 *
this = *
this *
a;
return *
this;
8197 *
this = *
this /
a;
return *
this;
8201 *
this = *
this /
a;
return *
this;
8207 for (
int i = 1; i < val.elements; ++i)
8208 cout <<
' ' << val[i];
8217 return vfloat3(_mm_and_ps (a.
simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
8218 #elif OIIO_SIMD_NEON
8235 #if OIIO_SIMD_SSE >= 4
8236 return vfloat3(_mm_ceil_ps (a));
8244 #if OIIO_SIMD_SSE >= 4
8245 return vfloat3(_mm_floor_ps (a));
8253 #if OIIO_SIMD_SSE >= 4
8254 return vfloat3(_mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)));
8265 return vfloat3 (v[0] + v[1] + v[2]);
8271 #if OIIO_SIMD_SSE >= 4
8280 #if OIIO_SIMD_SSE >= 4
8281 return _mm_cvtss_f32 (_mm_dp_ps (a.
simd(), b.
simd(), 0x77));
8285 return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
8291 #if OIIO_SIMD_SSE >= 4
8301 return dot(*
this, *
this);
8307 return sqrtf(
dot(*
this, *
this));
8317 float len2 =
dot (*
this, *
this);
8318 return len2 > 0.0f ? (*this) / sqrtf(len2) :
vfloat3::Zero();
8328 return vfloat3 ((*
this) * invlen);
8330 float len2 =
dot (*
this, *
this);
8331 return len2 > 0.0f ? (*this) / sqrtf(len2) :
vfloat3::Zero();
8341 #ifdef INCLUDED_IMATHMATRIX_H
8381 R = R / shuffle<3>(R);
8386 b = V[0] * m_vals[0][1] + V[1] * m_vals[1][1] + V[2] * m_vals[2][1] + m_vals[3][1];
8387 c = V[0] * m_vals[0][2] + V[1] * m_vals[1][2] + V[2] * m_vals[2][2] + m_vals[3][2];
8388 w = V[0] * m_vals[0][3] + V[1] * m_vals[1][3] + V[2] * m_vals[2][3] + m_vals[3][3];
8389 return vfloat3(a / w, b / w, c / w);
8396 shuffle<2>(V) *
m_row[2];
8401 b = V[0] * m_vals[0][1] + V[1] * m_vals[1][1] + V[2] * m_vals[2][1];
8402 c = V[0] * m_vals[0][2] + V[1] * m_vals[1][2] + V[2] * m_vals[2][2];
8410 vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8411 shuffle<2>(V) * T[2];
8416 b = V[0] * m_vals[1][0] + V[1] * m_vals[1][1] + V[2] * m_vals[1][2];
8417 c = V[0] * m_vals[2][0] + V[1] * m_vals[2][1] + V[2] * m_vals[2][2];
8425 return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8426 shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8429 a = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0] + V[3] * M[3][0];
8430 b = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1] + V[3] * M[3][1];
8431 c = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2] + V[3] * M[3][2];
8432 w = V[0] * M[0][3] + V[1] * M[1][3] + V[2] * M[2][3] + V[3] * M[3][3];
8439 #if OIIO_SIMD_SSE >= 3
8444 vfloat4 s01 = _mm_hadd_ps(m0v, m1v);
8446 vfloat4 s23 = _mm_hadd_ps(m2v, m3v);
8468 return memcmp(
this, &m, 16*
sizeof(
float)) == 0;
8480 return memcmp(
this, &m, 16*
sizeof(
float)) != 0;
8507 vfloat4 minor0, minor1, minor2, minor3;
8511 vfloat4 row0, row1, row2, row3;
8512 const float *
src = (
const float *)&msrc;
8515 row0 = shuffle<0x88>(tmp1, row1);
8516 row1 = shuffle<0xDD>(row1, tmp1);
8519 row2 = shuffle<0x88>(tmp1, row3);
8520 row3 = shuffle<0xDD>(row3, tmp1);
8525 vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]);
8527 vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]);
8538 tmp1 = shuffle<1,0,3,2>(tmp1);
8539 minor0 = row1 * tmp1;
8540 minor1 = row0 * tmp1;
8541 tmp1 = shuffle<2,3,0,1>(tmp1);
8542 minor0 = (row1 * tmp1) - minor0;
8543 minor1 = (row0 * tmp1) - minor1;
8544 minor1 = shuffle<2,3,0,1>(minor1);
8547 tmp1 = shuffle<1,0,3,2>(tmp1);
8548 minor0 = (row3 * tmp1) + minor0;
8549 minor3 = row0 * tmp1;
8550 tmp1 = shuffle<2,3,0,1>(tmp1);
8551 minor0 = minor0 - (row3 * tmp1);
8552 minor3 = (row0 * tmp1) - minor3;
8553 minor3 = shuffle<2,3,0,1>(minor3);
8555 tmp1 = shuffle<2,3,0,1>(row1) * row3;
8556 tmp1 = shuffle<1,0,3,2>(tmp1);
8557 row2 = shuffle<2,3,0,1>(row2);
8558 minor0 = (row2 * tmp1) + minor0;
8559 minor2 = row0 * tmp1;
8560 tmp1 = shuffle<2,3,0,1>(tmp1);
8561 minor0 = minor0 - (row2 * tmp1);
8562 minor2 = (row0 * tmp1) - minor2;
8563 minor2 = shuffle<2,3,0,1>(minor2);
8566 tmp1 = shuffle<1,0,3,2>(tmp1);
8567 minor2 = (row3 * tmp1) + minor2;
8568 minor3 = (row2 * tmp1) - minor3;
8569 tmp1 = shuffle<2,3,0,1>(tmp1);
8570 minor2 = (row3 * tmp1) - minor2;
8571 minor3 = minor3 - (row2 * tmp1);
8574 tmp1 = shuffle<1,0,3,2>(tmp1);
8575 minor1 = minor1 - (row2 * tmp1);
8576 minor2 = (row1 * tmp1) + minor2;
8577 tmp1 = shuffle<2,3,0,1>(tmp1);
8578 minor1 = (row2 * tmp1) + minor1;
8579 minor2 = minor2 - (row1 * tmp1);
8582 tmp1 = shuffle<1,0,3,2>(tmp1);
8583 minor1 = (row3 * tmp1) + minor1;
8584 minor3 = minor3 - (row1 * tmp1);
8585 tmp1 = shuffle<2,3,0,1>(tmp1);
8586 minor1 = minor1 - (row3 * tmp1);
8587 minor3 = (row1 * tmp1) + minor3;
8589 det = row0 * minor0;
8591 float tmp1_0 = 1.0f / det0;
8592 det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
8594 return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
8600 const float *m = (
const float *)&M;
8602 for (
int i = 1; i < 16; ++i)
8603 cout <<
' ' << m[i];
8656 for (
int i = 1; i < val.elements; ++i)
8657 cout <<
' ' << val[i];
8664 return _mm256_castps256_ps128 (
simd());
8672 return _mm256_extractf128_ps (
simd(), 1);
8681 __m256
r = _mm256_castps128_ps256 (lo);
8682 m_simd = _mm256_insertf128_ps (r, hi, 1);
8694 m_simd = _mm256_cvtepi32_ps (ival);
8703 return _mm256_setzero_ps();
8714 return vfloat8 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step, start+3.0
f*step,
8715 start+4.0
f*step, start+5.0
f*step, start+6.0
f*step, start+7.0
f*step);
8721 m_simd = _mm256_setzero_ps();
8731 m_simd = _mm256_set1_ps (val);
8732 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8741 float e,
float f,
float g,
float h) {
8743 m_simd = _mm256_set_ps (h, g, f, e, d, c, b, a);
8744 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8762 m_simd = _mm256_loadu_ps (values);
8763 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8774 #if 0 && OIIO_AVX512VL_ENABLED
8778 m_simd = _mm256_maskz_loadu_ps ((~(0xff << n)), values);
8779 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8783 hi.
load (values+4, n-4);
8788 lo.
load (values, n);
8794 for (
int i = 0; i <
n; ++i)
8795 m_val[i] = values[i];
8806 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8819 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8831 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8843 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8851 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8853 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8855 vint4 a ((
const int *)values);
8856 m_simd = _mm256_cvtph_ps (a);
8857 #elif OIIO_SIMD_SSE >= 2 || OIIO_SIMD_NEON
8872 _mm256_storeu_ps (values,
m_simd);
8873 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8884 #if 0 && OIIO_AVX512VL_ENABLED
8888 _mm256_mask_storeu_ps (values, __mmask8(~(0xff << n)),
m_simd);
8889 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8892 }
else if (n <= 8) {
8897 for (
int i = 0; i <
n; ++i)
8898 values[i] =
m_val[i];
8902 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8904 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8905 __m128i h = _mm256_cvtps_ph (
m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
8906 _mm_storeu_si128 ((__m128i *)values, h);
8907 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8918 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8919 m_simd = _mm256_maskz_loadu_ps (__mmask8(mask), (
const simd_t *)values);
8929 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8932 m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(mask));
8940 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8941 _mm256_mask_storeu_ps (values, __mmask8(mask),
m_simd);
8945 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
8951 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8952 _mm256_mask_storeu_ps (values, __mmask8(mask.
bitmask()),
m_simd);
8954 _mm256_maskstore_ps (values, _mm256_castps_si256(mask.
simd()),
m_simd);
8956 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
8961 template <
int scale>
8965 #if OIIO_SIMD_AVX >= 2
8966 m_simd = _mm256_i32gather_ps (baseptr, vindex,
scale);
8976 #if OIIO_SIMD_AVX >= 2
8987 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8988 _mm256_i32scatter_ps (baseptr, vindex,
m_simd,
scale);
8997 const vint_t& vindex)
const
8999 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9010 return _mm256_add_ps (a, b);
9022 return _mm256_sub_ps (_mm256_setzero_ps(), a);
9030 return _mm256_sub_ps (a, b);
9042 return _mm256_mul_ps (a.
m_simd, _mm256_set1_ps(b));
9054 return _mm256_mul_ps (a, b);
9066 return _mm256_div_ps (a, b);
9078 return _mm256_cmp_ps (a, b, _CMP_EQ_OQ);
9086 return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ);
9094 return _mm256_cmp_ps (a, b, _CMP_LT_OQ);
9102 return _mm256_cmp_ps (a, b, _CMP_GT_OQ);
9110 return _mm256_cmp_ps (a, b, _CMP_GE_OQ);
9118 return _mm256_cmp_ps (a, b, _CMP_LE_OQ);
9129 m_simd = _mm256_cvttps_epi32(f);
9130 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9138 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
9140 #if OIIO_SIMD_AVX >= 2
9142 return _mm256_permutevar8x32_ps (a, index);
9144 return vfloat8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
9149 #if OIIO_SIMD_AVX >= 2
9150 return _mm256_permutevar8x32_ps (a,
vint8(i));
9152 return shuffle<i,i,i,i,i,i,i,i>(
a);
9159 #if OIIO_SIMD_AVX_NO_FIXME
9162 _m128 f4 = _mm256_extractf128_ps (i >> 2);
9164 return _mm_cvtss_f32(shuffle_sse<j,j,j,j>(a.simd()));
9173 #if OIIO_SIMD_AVX_NO_FIXME
9174 return _mm256_insert_epi32 (a, val, i);
9196 return _mm256_castps_si256 (x.
simd());
9205 return _mm256_castsi256_ps (x.
simd());
9215 vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.
simd(), _mm256_setzero_ps());
9216 vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
9218 vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
9219 vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
9220 return shuffle<0>(final_sum);
9229 #if OIIO_SIMD_AVX >= 2
9240 return _mm256_blendv_ps (a, b, mask);
9241 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9253 return _mm256_and_ps(mask, a);
9254 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9266 return _mm256_andnot_ps(mask, a);
9267 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9278 return blend (b, a, mask);
9295 return _mm256_and_ps (a.
simd(), _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
9296 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9314 return _mm256_ceil_ps (a);
9323 return _mm256_floor_ps (a);
9332 return _mm256_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9360 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
9375 return _mm256_sqrt_ps (a.
simd());
9386 return _mm256_div_ps (_mm256_set1_ps(1.0
f), _mm256_sqrt_ps (a.
simd()));
9396 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9398 return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
9399 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9401 return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
9403 return _mm256_rsqrt_ps (a.
simd());
9416 return _mm256_min_ps (a, b);
9425 return _mm256_max_ps (a, b);
9434 return _mm256_andnot_ps (a.
simd(), b.
simd());
9453 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9455 return _mm256_fmadd_ps (a, b, c);
9456 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9469 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9471 return _mm256_fmsub_ps (a, b, c);
9472 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9486 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9488 return _mm256_fnmadd_ps (a, b, c);
9489 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9503 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9505 return _mm256_fnmsub_ps (a, b, c);
9506 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9511 return -(a *
b) - c;
9534 for (
int i = 1; i < val.elements; ++i)
9535 cout <<
' ' << val[i];
9541 #if OIIO_SIMD_AVX >= 512
9542 return _mm512_castps512_ps256 (
simd());
9549 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512DQ_ENABLED
9550 return _mm512_extractf32x8_ps (
simd(), 1);
9558 float v4,
float v5,
float v6,
float v7,
9559 float v8,
float v9,
float v10,
float v11,
9560 float v12,
float v13,
float v14,
float v15) {
9561 load (v0, v1, v2, v3, v4, v5, v6, v7,
9562 v8, v9, v10, v11, v12, v13, v14, v15);
9566 #if OIIO_SIMD_AVX >= 512
9567 __m512
r = _mm512_castps256_ps512 (lo);
9568 m_simd = _mm512_insertf32x8 (r, hi, 1);
9576 #if OIIO_SIMD_AVX >= 512
9577 m_simd = _mm512_broadcast_f32x4(a);
9589 #if OIIO_SIMD_AVX >= 512
9590 m_simd = _mm512_cvtepi32_ps (ival);
9598 #if OIIO_SIMD_AVX >= 512
9599 return _mm512_setzero_ps();
9610 return vfloat16 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step, start+3.0
f*step,
9611 start+4.0
f*step, start+5.0
f*step, start+6.0
f*step, start+7.0
f*step,
9612 start+8.0
f*step, start+9.0
f*step, start+10.0
f*step, start+11.0
f*step,
9613 start+12.0
f*step, start+13.0
f*step, start+14.0
f*step, start+15.0
f*step);
9618 #if OIIO_SIMD_AVX >= 512
9619 m_simd = _mm512_setzero_ps();
9627 #if OIIO_SIMD_AVX >= 512
9628 m_simd = _mm512_set1_ps (a);
9637 float v4,
float v5,
float v6,
float v7,
9638 float v8,
float v9,
float v10,
float v11,
9639 float v12,
float v13,
float v14,
float v15) {
9640 #if OIIO_SIMD_AVX >= 512
9641 m_simd = _mm512_setr_ps (v0, v1, v2, v3, v4, v5, v6, v7,
9642 v8, v9, v10, v11, v12, v13, v14, v15);
9665 #if OIIO_SIMD_AVX >= 512
9666 m_simd = _mm512_loadu_ps (values);
9677 #if OIIO_SIMD_AVX >= 512
9678 m_simd = _mm512_maskz_loadu_ps (__mmask16(~(0xffff << n)), values);
9692 #if OIIO_SIMD_AVX >= 512
9703 #if OIIO_SIMD_AVX >= 512
9714 #if OIIO_SIMD_AVX >= 512
9724 #if OIIO_SIMD_AVX >= 512
9733 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
9735 #if OIIO_SIMD_AVX >= 512
9737 vint8 a ((
const int *)values);
9738 m_simd = _mm512_cvtph_ps (a);
9749 #if OIIO_SIMD_AVX >= 512
9753 _mm512_storeu_ps (values,
m_simd);
9764 #if 0 && OIIO_SIMD_AVX >= 512
9768 _mm512_mask_storeu_ps (values, __mmask16(~(0xffff << n)),
m_simd);
9772 }
else if (n < 16) {
9781 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
9783 #if OIIO_SIMD_AVX >= 512
9784 __m256i h = _mm512_cvtps_ph (
m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9785 _mm256_storeu_si256 ((__m256i *)values, h);
9795 #if OIIO_SIMD_AVX >= 512
9796 m_simd = _mm512_maskz_loadu_ps (mask, (
const simd_t *)values);
9805 #if OIIO_SIMD_AVX >= 512
9815 template <
int scale>
9819 #if OIIO_SIMD_AVX >= 512
9820 m_simd = _mm512_i32gather_ps (vindex, baseptr,
scale);
9831 #if OIIO_SIMD_AVX >= 512
9843 #if OIIO_SIMD_AVX >= 512
9844 _mm512_i32scatter_ps (baseptr, vindex,
m_simd,
scale);
9854 const vint_t& vindex)
const
9856 #if OIIO_SIMD_AVX >= 512
9857 _mm512_mask_i32scatter_ps (baseptr, mask, vindex,
m_simd,
scale);
9867 #if OIIO_SIMD_AVX >= 512
9879 #if OIIO_SIMD_AVX >= 512
9880 return _mm512_sub_ps (_mm512_setzero_ps(), a.
simd());
9887 #if OIIO_SIMD_AVX >= 512
9900 #if OIIO_SIMD_AVX >= 512
9901 return _mm512_mul_ps (a.
m_simd, _mm512_set1_ps(b));
9912 #if OIIO_SIMD_AVX >= 512
9924 #if OIIO_SIMD_AVX >= 512
9937 #if OIIO_SIMD_AVX >= 512
9938 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_EQ_OQ);
9946 #if OIIO_SIMD_AVX >= 512
9947 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_NEQ_OQ);
9955 #if OIIO_SIMD_AVX >= 512
9956 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_LT_OQ);
9964 #if OIIO_SIMD_AVX >= 512
9965 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_GT_OQ);
9973 #if OIIO_SIMD_AVX >= 512
9974 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_GE_OQ);
9982 #if OIIO_SIMD_AVX >= 512
9983 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_LE_OQ);
9993 #if OIIO_SIMD_AVX >= 512
9994 m_simd = _mm512_cvttps_epi32(f);
10003 template<
int i0,
int i1,
int i2,
int i3>
10005 #if OIIO_SIMD_AVX >= 512
10006 return _mm512_shuffle_f32x4(a,a,_MM_SHUFFLE(i3,
i2,
i1,i0));
10009 a.
store ((
float *)x);
10015 return shuffle4<i,i,i,i> (
a);
10018 template<
int i0,
int i1,
int i2,
int i3>
10020 #if OIIO_SIMD_AVX >= 512
10021 return _mm512_permute_ps(a,_MM_SHUFFLE(i3,
i2,
i1,i0));
10024 a.
store ((
float *)x);
10025 return vfloat16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
10026 shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
10031 return shuffle<i,i,i,i> (
a);
10050 #if OIIO_SIMD_AVX >= 512
10051 return _mm_cvtss_f32(_mm512_castps512_ps128(
m_simd));
10068 #if OIIO_SIMD_AVX >= 512
10069 return _mm512_castps_si512 (x.
simd());
10077 #if OIIO_SIMD_AVX >= 512
10078 return _mm512_castsi512_ps (x.
simd());
10086 #if OIIO_SIMD_AVX >= 512
10089 vfloat16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(
v);
10090 vfloat16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
10092 vfloat16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(
w);
10093 return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
10102 #if OIIO_SIMD_AVX >= 512
10112 #if OIIO_SIMD_AVX >= 512
10113 return _mm512_mask_blend_ps (mask, a, b);
10123 #if OIIO_SIMD_AVX >= 512
10124 return _mm512_maskz_mov_ps (mask, a);
10134 #if OIIO_SIMD_AVX >= 512
10135 return _mm512_maskz_mov_ps (!mask, a);
10145 return blend (b, a, mask);
10160 #if OIIO_SIMD_AVX >= 512
10163 return _mm512_castsi512_ps (_mm512_and_epi32 (_mm512_castps_si512(a.
simd()),
10164 _mm512_set1_epi32(0x7fffffff)));
10180 #if OIIO_SIMD_AVX >= 512
10181 return _mm512_ceil_ps (a);
10189 #if OIIO_SIMD_AVX >= 512
10190 return _mm512_floor_ps (a);
10199 #if OIIO_SIMD_AVX >= 512
10200 return _mm512_roundscale_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
10208 #if OIIO_SIMD_AVX >= 512
10209 return _mm512_cvt_roundps_epi32 (a, (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC));
10224 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
10225 return _mm512_rcp28_ps(a);
10226 #elif OIIO_SIMD_AVX >= 512
10237 #if OIIO_SIMD_AVX >= 512
10238 return _mm512_sqrt_ps (a);
10247 #if OIIO_SIMD_AVX >= 512
10248 return _mm512_div_ps (_mm512_set1_ps(1.0
f), _mm512_sqrt_ps (a));
10257 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
10258 return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
10259 #elif OIIO_SIMD_AVX >= 512
10260 return _mm512_rsqrt14_ps (a);
10269 #if OIIO_SIMD_AVX >= 512
10270 return _mm512_min_ps (a, b);
10278 #if OIIO_SIMD_AVX >= 512
10279 return _mm512_max_ps (a, b);
10287 #if OIIO_SIMD_AVX >= 512 && defined(__AVX512DQ__)
10288 return _mm512_andnot_ps (a, b);
10298 #if OIIO_SIMD_AVX >= 512
10299 return _mm512_fmadd_ps (a, b, c);
10310 #if OIIO_SIMD_AVX >= 512
10311 return _mm512_fmsub_ps (a, b, c);
10323 #if OIIO_SIMD_AVX >= 512
10324 return _mm512_fnmadd_ps (a, b, c);
10336 #if OIIO_SIMD_AVX >= 512
10337 return _mm512_fnmsub_ps (a, b, c);
10356 : OIIO::pvt::index_formatter<OIIO::simd::vfloat3> {};
10358 : OIIO::pvt::index_formatter<OIIO::simd::vfloat4> {};
10360 : OIIO::pvt::index_formatter<OIIO::simd::vfloat8> {};
10362 : OIIO::pvt::index_formatter<OIIO::simd::vfloat16> {};
10364 : OIIO::pvt::index_formatter<OIIO::simd::vint4> {};
10366 : OIIO::pvt::index_formatter<OIIO::simd::vint8> {};
10368 : OIIO::pvt::index_formatter<OIIO::simd::vint16> {};
10370 : OIIO::pvt::array_formatter<OIIO::simd::matrix44, float, 16> {};
10374 #undef SIMD_CONSTRUCT
10375 #undef SIMD_CONSTRUCT_PAD
10377 #undef SIMD_RETURN_REDUCE
friend const vfloat8 & operator/=(vfloat8 &a, const vfloat8 &b)
friend vfloat8 operator+(const vfloat8 &a, const vfloat8 &b)
static const char * type_name()
static const char * type_name()
static const vbool4 True()
Return a vbool4 that is 'true' for all values.
vint16()
Default constructor (contents undefined)
OIIO_FORCEINLINE matrix44(M44fParam M)
Construct from an OIIO::M44fParam (including an Imath::M44f)
friend vint4 operator|(const vint4 &a, const vint4 &b)
vfloat16(float a)
Construct from a single value (store it in all slots)
friend const vint8 & operator%=(vint8 &a, const vint8 &b)
static const vint4 NegOne()
Return an vint4 with all components set to -1 (aka 0xffffffff)
friend vbool8 operator!(const vbool8 &a)
Logical/bitwise operators, component-by-component.
friend const vint4 & operator%=(vint4 &a, const vint4 &b)
int operator[](int i) const
Component access (get)
friend const vfloat16 & operator*=(vfloat16 &a, const vfloat16 &b)
vint4 max(const vint4 &a, const vint4 &b)
friend vfloat3 operator*(const vfloat3 &a, const vfloat3 &b)
static const char * type_name()
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
vfloat4(float a)
Construct from a single value (store it in all slots)
static vbool4 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool4.
bool none(const vbool4 &v)
void clear()
Set all components to 0.0.
vbool4(bool a)
Construct from a single value (store it in all slots)
friend const vbool16 & operator|=(vbool16 &a, const vbool16 &b)
vfloat3 operator-() const
friend vbool8 operator!=(const vint8 &a, const vint8 &b)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
const vfloat3 & operator-=(const vfloat3 &a)
void store(float *values) const
static const vfloat8 One()
Return a vfloat8 with all components set to 1.0.
friend vfloat4 operator*(const vfloat4 &a, const vfloat4 &b)
vint16 shuffle4(const vint16 &a)
Shuffle groups of 4.
friend vbool16 operator<=(const vfloat16 &a, const vfloat16 &b)
OIIO_FORCEINLINE const vint4 & operator/=(vint4 &a, const vint4 &b)
friend vint8 operator~(const vint8 &a)
void store_mask(int mask, value_t *values) const
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)baseptr + vindex[i]*scale.
friend vfloat16 operator-(const vfloat16 &a)
vfloat8(float a)
Construct from a single value (store it in all slots)
OIIO_FORCEINLINE vbool4 shuffle(const vbool4 &a)
friend vfloat3 operator+(const vfloat3 &a, const vfloat3 &b)
friend vbool16 operator!=(const vint16 &a, const vint16 &b)
friend const vbool8 & operator|=(vbool8 &a, const vbool8 &b)
static const char * name()
SYS_API float expf(float x)
Matrix44< float > M44f
4x4 matrix of float
friend vfloat16 operator%(const vfloat16 &a, const vfloat16 &b)
friend std::ostream & operator<<(std::ostream &cout, const vfloat4 &val)
Stream output.
friend vint8 operator/(const vint8 &a, const vint8 &b)
vfloat4 bitcast_to_float(const vint4 &x)
static const vint4 Giota()
Return an vint4 with "geometric" iota: (1, 2, 4, 8).
OIIO_FORCEINLINE const vint4 & operator>>=(vint4 &a, const unsigned int bits)
friend const vint8 & operator>>=(vint8 &a, unsigned int bits)
void load_mask(const vbool_t &mask, const value_t *values)
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
friend vint4 operator&(const vint4 &a, const vint4 &b)
int operator[](int i) const
Component access (get)
friend vbool8 operator>(const vint8 &a, const vint8 &b)
static const vfloat3 Zero()
Return a vfloat3 with all components set to 0.0.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
Vec4< float > V4f
Vec4 of float.
static const char * type_name()
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
void store_mask(int mask, value_t *values) const
vint4 srl(const vint4 &val, const unsigned int bits)
OIIO_FORCEINLINE vint4 operator%(const vint4 &a, const vint4 &b)
void store(V &vec) const
Store into a generic subscripted or xyz 3-vector, including Imath::V3f.
vint4 bitcast_to_int4(const vfloat4 &x)
friend const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
friend const vint4 & operator>>=(vint4 &a, unsigned int bits)
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)baseptr + vindex[i]*scale.
friend vint8 operator|(const vint8 &a, const vint8 &b)
vfloat4 vfloat_t
SIMD int type.
friend vfloat16 operator*(const vfloat16 &a, const vfloat16 &b)
int operator[](int i) const
Component access (get)
friend const vfloat8 & operator+=(vfloat8 &a, const vfloat8 &b)
int bitmask() const
Extract the bitmask.
vfloat16 vfloat_t
float type of the same length
vfloat3(const float *f)
Construct from a pointer to 3 values.
friend vfloat4 operator/(const vfloat4 &a, const vfloat4 &b)
vfloat16(const unsigned short *vals)
Construct from a pointer to unsigned short values.
int value_t
Underlying equivalent scalar value type.
OIIO_FORCEINLINE const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
void clear()
Set all components to 0.
GLsizei const GLfloat * value
vint8()
Default constructor (contents undefined)
friend vbool16 operator==(const vbool16 &a, const vbool16 &b)
Comparison operators, component by component.
friend vint4 operator/(const vint4 &a, const vint4 &b)
vfloat8(const short *vals)
Construct from a pointer to short values.
vfloat3 transformv(const vfloat3 &V) const
Transform 3-vector V by 4x4 matrix M.
const vfloat4 & operator[](int i) const
Return one row.
int bitmask() const
Extract the bitmask.
void setcomp(int i, bool value)
Component access (set).
void clear()
Set all components to false.
static vbool16 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool16.
void load(float val)
Helper: load a single value into all components.
vfloat4 sqrt(const vfloat4 &a)
GLdouble GLdouble GLdouble z
static const vfloat16 One()
Return a vfloat16 with all components set to 1.0.
static const char * name()
vfloat8()
Default constructor (contents undefined)
vbool8 vbool_t
bool type of the same length
OIIO_FORCEINLINE vbool4 operator!(const vbool4 &a)
const vfloat4 & operator=(float a)
Assign a single value to all components.
const vfloat4 & operator/=(const vfloat4 &a)
friend vint16 operator+(const vint16 &a, const vint16 &b)
friend const vint8 & operator*=(vint8 &a, const vint8 &b)
friend const vint4 & operator*=(vint4 &a, const vint4 &b)
friend vint8 operator<<(const vint8 &a, unsigned int bits)
static const char * name()
vfloat8(const float *f)
Construct from a pointer to 8 values.
vfloat4(const vfloat4 &other)
Copy construct from another vfloat4.
friend vbool8 operator!=(const vfloat8 &a, const vfloat8 &b)
friend vbool8 operator>=(const vfloat8 &a, const vfloat8 &b)
void clear()
Set all components to false.
friend const vint16 & operator&=(vint16 &a, const vint16 &b)
vfloat4 rsqrt_fast(const vfloat4 &a)
Fast, approximate 1/sqrt.
GLboolean GLboolean GLboolean GLboolean a
vint16(const simd_t &m)
Construct from the underlying SIMD type.
friend const vbool16 & operator&=(vbool16 &a, const vbool16 &b)
static const vfloat4 Zero()
Return a vfloat4 with all components set to 0.0.
vbool4(const simd_t &m)
Construct from the underlying SIMD type.
float operator[](int i) const
Component access (get)
vfloat16 min(const vfloat16 &a, const vfloat16 &b)
Per-element min.
float value_t
Underlying equivalent scalar value type.
vfloat3 transformvT(const vfloat3 &V) const
Transform 3-vector V by the transpose of 4x4 matrix M.
int operator[](int i) const
Component access (get)
void load(float val)
Helper: load a single value into all components.
#define SIMD_CONSTRUCT_PAD(x)
friend const vint16 & operator<<=(vint16 &a, unsigned int bits)
OIIO_FORCEINLINE const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
friend vint16 operator*(const vint16 &a, const vint16 &b)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 max(const vfloat16 &a, const vfloat16 &b)
Per-element max.
vfloat3()
Default constructor (contents undefined)
bool value_t
Underlying equivalent scalar value type.
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
static const vint8 Zero()
Return an vint8 with all components set to 0.
vbool8 vbool_t
SIMD bool type.
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)baseptr + vindex[i]*scale.
vint16 vint_t
SIMD int type.
vfloat4(float a, float b, float c, float d=0.0f)
Construct from 3 or 4 values.
#define OIIO_SIMD_UINT4_CONST(name, val)
static const vint8 NegOne()
Return an vint8 with all components set to -1 (aka 0xffffffff)
static const char * type_name()
bool reduce_or(const vbool4 &v)
friend vfloat8 operator-(const vfloat8 &a)
friend vint16 operator/(const vint16 &a, const vint16 &b)
**But if you need a result
static const vint16 Giota()
Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
vfloat4()
Default constructor (contents undefined)
void load_mask(int mask, const value_t *values)
GLfloat GLfloat GLfloat v2
friend const vint16 & operator^=(vint16 &a, const vint16 &b)
const vfloat8 & operator=(float a)
Assign a single value to all components.
Integer 8-vector, accelerated by SIMD instructions when available.
void clear()
Set all components to 0.0.
vfloat4(const char *vals)
Construct from a pointer to 4 char values.
static const char * name()
friend const vint4 & operator<<=(vint4 &a, unsigned int bits)
GLfloat GLfloat GLfloat GLfloat v3
static constexpr size_t size() noexcept
OIIO_FORCEINLINE vbool4 insert(const vbool4 &a, bool val)
Helper: substitute val for a[i].
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
vfloat3 hdiv(const vfloat4 &a)
Homogeneous divide to turn a vfloat4 into a vfloat3.
vfloat3 transformv(const matrix44 &M, const vfloat3 &V)
Transform 3-vector V by 4x4 matrix M.
const vbool8 & operator=(bool a)
Assign one value to all components.
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)baseptr + vindex[i]*scale.
vbool8 lo() const
Extract the lower precision vbool8.
void load_bitmask(int a)
Helper: load all components from a bitmask in an int.
friend vbool4 operator!=(const vbool4 &a, const vbool4 &b)
vint8(const simd_t &m)
Construct from the underlying SIMD type.
void clear()
Set all components to 0.
friend vbool4 operator<(const vint4 &a, const vint4 &b)
vbool8(bool a)
Construct from a single value (store it in all slots)
const value_t * data() const
Return a pointer to the underlying scalar type.
friend const vint4 & operator^=(vint4 &a, const vint4 &b)
OIIO_FORCEINLINE vbool4 operator>=(const vint4 &a, const vint4 &b)
int operator[](int i) const
Component access (get)
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)baseptr + vindex[i]*scale.
friend vfloat16 operator/(const vfloat16 &a, const vfloat16 &b)
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)baseptr + vindex[i]*scale.
OIIO_FORCEINLINE vbool4 operator~(const vbool4 &a)
float dot3(const vfloat4 &a, const vfloat4 &b)
Return the float 3-component dot (inner) product of a and b.
value_t m_val[paddedelements]
void load_mask(int mask, const value_t *values)
friend const vfloat16 & operator+=(vfloat16 &a, const vfloat16 &b)
vint8 vint_t
SIMD int type.
friend vbool4 operator|(const vbool4 &a, const vbool4 &b)
friend const vint16 & operator%=(vint16 &a, const vint16 &b)
const matrix44 & operator=(const matrix44 &m)
Assignment.
simd_bool_t< 8 >::type simd_t
the native SIMD type used
float value_t
Underlying equivalent scalar value type.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
static const char * type_name()
Template giving a printable name for each type.
vint4 abs(const vint4 &a)
vfloat4 safe_div(const vfloat4 &a, const vfloat4 &b)
void store(int *values) const
Store the values into memory.
friend vbool8 operator<=(const vfloat8 &a, const vfloat8 &b)
friend std::ostream & operator<<(std::ostream &cout, const vfloat8 &val)
Stream output.
vbool16(const simd_t &m)
Construct from the underlying SIMD type.
vfloat8 lo() const
Extract the lower precision vfloat8.
vbool4(const vbool4 &other)
Copy construct from another vbool4.
value_t m_vals[rows][cols]
static const vbool4 False()
Return a vbool4 that is 'false' for all values.
vbool4 lo() const
Extract the lower precision vbool4.
static const vbool16 False()
Return a vbool16 that is 'false' for all values.
void setcomp(int i, bool value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)baseptr + vindex[i]*scale.
friend vint4 operator~(const vint4 &a)
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
friend vbool4 operator==(const vfloat4 &a, const vfloat4 &b)
OIIO_FORCEINLINE bool extract(const vbool4 &a)
vfloat3 normalized() const
Return a normalized version of the vector.
vfloat4 floor(const vfloat4 &a)
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
OIIO_FORCEINLINE matrix44(const float *f)
Construct from a float array.
matrix44 inverse() const
Return the inverse of the matrix.
static const vint4 One()
Return an vint4 with all components set to 1.
vint4 blend(const vint4 &a, const vint4 &b, const vbool4 &mask)
int value_t
Underlying equivalent scalar value type.
vbool16(const vbool16 &other)
Copy construct from another vbool16.
vfloat8(const vfloat8 &other)
Copy construct from another vfloat8.
vint4 blend0not(const vint4 &a, const vbool4 &mask)
vint4 vint_t
SIMD int type.
GA_API const UT_StringHolder scale
friend const vbool16 & operator^=(vbool16 &a, const vbool16 &b)
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)baseptr + vindex[i]*scale.
vbool4 hi() const
Extract the higher precision vbool4.
OIIO_FORCEINLINE vbool4 operator>(const vint4 &a, const vint4 &b)
friend vbool16 operator!=(const vfloat16 &a, const vfloat16 &b)
vfloat16(const float *f)
Construct from a pointer to 16 values.
vbool8(const simd_t &m)
Construct from the underlying SIMD type.
bool operator!=(const matrix44 &m) const
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
vfloat8 hi() const
Extract the higher precision vfloat8.
OIIO_FORCEINLINE vbool4 operator<=(const vint4 &a, const vint4 &b)
friend const vint8 & operator+=(vint8 &a, const vint8 &b)
friend vfloat8 operator/(const vfloat8 &a, const vfloat8 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
vfloat4 vdot3(const vfloat4 &a, const vfloat4 &b)
friend const vint8 & operator-=(vint8 &a, const vint8 &b)
float value_t
Underlying equivalent scalar value type.
vint8(const vint8 &other)
Copy construct from another vint8.
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Integer 4-vector, accelerated by SIMD instructions when available.
#define SIMD_RETURN_REDUCE(T, init, op)
friend vbool8 operator<(const vint8 &a, const vint8 &b)
vint8 vint_t
int type of the same length
OIIO_FORCEINLINE vbool4 operator==(const vbool4 &a, const vbool4 &b)
friend vfloat3 operator/(const vfloat3 &a, const vfloat3 &b)
OIIO_DEPRECATED("use bitcast_to_int() (1.8)") inline vint4 bitcast_to_int4(const vbool4 &x)
vfloat4 lo() const
Extract the lower precision vfloat4.
vint16(const vint16 &other)
Copy construct from another vint16.
simd_raw_t< float, 8 >::type simd_t
the native SIMD type used
int value_t
Underlying equivalent scalar value type.
float length() const
Length of the vector.
bool any(const vbool4 &v)
const vfloat4 & operator-=(const vfloat4 &a)
void load(bool a)
Helper: load a single value into all components.
bool reduce_and(const vbool4 &v)
Logical reduction across all components.
vint16 vint_t
int type of the same length
vfloat8(const unsigned short *vals)
Construct from a pointer to unsigned short values.
friend vbool16 operator<=(const vint16 &a, const vint16 &b)
vfloat3 transformp(const matrix44 &M, const vfloat3 &V)
Transform 3-point V by 4x4 matrix M.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend const vint4 & operator+=(vint4 &a, const vint4 &b)
vfloat8 vfloat_t
float type of the same length
OIIO_FORCEINLINE const vint4 & operator+=(vint4 &a, const vint4 &b)
void transpose(vint4 &a, vint4 &b, vint4 &c, vint4 &d)
vint4 select(const vbool4 &mask, const vint4 &a, const vint4 &b)
void load(bool a)
Helper: load a single value into all components.
vfloat4(const short *vals)
Construct from a pointer to 4 short values.
static vbool8 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool8.
vint4 rotl(const vint4 &x, const int s)
Circular bit rotate by s bits, for N values at once.
void load_pairs(const float *lo, const float *hi)
friend vbool16 operator>=(const vint16 &a, const vint16 &b)
static constexpr size_t size() noexcept
const vfloat4 & operator*=(const vfloat4 &a)
vfloat4 msub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
vbool16 vbool_t
bool type of the same length
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
friend vint8 operator&(const vint8 &a, const vint8 &b)
friend vbool16 operator^(const vbool16 &a, const vbool16 &b)
friend vbool8 operator<(const vfloat8 &a, const vfloat8 &b)
void store_mask(int mask, value_t *values) const
float operator[](int i) const
Component access (get)
vfloat3 normalized_fast() const
Return a fast, approximate normalized version of the vector.
void load_mask(int mask, const value_t *values)
const value_t * data() const
Return a pointer to the underlying scalar type.
friend vbool4 operator<(const vfloat4 &a, const vfloat4 &b)
static const char * type_name()
void setcomp(int i, int value)
Component access (set).
OIIO_FORCEINLINE T exp(const T &v)
void load_mask(int mask, const value_t *values)
static const vfloat3 One()
Return a vfloat3 with all components set to 1.0.
vfloat16(const vfloat16 &other)
Copy construct from another vfloat16.
void load(int a)
Helper: load a single int into all components.
bool set_denorms_zero_mode(bool on)
static const char * name()
vbool16()
Default constructor (contents undefined)
simd_bool_t< 16 >::type simd_t
the native SIMD type used
friend vint16 operator<<(const vint16 &a, unsigned int bits)
void store_mask(int mask, value_t *values) const
vbool4(bool a, bool b, bool c, bool d)
Construct from 4 bool values.
OIIO_FORCEINLINE std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
static const vfloat16 Zero()
Return a vfloat16 with all components set to 0.0.
vfloat4 operator-() const
value_t m_val[paddedelements]
vfloat4(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
friend vint4 operator%(const vint4 &a, const vint4 &b)
friend vbool8 operator!=(const vbool8 &a, const vbool8 &b)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 vfloat_t
SIMD float type.
friend const vfloat16 & operator/=(vfloat16 &a, const vfloat16 &b)
static const vbool16 True()
Return a vbool16 that is 'true' for all values.
const vbool16 & operator=(bool a)
Assign one value to all components.
vfloat4 nmsub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
vbool4()
Default constructor (contents undefined)
static const vbool8 True()
Return a vbool8 that is 'true' for all values.
static const char * name()
friend vbool8 operator==(const vbool8 &a, const vbool8 &b)
Comparison operators, component by component.
vbool8(const vbool8 &other)
Copy construct from another vbool8.
friend vint4 operator^(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
vint4 blend0(const vint4 &a, const vbool4 &mask)
vfloat3(const short *vals)
Construct from a pointer to 4 short values.
void store(float *values) const
friend vint8 operator*(const vint8 &a, const vint8 &b)
friend vfloat16 operator+(const vfloat16 &a, const vfloat16 &b)
vfloat4(const float *f)
Construct from a pointer to 4 values.
const vint4 & operator=(int a)
Assign one value to all components.
static constexpr size_t size() noexcept
void store(bool *values) const
Helper: store the values into memory as bools.
static const vint4 Zero()
Return an vint4 with all components set to 0.
friend vbool16 operator<(const vfloat16 &a, const vfloat16 &b)
friend const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
vfloat3(float a)
Construct from a single value (store it in all slots)
friend vint8 operator-(const vint8 &a)
static const char * name()
friend const vint16 & operator+=(vint16 &a, const vint16 &b)
bool set_flush_zero_mode(bool on)
friend vint8 operator^(const vint8 &a, const vint8 &b)
void clear()
Set all components to 0.0.
SYS_API float logf(float x)
float length2() const
Square of the length of the vector.
friend vbool4 operator!=(const vfloat4 &a, const vfloat4 &b)
friend vbool16 operator<(const vint16 &a, const vint16 &b)
vbool4(int a, int b, int c, int d)
Construct from 4 int values.
vfloat3(float a, float b, float c)
Construct from 3 values.
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)baseptr + vindex[i]*scale.
void store(bool *values) const
Helper: store the values into memory as bools.
friend std::ostream & operator<<(std::ostream &cout, const matrix44 &M)
Stream output.
matrix44 transposed() const
Return the transposed matrix.
OIIO_FORCEINLINE vint4 operator>>(const vint4 &a, const unsigned int bits)
vfloat4 round(const vfloat4 &a)
GLboolean GLboolean GLboolean b
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
static const vfloat4 One()
Return a vfloat4 with all components set to 1.0.
void store_mask(int mask, value_t *values) const
#define SIMD_RETURN(T, x)
friend vbool16 operator>(const vint16 &a, const vint16 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
const vint16 & operator=(int a)
Assign one value to all components.
vfloat8(const char *vals)
Construct from a pointer to char values.
bool get_denorms_zero_mode()
friend vbool8 operator~(const vbool8 &a)
friend vbool4 operator!=(const vint4 &a, const vint4 &b)
vfloat3 transformvT(const matrix44 &M, const vfloat3 &V)
const vfloat3 & operator/=(const vfloat3 &a)
friend vbool4 operator>(const vint4 &a, const vint4 &b)
friend const vint4 & operator-=(vint4 &a, const vint4 &b)
bool get_flush_zero_mode()
static constexpr size_t size() noexcept
float operator[](int i) const
Component access (get)
void load(float val)
Helper: load a single value into all components.
static const vfloat8 Iota(float start=0.0f, float step=1.0f)
friend const vint4 & operator&=(vint4 &a, const vint4 &b)
simd_raw_t< float, 4 >::type simd_t
the native SIMD type used
vbool8()
Default constructor (contents undefined)
friend vbool16 operator|(const vbool16 &a, const vbool16 &b)
vfloat16()
Default constructor (contents undefined)
vint8 lo() const
Extract the lower precision vint8.
vfloat4 vdot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b in every component.
static const vfloat8 Zero()
Return a vfloat8 with all components set to 0.0.
friend vint16 operator|(const vint16 &a, const vint16 &b)
vfloat4 rcp_fast(const vfloat4 &a)
Fast, approximate 1/a.
friend vint4 operator+(const vint4 &a, const vint4 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
friend vint16 operator>>(const vint16 &a, unsigned int bits)
Integer 16-vector, accelerated by SIMD instructions when available.
vfloat16(const unsigned char *vals)
Construct from a pointer to unsigned char values.
static const char * type_name()
friend vbool4 operator^(const vbool4 &a, const vbool4 &b)
const vfloat3 & operator*=(const vfloat3 &a)
static const char * type_name()
OIIO_FORCEINLINE vint4 operator/(const vint4 &a, const vint4 &b)
vfloat16(const simd_t &m)
Construct from the underlying SIMD type.
OIIO_FORCEINLINE vbool4 operator^(const vbool4 &a, const vbool4 &b)
static const vint16 One()
Return an vint16 with all components set to 1.
vfloat16(const char *vals)
Construct from a pointer to char values.
vbool16 vbool_t
SIMD bool type.
vint4 hi() const
Extract the higher precision vint4.
vfloat3 transformp(const vfloat3 &V) const
Transform 3-point V by 4x4 matrix M.
OIIO_FORCEINLINE matrix44(const float *a, const float *b, const float *c, const float *d)
Construct from 4 float[4] rows.
friend vbool8 operator>(const vfloat8 &a, const vfloat8 &b)
OIIO_FORCEINLINE vbool4 operator|(const vbool4 &a, const vbool4 &b)
Vec3< float > V3f
Vec3 of float.
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
GLfloat GLfloat GLfloat GLfloat h
void store_mask(const vbool_t &mask, value_t *values) const
static const vfloat3 Iota(float start=0.0f, float step=1.0f)
vint4 safe_mod(const vint4 &a, const vint4 &b)
friend const vint8 & operator/=(vint8 &a, const vint8 &b)
static const vfloat4 Iota(float start=0.0f, float step=1.0f)
friend const vfloat8 & operator-=(vfloat8 &a, const vfloat8 &b)
IMATH_NAMESPACE::V2f IMATH_NAMESPACE::Box2i std::string this attribute is obsolete as of OpenEXR v3 float
void load(int a)
Helper: load a single int into all components.
void store(bool *values) const
Helper: store the values into memory as bools.
vint4 vreduce_add(const vint4 &v)
The sum of all components, returned in all components.
OIIO_FORCEINLINE const vint4 & operator<<=(vint4 &a, const unsigned int bits)
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
float value_t
Underlying equivalent scalar value type.
OIIO_FORCEINLINE vbool4 operator<(const vint4 &a, const vint4 &b)
friend vbool8 operator|(const vbool8 &a, const vbool8 &b)
friend const vint16 & operator*=(vint16 &a, const vint16 &b)
OIIO_FORCEINLINE vbool4 operator!=(const vbool4 &a, const vbool4 &b)
const T * data() const noexcept
Return a pointer to the contiguous values comprising the matrix.
vfloat4 ceil(const vfloat4 &a)
friend vbool16 operator==(const vfloat16 &a, const vfloat16 &b)
vbool4 vbool_t
SIMD bool type.
void store(int *values) const
Store the values into memory.
void clear()
Set all components to 0.
vfloat4 vfloat_t
float type of the same length
GLenum GLsizei GLsizei GLint * values
friend vint16 operator%(const vint16 &a, const vint16 &b)
OIIO_FORCEINLINE vbool4 operator&(const vbool4 &a, const vbool4 &b)
#define SIMD_CONSTRUCT(x)
vfloat4(const simd_t &m)
Construct from the underlying SIMD type.
OIIO_FORCEINLINE matrix44()
friend const vint16 & operator-=(vint16 &a, const vint16 &b)
vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
Construct from 8 values.
static const vint4 Iota(int start=0, int step=1)
void load_mask(int mask, const value_t *values)
friend vfloat4 operator+(const vfloat4 &a, const vfloat4 &b)
void clear()
Set all components to false.
int m_val[paddedelements]
friend vbool8 operator&(const vbool8 &a, const vbool8 &b)
static const vint16 NegOne()
Return an vint16 with all components set to -1 (aka 0xffffffff)
vfloat4 bitcast_to_float4(const vint4 &x)
vint4 lo() const
Extract the lower precision vint4.
friend vbool8 operator^(const vbool8 &a, const vbool8 &b)
friend vint4 operator-(const vint4 &a)
vfloat4 nmadd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
vfloat3(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
void load_mask(int mask, const value_t *values)
friend vbool16 operator~(const vbool16 &a)
friend const vint16 & operator/=(vint16 &a, const vint16 &b)
vint4(const simd_t &m)
Construct from the underlying SIMD type.
friend vbool8 operator>=(const vint8 &a, const vint8 &b)
vfloat8 vfloat_t
SIMD float type.
friend vint16 operator^(const vint16 &a, const vint16 &b)
friend vint8 operator>>(const vint8 &a, unsigned int bits)
int m_val[paddedelements]
value_t m_val[paddedelements]
vint4()
Default constructor (contents undefined)
const vbool4 & operator=(bool a)
Assign one value to all components.
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vint4 operator<<(const vint4 &a, unsigned int bits)
vfloat4(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
GA_API const UT_StringHolder N
friend vint16 operator&(const vint16 &a, const vint16 &b)
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)baseptr + vindex[i]*scale.
const vint8 & operator=(int a)
Assign one value to all components.
void store_mask(int mask, value_t *values) const
static const vbool8 False()
Return a vbool8 that is 'false' for all values.
friend vint16 operator-(const vint16 &a)
friend vbool4 operator&(const vbool4 &a, const vbool4 &b)
friend vbool16 operator&(const vbool16 &a, const vbool16 &b)
vbool4 vbool_t
bool type of the same length
friend vfloat8 operator%(const vfloat8 &a, const vfloat8 &b)
vbool8 hi() const
Extract the higher precision vbool8.
friend vint8 operator+(const vint8 &a, const vint8 &b)
friend const vbool8 & operator^=(vbool8 &a, const vbool8 &b)
static const char * name()
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)baseptr + vindex[i]*scale.
friend const vfloat16 & operator-=(vfloat16 &a, const vfloat16 &b)
friend vbool16 operator!=(const vbool16 &a, const vbool16 &b)
const vfloat16 & operator=(float a)
Assign a single value to all components.
vint4 rotl32(const vint4 &x, const unsigned int k)
friend const vbool8 & operator&=(vbool8 &a, const vbool8 &b)
vint4 floori(const vfloat4 &a)
OIIO_FORCEINLINE const vint4 & operator-=(vint4 &a, const vint4 &b)
friend const vint4 & operator|=(vint4 &a, const vint4 &b)
vint4(const vint4 &other)
Copy construct from another vint4.
friend vbool16 operator>=(const vfloat16 &a, const vfloat16 &b)
friend const vint16 & operator>>=(vint16 &a, unsigned int bits)
void setcomp(int i, int value)
Component access (set).
static const vint8 Iota(int start=0, int step=1)
friend vint4 operator>>(const vint4 &a, unsigned int bits)
OIIO_FORCEINLINE vint4 operator*(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
GLubyte GLubyte GLubyte GLubyte w
friend vbool8 operator==(const vfloat8 &a, const vfloat8 &b)
static const vint8 Giota()
Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER IMATH_HOSTDEVICE constexpr T abs(T a) IMATH_NOEXCEPT
friend const vfloat8 & operator*=(vfloat8 &a, const vfloat8 &b)
friend vfloat4 operator*(const vfloat4 &V, const matrix44 &M)
friend const vint8 & operator^=(vint8 &a, const vint8 &b)
vfloat3(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
const vfloat4 & operator+=(const vfloat4 &a)
friend vint4 operator*(const vint4 &a, const vint4 &b)
void load(bool a)
Helper: load a single value into all components.
friend const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
static const char * name()
friend vbool4 operator~(const vbool4 &a)
const vfloat3 & operator+=(const vfloat3 &a)
void store_mask(const vbool_t &mask, value_t *values) const
friend vbool4 operator<=(const vint4 &a, const vint4 &b)
#define OIIO_NAMESPACE_END
friend const vint16 & operator|=(vint16 &a, const vint16 &b)
void load(float val)
Helper: load a single value into all components.
vint4 min(const vint4 &a, const vint4 &b)
friend const vint8 & operator&=(vint8 &a, const vint8 &b)
friend std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
Stream output.
static const char * name()
bool value_t
Underlying equivalent scalar value type.
bool value_t
Underlying equivalent scalar value type.
vfloat4 hi() const
Extract the higher precision vfloat4.
void load_mask(const vbool_t &mask, const value_t *values)
OIIO_FORCEINLINE T log(const T &v)
friend vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
friend vfloat8 operator*(const vfloat8 &a, const vfloat8 &b)
friend vbool8 operator<=(const vint8 &a, const vint8 &b)
OIIO_FORCEINLINE vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
vfloat4 rsqrt(const vfloat4 &a)
Fully accurate 1/sqrt.
vbool16(bool a)
Construct from a single value (store it in all slots)
friend vbool4 operator<=(const vfloat4 &a, const vfloat4 &b)
static const vint8 One()
Return an vint8 with all components set to 1.
void load(int a)
Helper: load a single int into all components.
vint4 vint_t
int type of the same length
vfloat16(const short *vals)
Construct from a pointer to short values.
OIIO_FORCEINLINE matrix44(float f00, float f01, float f02, float f03, float f10, float f11, float f12, float f13, float f20, float f21, float f22, float f23, float f30, float f31, float f32, float f33)
Construct from 16 floats.
vint4 rint(const vfloat4 &a)
void setcomp(int i, float value)
Component access (set).
simd_raw_t< float, 16 >::type simd_t
the native SIMD type used
vfloat4 sign(const vfloat4 &a)
1.0 when value >= 0, -1 when negative
static const vint16 Zero()
Return an vint16 with all components set to 0.
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vbool16 operator>(const vfloat16 &a, const vfloat16 &b)
OIIO_FORCEINLINE const vint4 & operator*=(vint4 &a, const vint4 &b)
bool all(const vbool4 &v)
void store(float *values) const
static constexpr int elements
friend const vint4 & operator/=(vint4 &a, const vint4 &b)
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
vfloat8(const simd_t &m)
Construct from the underlying SIMD type.
vint4 andnot(const vint4 &a, const vint4 &b)
andnot(a,b) returns ((~a) & b)
vfloat3(const char *vals)
Construct from a pointer to 4 char values.
void store(int *values) const
Store the values into memory.
OIIO_FORCEINLINE matrix44(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d)
Construct from 4 vfloat4 rows.
const float * data() const
void normalize()
Normalize in place.
static const vint16 Iota(int start=0, int step=1)
friend vbool4 operator>=(const vint4 &a, const vint4 &b)
bool operator==(const matrix44 &m) const
vfloat4 xyz1() const
Return xyz components, plus 1 for w.
static const char * type_name()
const vfloat3 & operator=(float a)
Assign a single value to all components.
Is a type T one of our SIMD-based types?
static const vfloat16 Iota(float start=0.0f, float step=1.0f)
OIIO_FORCEINLINE vint4 operator-(const vint4 &a)
friend vbool4 operator==(const vint4 &a, const vint4 &b)
friend std::ostream & operator<<(std::ostream &cout, const vfloat16 &val)
Stream output.
friend std::ostream & operator<<(std::ostream &cout, const vbool8 &a)
Stream output.
friend std::ostream & operator<<(std::ostream &cout, const vbool16 &a)
Stream output.
vfloat8(const unsigned char *vals)
Construct from a pointer to unsigned char values.
friend vbool16 operator==(const vint16 &a, const vint16 &b)
friend vint16 operator~(const vint16 &a)
int operator[](int i) const
Component access (get)
vint4 ifloor(const vfloat4 &a)
(int)floor
friend vbool4 operator==(const vbool4 &a, const vbool4 &b)
Comparison operators, component by component.
float dot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b.
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)baseptr + vindex[i]*scale.
friend vbool8 operator==(const vint8 &a, const vint8 &b)
OIIO_FORCEINLINE vint4 operator+(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE const vint4 & operator%=(vint4 &a, const vint4 &b)
friend vbool4 operator>(const vfloat4 &a, const vfloat4 &b)
vint4 AxBxCxDx(const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d)
vint8 hi() const
Extract the higher precision vint8.
OIIO_FORCEINLINE matrix44(const matrix44 &M)
Copy constructor.
friend vint8 operator%(const vint8 &a, const vint8 &b)
void setcomp(int i, bool value)
Component access (set).
vfloat4 xyz0() const
Return xyz components, plus 0 for w.
simd_bool_t< 4 >::type simd_t
the native SIMD type used
void setcomp(int i, int value)
Component access (set).
int reduce_add(const vint4 &v)
#define OIIO_NAMESPACE_BEGIN
friend std::ostream & operator<<(std::ostream &cout, const vfloat3 &val)
Stream output.
void store(float *values) const
friend const vint8 & operator<<=(vint8 &a, unsigned int bits)
static const char * type_name()
friend vbool4 operator>=(const vfloat4 &a, const vfloat4 &b)
friend const vint8 & operator|=(vint8 &a, const vint8 &b)