68 #if defined(__CUDA_ARCH__)
72 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
73 # include <x86intrin.h>
74 #elif defined(__GNUC__) && defined(__ARM_NEON__)
75 # include <arm_neon.h>
81 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
86 #if defined(__CUDA_ARCH__) && !defined(OIIO_NO_SSE)
90 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
91 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
92 # define OIIO_SIMD_SSE 4
98 # elif defined(__SSSE3__)
99 # define OIIO_SIMD_SSE 3
108 # define OIIO_SIMD_SSE 2
111 # define OIIO_SIMD_MAX_SIZE_BYTES 16
112 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
113 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
115 # define OIIO_SIMD_SSE 0
118 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
120 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
121 # define OIIO_SIMD_AVX 2
123 # define OIIO_SIMD_AVX 1
127 # undef OIIO_SIMD_MAX_SIZE_BYTES
128 # define OIIO_SIMD_MAX_SIZE_BYTES 32
129 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
130 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
131 # if defined(__AVX512F__)
132 # undef OIIO_SIMD_AVX
133 # define OIIO_SIMD_AVX 512
134 # undef OIIO_SIMD_MAX_SIZE_BYTES
135 # define OIIO_SIMD_MAX_SIZE_BYTES 64
137 # define OIIO_SIMD 16
138 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
139 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
140 # define OIIO_AVX512F_ENABLED 1
142 # if defined(__AVX512DQ__)
143 # define OIIO_AVX512DQ_ENABLED 1
145 # define OIIO_AVX512DQ_ENABLED 0
147 # if defined(__AVX512PF__)
148 # define OIIO_AVX512PF_ENABLED 1
150 # define OIIO_AVX512PF_ENABLED 0
152 # if defined(__AVX512ER__)
153 # define OIIO_AVX512ER_ENABLED 1
155 # define OIIO_AVX512ER_ENABLED 0
157 # if defined(__AVX512CD__)
158 # define OIIO_AVX512CD_ENABLED 1
160 # define OIIO_AVX512CD_ENABLED 0
162 # if defined(__AVX512BW__)
163 # define OIIO_AVX512BW_ENABLED 1
165 # define OIIO_AVX512BW_ENABLED 0
167 # if defined(__AVX512VL__)
168 # define OIIO_AVX512VL_ENABLED 1
170 # define OIIO_AVX512VL_ENABLED 0
173 # define OIIO_SIMD_AVX 0
174 # define OIIO_AVX512VL_ENABLED 0
175 # define OIIO_AVX512DQ_ENABLED 0
176 # define OIIO_AVX512PF_ENABLED 0
177 # define OIIO_AVX512ER_ENABLED 0
178 # define OIIO_AVX512CD_ENABLED 0
179 # define OIIO_AVX512BW_ENABLED 0
183 # define OIIO_FMA_ENABLED 1
185 # define OIIO_FMA_ENABLED 0
187 #if defined(__AVX512IFMA__)
188 # define OIIO_AVX512IFMA_ENABLED 1
190 # define OIIO_AVX512IFMA_ENABLED 0
193 #if defined(__F16C__)
194 # define OIIO_F16C_ENABLED 1
196 # define OIIO_F16C_ENABLED 0
201 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
203 # define OIIO_SIMD_NEON 1
204 # define OIIO_SIMD_MAX_SIZE_BYTES 16
205 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
206 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
208 # define OIIO_SIMD_NEON 0
214 # define OIIO_SIMD4_ALIGN
215 # define OIIO_SIMD_MAX_SIZE_BYTES 16
218 #ifndef OIIO_SIMD8_ALIGN
219 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
221 #ifndef OIIO_SIMD16_ALIGN
222 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
231 #define OIIO_SIMD_HAS_MATRIX4 1
232 #define OIIO_SIMD_HAS_FLOAT8 1
233 #define OIIO_SIMD_HAS_SIMD8 1
234 #define OIIO_SIMD_HAS_SIMD16 1
288 template<>
struct simd_bool_t<4> {
typedef __m128
type; };
292 template<>
struct simd_raw_t<
int,8> {
typedef __m256i
type; };
293 template<>
struct simd_raw_t<
float,8> {
typedef __m256
type; };
294 template<>
struct simd_bool_t<8> {
typedef __m256
type; };
297 #if OIIO_SIMD_AVX >= 512
298 template<>
struct simd_raw_t<
int,16> {
typedef __m512i
type; };
299 template<>
struct simd_raw_t<
float,16> {
typedef __m512
type; };
300 template<>
struct simd_bool_t<16> {
typedef __mmask16
type; };
309 template<>
struct simd_raw_t<
float,4> {
typedef float32x4_t
type; };
310 template<>
struct simd_bool_t<4> {
typedef uint32x4_t
type; };
/// Primary template keyed on an element type `T` and a lane count
/// `elements`. The unspecialized form is an empty struct with no members.
template<typename T, int elements> struct VecType {};
/// Trait that reports a printable name for the type `T`. The unspecialized
/// form shown here always answers "unknown".
template<typename T>
struct SimdTypeName {
    /// Returns a pointer to the string literal "unknown".
    static const char* name()
    {
        return "unknown";
    }
};
// Each macro below declares a 4-element `static const` array called `name`,
// qualified with OIIO_SIMD4_ALIGN. The *_CONST forms copy `val` into all four
// elements; the *_CONST4 forms take the four elements individually, in index
// order. No trailing semicolon is supplied: the use site provides it.
# define OIIO_SIMD_FLOAT4_CONST(name, val)                              \
    static const OIIO_SIMD4_ALIGN float name[4] =                       \
        { (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT4_CONST4(name, v0, v1, v2, v3)                  \
    static const OIIO_SIMD4_ALIGN float name[4] =                       \
        { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_INT4_CONST(name, val)                                \
    static const OIIO_SIMD4_ALIGN int name[4] =                         \
        { (val), (val), (val), (val) }
# define OIIO_SIMD_INT4_CONST4(name, v0, v1, v2, v3)                    \
    static const OIIO_SIMD4_ALIGN int name[4] =                         \
        { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_UINT4_CONST(name, val)                               \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] =                    \
        { (val), (val), (val), (val) }
# define OIIO_SIMD_UINT4_CONST4(name, v0, v1, v2, v3)                   \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] =                    \
        { (v0), (v1), (v2), (v3) }
// Each macro below declares an 8-element `static const` array called `name`,
// qualified with OIIO_SIMD8_ALIGN. The *_CONST forms copy `val` into all
// eight elements; the *_CONST8 forms take the eight elements individually, in
// index order. No trailing semicolon is supplied: the use site provides it.
# define OIIO_SIMD_FLOAT8_CONST(name, val)                              \
    static const OIIO_SIMD8_ALIGN float name[8] = {                     \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT8_CONST8(name, v0, v1, v2, v3, v4, v5, v6, v7)  \
    static const OIIO_SIMD8_ALIGN float name[8] = {                     \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_INT8_CONST(name, val)                                \
    static const OIIO_SIMD8_ALIGN int name[8] = {                       \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_INT8_CONST8(name, v0, v1, v2, v3, v4, v5, v6, v7)    \
    static const OIIO_SIMD8_ALIGN int name[8] = {                       \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_UINT8_CONST(name, val)                               \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = {                  \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_UINT8_CONST8(name, v0, v1, v2, v3, v4, v5, v6, v7)   \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = {                  \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7) }
// Each macro below declares a 16-element `static const` array called `name`,
// qualified with OIIO_SIMD16_ALIGN. The *_CONST forms copy `val` into all
// sixteen elements; the *_CONST16 forms take the sixteen elements
// individually, in index order. No trailing semicolon is supplied: the use
// site provides it.
# define OIIO_SIMD_VFLOAT16_CONST(name, val)                            \
    static const OIIO_SIMD16_ALIGN float name[16] = {                   \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val) }
# define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN float name[16] = {                   \
        (v0),  (v1),  (v2),  (v3),                                      \
        (v4),  (v5),  (v6),  (v7),                                      \
        (v8),  (v9),  (v10), (v11),                                     \
        (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_INT16_CONST(name, val)                               \
    static const OIIO_SIMD16_ALIGN int name[16] = {                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val) }
# define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN int name[16] = {                     \
        (v0),  (v1),  (v2),  (v3),                                      \
        (v4),  (v5),  (v6),  (v7),                                      \
        (v8),  (v9),  (v10), (v11),                                     \
        (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_UINT16_CONST(name, val)                              \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = {                \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val),                                     \
        (val), (val), (val), (val) }
// Declare a 16-element `static const uint32_t` array called `name`, qualified
// with OIIO_SIMD16_ALIGN, whose elements are v0..v15 in index order. The
// initializer must name the v0..v15 parameters; this macro has no `val`
// parameter. No trailing semicolon is supplied: the use site provides it.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = {                \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7),                 \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// Loop helpers that expand textually at the point of use. They rely on names
// already in scope there: `elements` (SIMD_DO, SIMD_CONSTRUCT*),
// `paddedelements` and `m_val` (SIMD_CONSTRUCT*), and `v` (SIMD_RETURN_REDUCE).
// The argument expression may refer to the loop index `i`.
//
// SIMD_CONSTRUCT_PAD, SIMD_RETURN and SIMD_RETURN_REDUCE each expand to more
// than one statement and are not wrapped in a block, so they must not be used
// as the unbraced body of an `if` or loop. The two SIMD_RETURN* forms end in
// `return r`, so they return from the enclosing function.
#define SIMD_DO(x)                                                      \
    for (int i = 0; i < elements; ++i) x
#define SIMD_CONSTRUCT(x)                                               \
    for (int i = 0; i < elements; ++i) m_val[i] = (x)
#define SIMD_CONSTRUCT_PAD(x)                                           \
    for (int i = 0; i < elements; ++i) m_val[i] = (x);                  \
    for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
#define SIMD_RETURN(T,x)                                                \
    T r;                                                                \
    for (int i = 0; i < r.elements; ++i) r[i] = (x);                    \
    return r
#define SIMD_RETURN_REDUCE(T,init,op)                                   \
    T r = init;                                                         \
    for (int i = 0; i < v.elements; ++i) op;                            \
    return r
472 explicit vbool4 (
const bool *
a);
482 load (
bool(a),
bool(b),
bool(c),
bool(d));
531 void load (
bool a,
bool b,
bool c,
bool d);
568 template<
int i0,
int i1,
int i2,
int i3>
586 bool all (
const vbool4&
v);
587 bool any (
const vbool4&
v);
588 bool none (
const vbool4&
v);
/// Scalar overload of all(): for a single bool the answer is the value
/// itself.
inline bool all (bool v)
{
    return v;
}
616 vbool8 (
bool a,
bool b,
bool c,
bool d,
bool e,
bool f,
bool g,
bool h);
622 vbool8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
679 void load (
bool a,
bool b,
bool c,
bool d,
680 bool e,
bool f,
bool g,
bool h);
718 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
736 bool all (
const vbool8&
v);
737 bool any (
const vbool8&
v);
738 bool none (
const vbool8&
v);
766 vbool16 (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
767 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
773 vbool16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
774 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
829 void load (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
830 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
881 bool all (
const vbool16&
v);
882 bool any (
const vbool16&
v);
883 bool none (
const vbool16&
v);
919 vint4 (
const int *vals);
922 explicit vint4 (
const unsigned short *vals);
925 explicit vint4 (
const short *vals);
928 explicit vint4 (
const unsigned char *vals);
931 explicit vint4 (
const char *vals);
998 void load (
int a,
int b,
int c,
int d);
1042 template<
int scale=4>
1045 template<
int scale=4>
1047 template<
int scale=4>
1051 template<
int scale=4>
1054 template<
int scale=4>
1056 template<
int scale=4>
1107 vint4
srl (
const vint4&
val,
const unsigned int bits);
1111 template<
int i0,
int i1,
int i2,
int i3>
1131 vint4
blend (
const vint4&
a,
const vint4&
b,
const vbool4&
mask);
1136 vint4
blend0 (
const vint4&
a,
const vbool4&
mask);
1146 vint4
select (
const vbool4&
mask,
const vint4&
a,
const vint4&
b);
1149 vint4
abs (
const vint4&
a);
1150 vint4
min (
const vint4&
a,
const vint4&
b);
1151 vint4
max (
const vint4&
a,
const vint4&
b);
1154 vint4
rotl (
const vint4&
x,
const int s);
1156 vint4
rotl32 (
const vint4&
x,
const unsigned int k);
1159 vint4
andnot (
const vint4&
a,
const vint4&
b);
1166 void transpose (vint4 &
a, vint4 &
b, vint4 &
c, vint4 &d);
1167 void transpose (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d,
1168 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1170 vint4
AxBxCxDx (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d);
1173 vint4
safe_mod (
const vint4&
a,
const vint4&
b);
1206 vint8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1209 vint8 (
const int *vals);
1212 explicit vint8 (
const unsigned short *vals);
1215 explicit vint8 (
const short *vals);
1218 explicit vint8 (
const unsigned char *vals);
1221 explicit vint8 (
const char *vals);
1297 void load (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1341 template<
int scale=4>
1344 template<
int scale=4>
1346 template<
int scale=4>
1350 template<
int scale=4>
1353 template<
int scale=4>
1355 template<
int scale=4>
1407 vint8
srl (
const vint8&
val,
const unsigned int bits);
1411 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
1434 vint8
blend (
const vint8&
a,
const vint8&
b,
const vbool8&
mask);
1439 vint8
blend0 (
const vint8&
a,
const vbool8&
mask);
1449 vint8
select (
const vbool8&
mask,
const vint8&
a,
const vint8&
b);
1452 vint8
abs (
const vint8&
a);
1453 vint8
min (
const vint8&
a,
const vint8&
b);
1454 vint8
max (
const vint8&
a,
const vint8&
b);
1457 vint8
rotl (
const vint8&
x,
const int s);
1459 vint8
rotl32 (
const vint8&
x,
const unsigned int k);
1462 vint8
andnot (
const vint8&
a,
const vint8&
b);
1470 vint8
safe_mod (
const vint8&
a,
const vint8&
b);
1501 vint16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1502 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1505 vint16 (
const int *vals);
1508 explicit vint16 (
const unsigned short *vals);
1511 explicit vint16 (
const short *vals);
1514 explicit vint16 (
const unsigned char *vals);
1517 explicit vint16 (
const char *vals);
1596 void load (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1597 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1641 template<
int scale=4>
1644 template<
int scale=4>
1646 template<
int scale=4>
1648 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
1652 template<
int scale=4>
1655 template<
int scale=4>
1657 template<
int scale=4>
1659 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
1711 vint16
srl (
const vint16&
val,
const unsigned int bits);
1714 template<
int i0,
int i1,
int i2,
int i3>
1718 template<
int i> vint16
shuffle4 (
const vint16&
a);
1721 template<
int i0,
int i1,
int i2,
int i3>
1725 template<
int i> vint16
shuffle (
const vint16&
a);
1744 vint16
blend (
const vint16&
a,
const vint16&
b,
const vbool16&
mask);
1749 vint16
blend0 (
const vint16&
a,
const vbool16&
mask);
1759 vint16
select (
const vbool16&
mask,
const vint16&
a,
const vint16&
b);
1762 vint16
abs (
const vint16&
a);
1763 vint16
min (
const vint16&
a,
const vint16&
b);
1764 vint16
max (
const vint16&
a,
const vint16&
b);
1767 vint16
rotl (
const vint16&
x,
const int s);
1769 vint16
rotl32 (
const vint16&
x,
const unsigned int k);
1772 vint16
andnot (
const vint16&
a,
const vint16&
b);
1780 vint16
safe_mod (
const vint16&
a,
const vint16&
b);
1859 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Declaration of load() taking four float values `a`, `b`, `c`, `d`; the
/// last one may be omitted, in which case it defaults to 0.0f.
void load (float a, float b, float c, float d=0.0f);
1934 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1944 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1959 template<
int scale=4>
1962 template<
int scale=4>
1964 template<
int scale=4>
1968 template<
int scale=4>
1971 template<
int scale=4>
1973 template<
int scale=4>
2029 template<
int i0,
int i1,
int i2,
int i3>
2049 vfloat4
vdot (
const vfloat4 &
a,
const vfloat4 &
b);
2052 float dot (
const vfloat4 &
a,
const vfloat4 &
b);
2056 vfloat4
vdot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2059 float dot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2063 vfloat4
blend (
const vfloat4&
a,
const vfloat4&
b,
const vbool4&
mask);
2068 vfloat4
blend0 (
const vfloat4&
a,
const vbool4&
mask);
2077 vfloat4
safe_div (
const vfloat4 &
a,
const vfloat4 &
b);
2080 vfloat3
hdiv (
const vfloat4 &
a);
2085 vfloat4
select (
const vbool4&
mask,
const vfloat4&
a,
const vfloat4&
b);
2088 vfloat4
abs (
const vfloat4&
a);
2089 vfloat4
sign (
const vfloat4&
a);
2090 vfloat4
ceil (
const vfloat4&
a);
2091 vfloat4
floor (
const vfloat4&
a);
2092 vint4
ifloor (
const vfloat4&
a);
2103 vfloat4
round (
const vfloat4&
a);
2110 vint4
rint (
const vfloat4&
a);
2113 vfloat4
sqrt (
const vfloat4 &
a);
2114 vfloat4
rsqrt (
const vfloat4 &
a);
2116 vfloat4
min (
const vfloat4&
a,
const vfloat4&
b);
2117 vfloat4
max (
const vfloat4&
a,
const vfloat4&
b);
2122 vfloat4
andnot (
const vfloat4&
a,
const vfloat4&
b);
2125 vfloat4
madd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2126 vfloat4
msub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2127 vfloat4
nmadd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2128 vfloat4
nmsub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2133 void transpose (vfloat4 &
a, vfloat4 &
b, vfloat4 &
c, vfloat4 &d);
2134 void transpose (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c,
const vfloat4& d,
2135 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2138 vfloat4
AxBxCxDx (
const vfloat4&
a,
const vfloat4&
b,
2139 const vfloat4&
c,
const vfloat4& d);
2199 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2238 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2247 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2288 vfloat3
abs (
const vfloat3&
a);
2289 vfloat3
sign (
const vfloat3&
a);
2290 vfloat3
ceil (
const vfloat3&
a);
2291 vfloat3
floor (
const vfloat3&
a);
2292 vfloat3
round (
const vfloat3&
a);
2302 #ifndef OIIO_SIMD_SSE
2303 : m_mat(Imath::UNINITIALIZED)
2310 m_row[0].load (M[0]);
2311 m_row[1].load (M[1]);
2312 m_row[2].load (M[2]);
2313 m_row[3].load (M[3]);
2322 m_row[0].load (f+0);
2323 m_row[1].load (f+4);
2324 m_row[2].load (f+8);
2325 m_row[3].load (f+12);
2335 m_row[0] =
a; m_row[1] =
b; m_row[2] =
c; m_row[3] = d;
2345 const float *
c,
const float *d) {
2347 m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2349 memcpy (m_mat[0], a, 4*
sizeof(
float));
2350 memcpy (m_mat[1], b, 4*
sizeof(
float));
2351 memcpy (m_mat[2], c, 4*
sizeof(
float));
2352 memcpy (m_mat[3], d, 4*
sizeof(
float));
2358 float f10,
float f11,
float f12,
float f13,
2359 float f20,
float f21,
float f22,
float f23,
2360 float f30,
float f31,
float f32,
float f33)
2363 m_row[0].load (f00, f01, f02, f03);
2364 m_row[1].load (f10, f11, f12, f13);
2365 m_row[2].load (f20, f21, f22, f23);
2366 m_row[3].load (f30, f31, f32, f33);
2368 m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2369 m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2370 m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2371 m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2421 vfloat3
transformp (
const matrix44 &M,
const vfloat3 &V);
2425 vfloat3
transformv (
const matrix44 &M,
const vfloat3 &V);
2429 vfloat3
transformvT (
const matrix44 &M,
const vfloat3 &V);
2461 float e,
float f,
float g,
float h) {
load(a,b,c,d,e,f,g,h); }
2499 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2553 void load (
float a,
float b,
float c,
float d,
2554 float e,
float f,
float g,
float h);
2575 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2585 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2600 template<
int scale=4>
2602 template<
int scale=4>
2605 template<
int scale=4>
2609 template<
int scale=4>
2612 template<
int scale=4>
2614 template<
int scale=4>
2656 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
2676 vfloat8
vdot (
const vfloat8 &
a,
const vfloat8 &
b);
2679 float dot (
const vfloat8 &
a,
const vfloat8 &
b);
2683 vfloat8
vdot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2686 float dot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2690 vfloat8
blend (
const vfloat8&
a,
const vfloat8&
b,
const vbool8&
mask);
2695 vfloat8
blend0 (
const vfloat8&
a,
const vbool8&
mask);
2704 vfloat8
safe_div (
const vfloat8 &
a,
const vfloat8 &
b);
2709 vfloat8
select (
const vbool8&
mask,
const vfloat8&
a,
const vfloat8&
b);
2712 vfloat8
abs (
const vfloat8&
a);
2713 vfloat8
sign (
const vfloat8&
a);
2714 vfloat8
ceil (
const vfloat8&
a);
2715 vfloat8
floor (
const vfloat8&
a);
2716 vint8
ifloor (
const vfloat8&
a);
2726 vfloat8
round (
const vfloat8&
a);
2733 vint8
rint (
const vfloat8&
a);
2736 vfloat8
sqrt (
const vfloat8 &
a);
2737 vfloat8
rsqrt (
const vfloat8 &
a);
2739 vfloat8
min (
const vfloat8&
a,
const vfloat8&
b);
2740 vfloat8
max (
const vfloat8&
a,
const vfloat8&
b);
2745 vfloat8
andnot (
const vfloat8&
a,
const vfloat8&
b);
2748 vfloat8
madd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2749 vfloat8
msub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2750 vfloat8
nmadd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2751 vfloat8
nmsub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2781 float v4,
float v5,
float v6,
float v7,
2782 float v8,
float v9,
float v10,
float v11,
2783 float v12,
float v13,
float v14,
float v15);
2824 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2879 float v4,
float v5,
float v6,
float v7,
2880 float v8,
float v9,
float v10,
float v11,
2881 float v12,
float v13,
float v14,
float v15);
2902 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2912 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2927 template<
int scale=4>
2930 template<
int scale=4>
2932 template<
int scale=4>
2934 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
2938 template<
int scale=4>
2941 template<
int scale=4>
2943 template<
int scale=4>
2945 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
2986 template<
int i0,
int i1,
int i2,
int i3>
2993 template<
int i0,
int i1,
int i2,
int i3>
2997 template<
int i> vfloat16
shuffle (
const vfloat16&
a);
3014 vfloat16
blend (
const vfloat16&
a,
const vfloat16&
b,
const vbool4&
mask);
3019 vfloat16
blend0 (
const vfloat16&
a,
const vbool4&
mask);
3028 vfloat16
safe_div (
const vfloat16 &
a,
const vfloat16 &
b);
3033 vfloat16
select (
const vbool16&
mask,
const vfloat16&
a,
const vfloat16&
b);
3036 vfloat16
abs (
const vfloat16&
a);
3037 vfloat16
sign (
const vfloat16&
a);
3038 vfloat16
ceil (
const vfloat16&
a);
3039 vfloat16
floor (
const vfloat16&
a);
3040 vint16
ifloor (
const vfloat16&
a);
3051 vfloat16
round (
const vfloat16&
a);
3058 vint16
rint (
const vfloat16&
a);
3061 vfloat16
sqrt (
const vfloat16 &
a);
3062 vfloat16
rsqrt (
const vfloat16 &
a);
3064 vfloat16
min (
const vfloat16&
a,
const vfloat16&
b);
3065 vfloat16
max (
const vfloat16&
a,
const vfloat16&
b);
3070 vfloat16
andnot (
const vfloat16&
a,
const vfloat16&
b);
3073 vfloat16
madd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3074 vfloat16
msub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3075 vfloat16
nmadd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3076 vfloat16
nmsub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3085 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3086 _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
3095 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3096 _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
3104 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3105 return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
3112 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3113 return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
3151 return ((_mm_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3165 m_val[i] = value ? -1 : 0;
3171 for (
int i = 1; i < a.elements; ++i)
3172 cout <<
' ' << a[i];
3179 m_simd = _mm_castsi128_ps(_mm_set1_epi32(-
int(a)));
3180 #elif OIIO_SIMD_NEON
3181 m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3193 m_simd = _mm_castsi128_ps(_mm_set_epi32(-
int(d), -
int(c), -
int(b), -
int(a)));
3205 load (a[0], a[1], a[2], a[3]);
3216 return _mm_movemask_ps(
m_simd);
3236 m_simd = _mm_setzero_ps();
3245 return _mm_setzero_ps();
3254 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3255 __m128i anyval = _mm_undefined_si128();
3257 __m128i anyval = _mm_setzero_si128();
3259 return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3271 for (
int i = 0; i <
n; ++i)
3272 values[i] =
m_val[i] ?
true :
false;
3280 #elif OIIO_SIMD_NEON
3281 return vmvnq_u32(a.
simd());
3289 return _mm_and_ps (a.
simd(), b.
simd());
3290 #elif OIIO_SIMD_NEON
3291 return vandq_u32(a.
simd(), b.
simd());
3299 return _mm_or_ps (a.
simd(), b.
simd());
3300 #elif OIIO_SIMD_NEON
3301 return vorrq_u32(a.
simd(), b.
simd());
3309 return _mm_xor_ps (a.
simd(), b.
simd());
3310 #elif OIIO_SIMD_NEON
3311 return veorq_u32(a.
simd(), b.
simd());
3334 #elif OIIO_SIMD_NEON
3335 return vmvnq_u32(a.
m_simd);
3343 return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3344 #elif OIIO_SIMD_NEON
3353 return _mm_xor_ps (a, b);
3354 #elif OIIO_SIMD_NEON
3366 template<
int i0,
int i1,
int i2,
int i3>
3368 return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3,
i2,
i1, i0));
3372 #if OIIO_SIMD_SSE >= 3
3375 return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(
a)));
3378 return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(
a)));
3381 return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(
a)));
3386 template<
int i0,
int i1,
int i2,
int i3>
3388 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3,
i2,
i1, i0)));
3392 #if OIIO_SIMD_SSE >= 3
3395 return _mm_moveldup_ps(a);
3398 return _mm_movehdup_ps(a);
3401 return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3408 template<
int i0,
int i1,
int i2,
int i3>
3411 return shuffle_sse<i0,i1,i2,i3> (a.
simd());
3419 return shuffle<i,i,i,i>(
a);
3427 #if OIIO_SIMD_SSE >= 4
3428 return _mm_extract_epi32(_mm_castps_si128(a.
simd()), i);
3437 #if OIIO_SIMD_SSE >= 4
3438 int ival = -
int(val);
3439 return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3449 return _mm_testc_ps (v,
vbool4(
true)) != 0;
3451 return _mm_movemask_ps(v.
simd()) == 0xf;
3459 return ! _mm_testz_ps (v, v);
3461 return _mm_movemask_ps(v) != 0;
3480 return ((_mm256_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3488 m_val[i] = value ? -1 : 0;
3499 for (
int i = 1; i < a.elements; ++i)
3500 cout <<
' ' << a[i];
3507 m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-
int(a)));
3508 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3519 bool e,
bool f,
bool g,
bool h) {
3523 m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-
int(h), -
int(g), -
int(f), -
int(e),
3524 -
int(d), -
int(c), -
int(b), -
int(a)));
3525 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3541 bool e,
bool f,
bool g,
bool h) {
3542 load (a, b, c, d, e, f, g, h);
3546 int e,
int f,
int g,
int h) {
3547 load (
bool(a),
bool(b),
bool(c),
bool(d),
3548 bool(e),
bool(f),
bool(g),
bool(h));
3552 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3568 return _mm256_movemask_ps(
m_simd);
3584 m_simd = _mm256_setzero_ps();
3592 return _mm256_setzero_ps();
3601 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3603 __m256i anyval = _mm256_undefined_si256();
3604 return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3606 return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3620 for (
int i = 0; i <
n; ++i)
3621 values[i] =
m_val[i] ?
true :
false;
3627 return _mm256_castps256_ps128 (
simd());
3635 return _mm256_extractf128_ps (
simd(), 1);
3644 __m256
r = _mm256_castps128_ps256 (lo);
3645 m_simd = _mm256_insertf128_ps (r, hi, 1);
3664 return _mm256_and_ps (a.
simd(), b.
simd());
3672 return _mm256_or_ps (a.
simd(), b.
simd());
3680 return _mm256_xor_ps (a.
simd(), b.
simd());
3711 #if OIIO_SIMD_AVX >= 2
3712 return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3714 return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3722 return _mm256_xor_ps (a, b);
3729 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
3731 #if OIIO_SIMD_AVX >= 2
3733 return _mm256_permutevar8x32_ps (a.
simd(), index.
simd());
3735 return vbool8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3740 return shuffle<i,i,i,i,i,i,i,i>(
a);
3746 #if OIIO_SIMD_AVX && !_WIN32
3747 return _mm256_extract_epi32(_mm256_castps_si256(a.
simd()), i);
3755 #if OIIO_SIMD_AVX && !_WIN32
3756 int ival = -
int(val);
3757 return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.
simd()), ival, i));
3768 return _mm256_testc_ps (v,
vbool8(
true)) != 0;
3777 return ! _mm256_testz_ps (v, v);
3797 #if OIIO_SIMD_AVX >= 512
3798 return (
int(
m_simd) >> i) & 1;
3800 return (
m_bits >> i) & 1;
3807 bits &= (0xffff ^ (1<<i));
3808 bits |= (
int(value)<<i);
3815 for (
int i = 1; i < a.elements; ++i)
3816 cout <<
' ' << a[i];
3832 bool v4,
bool v5,
bool v6,
bool v7,
3833 bool v8,
bool v9,
bool v10,
bool v11,
3834 bool v12,
bool v13,
bool v14,
bool v15) {
3854 bool v4,
bool v5,
bool v6,
bool v7,
3855 bool v8,
bool v9,
bool v10,
bool v11,
3856 bool v12,
bool v13,
bool v14,
bool v15) {
3857 load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3861 int v4,
int v5,
int v6,
int v7,
3862 int v8,
int v9,
int v10,
int v11,
3863 int v12,
int v13,
int v14,
int v15) {
3864 load (
bool(v0),
bool(v1),
bool(v2),
bool(v3),
3865 bool(v4),
bool(v5),
bool(v6),
bool(v7),
3866 bool(v8),
bool(v9),
bool(v10),
bool(v11),
3867 bool(v12),
bool(v13),
bool(v14),
bool(v15));
3875 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3876 a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3892 #if OIIO_SIMD_AVX >= 512
3920 for (
int i = 0; i <
n; ++i)
3921 values[i] =
m_bits & (1<<i);
3927 #if OIIO_SIMD_AVX >= 512
3928 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()&0xff, -1));
3935 #if OIIO_SIMD_AVX >= 512
3936 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()>>8, -1));
3944 #if OIIO_SIMD_AVX >= 512
3945 return _mm512_knot (a.
simd());
3952 #if OIIO_SIMD_AVX >= 512
3953 return _mm512_kand (a.
simd(), b.
simd());
3960 #if OIIO_SIMD_AVX >= 512
3961 return _mm512_kor (a.
simd(), b.
simd());
3968 #if OIIO_SIMD_AVX >= 512
3969 return _mm512_kxor (a.
simd(), b.
simd());