70 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
71 # include <x86intrin.h>
72 #elif defined(__GNUC__) && defined(__ARM_NEON__)
73 # include <arm_neon.h>
79 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
83 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
84 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
85 # define OIIO_SIMD_SSE 4
91 # elif defined(__SSSE3__)
92 # define OIIO_SIMD_SSE 3
101 # define OIIO_SIMD_SSE 2
104 # define OIIO_SIMD_MAX_SIZE_BYTES 16
105 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
106 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
108 # define OIIO_SIMD_SSE 0
111 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
113 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
114 # define OIIO_SIMD_AVX 2
116 # define OIIO_SIMD_AVX 1
120 # undef OIIO_SIMD_MAX_SIZE_BYTES
121 # define OIIO_SIMD_MAX_SIZE_BYTES 32
122 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
123 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
124 # if defined(__AVX512F__)
125 # undef OIIO_SIMD_AVX
126 # define OIIO_SIMD_AVX 512
127 # undef OIIO_SIMD_MAX_SIZE_BYTES
128 # define OIIO_SIMD_MAX_SIZE_BYTES 64
130 # define OIIO_SIMD 16
131 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
132 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
133 # define OIIO_AVX512F_ENABLED 1
135 # if defined(__AVX512DQ__)
136 # define OIIO_AVX512DQ_ENABLED 1
138 # define OIIO_AVX512DQ_ENABLED 0
140 # if defined(__AVX512PF__)
141 # define OIIO_AVX512PF_ENABLED 1
143 # define OIIO_AVX512PF_ENABLED 0
145 # if defined(__AVX512ER__)
146 # define OIIO_AVX512ER_ENABLED 1
148 # define OIIO_AVX512ER_ENABLED 0
150 # if defined(__AVX512CD__)
151 # define OIIO_AVX512CD_ENABLED 1
153 # define OIIO_AVX512CD_ENABLED 0
155 # if defined(__AVX512BW__)
156 # define OIIO_AVX512BW_ENABLED 1
158 # define OIIO_AVX512BW_ENABLED 0
160 # if defined(__AVX512VL__)
161 # define OIIO_AVX512VL_ENABLED 1
163 # define OIIO_AVX512VL_ENABLED 0
166 # define OIIO_SIMD_AVX 0
167 # define OIIO_AVX512VL_ENABLED 0
168 # define OIIO_AVX512DQ_ENABLED 0
169 # define OIIO_AVX512PF_ENABLED 0
170 # define OIIO_AVX512ER_ENABLED 0
171 # define OIIO_AVX512CD_ENABLED 0
172 # define OIIO_AVX512BW_ENABLED 0
176 # define OIIO_FMA_ENABLED 1
178 # define OIIO_FMA_ENABLED 0
180 #if defined(__AVX512IFMA__)
181 # define OIIO_AVX512IFMA_ENABLED 1
183 # define OIIO_AVX512IFMA_ENABLED 0
186 #if defined(__F16C__)
187 # define OIIO_F16C_ENABLED 1
189 # define OIIO_F16C_ENABLED 0
194 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
196 # define OIIO_SIMD_NEON 1
197 # define OIIO_SIMD_MAX_SIZE_BYTES 16
198 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
199 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
201 # define OIIO_SIMD_NEON 0
207 # define OIIO_SIMD4_ALIGN
208 # define OIIO_SIMD_MAX_SIZE_BYTES 16
211 #ifndef OIIO_SIMD8_ALIGN
212 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
214 #ifndef OIIO_SIMD16_ALIGN
215 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
224 #define OIIO_SIMD_HAS_MATRIX4 1
225 #define OIIO_SIMD_HAS_FLOAT8 1
226 #define OIIO_SIMD_HAS_SIMD8 1
227 #define OIIO_SIMD_HAS_SIMD16 1
// Specialization for a 4-lane boolean vector: the nested `type` typedef
// names the raw register type, here the 128-bit float type __m128.
281 template<>
struct simd_bool_t<4> {
typedef __m128
type; };
// Raw register types for 8-lane vectors, exposed through the nested
// `type` typedef of each specialization.
// 8 x int lanes -> __m256i
285 template<>
struct simd_raw_t<
int,8> {
typedef __m256i
type; };
// 8 x float lanes -> __m256
286 template<>
struct simd_raw_t<float,8> {
typedef __m256
type; };
// 8-lane boolean mask -> __m256 (same raw type as the 8-lane float vector)
287 template<>
struct simd_bool_t<8> {
typedef __m256
type; };
290 #if OIIO_SIMD_AVX >= 512
// Raw register types for 16-lane vectors, exposed through the nested
// `type` typedef of each specialization.
// 16 x int lanes -> __m512i
291 template<>
struct simd_raw_t<
int,16> {
typedef __m512i
type; };
// 16 x float lanes -> __m512
292 template<>
struct simd_raw_t<float,16> {
typedef __m512
type; };
// 16-lane boolean mask -> __mmask16. Unlike the 4- and 8-lane masks, this
// is not the same raw type as the float vector of the same width.
293 template<>
struct simd_bool_t<16> {
typedef __mmask16
type; };
// NEON raw types for 4-lane vectors.
// 4 x float lanes -> float32x4_t
302 template<>
struct simd_raw_t<float,4> {
typedef float32x4_t
type; };
// 4-lane boolean mask -> uint32x4_t (unsigned integer lanes, not float)
303 template<>
struct simd_bool_t<4> {
typedef uint32x4_t
type; };
// Primary template of VecType, parameterized on element type `T` and lane
// count `elements`. The primary is intentionally empty (no members);
// presumably specializations elsewhere supply the mapping -- not visible here.
309 template<
typename T,
int elements>
struct VecType {};
// Primary template of SimdTypeName: name() returns the string "unknown"
// for any T that has no more specific specialization.
343 template<
typename T>
struct SimdTypeName {
static const char *
name() {
return "unknown"; } };
// 4-element constant tables. Each macro declares a `static const` array
// called `name`, qualified with OIIO_SIMD4_ALIGN. The *_CONST forms repeat
// one value in all four elements; the *_CONST4 forms take one value per
// element (v0 is element 0). No trailing semicolon is supplied.

// float[4], all elements = val
358 # define OIIO_SIMD_FLOAT4_CONST(name,val) \
359 static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
// float[4], elements = v0..v3
360 # define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
361 static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
// int[4], all elements = val
362 # define OIIO_SIMD_INT4_CONST(name,val) \
363 static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
// int[4], elements = v0..v3
364 # define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
365 static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
// uint32_t[4], all elements = val
366 # define OIIO_SIMD_UINT4_CONST(name,val) \
367 static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
// uint32_t[4], elements = v0..v3
368 # define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
369 static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
// 8-element constant tables. Each macro declares a `static const` array
// called `name`, qualified with OIIO_SIMD8_ALIGN. The *_CONST forms repeat
// one value in all eight elements; the *_CONST8 forms take one value per
// element (v0 is element 0). No trailing semicolon is supplied.

// float[8], all elements = val
371 # define OIIO_SIMD_FLOAT8_CONST(name,val) \
372 static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
373 (val), (val), (val), (val) }
// float[8], elements = v0..v7
374 # define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
375 static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
376 (v4), (v5), (v6), (v7) }
// int[8], all elements = val
377 # define OIIO_SIMD_INT8_CONST(name,val) \
378 static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
379 (val), (val), (val), (val) }
// int[8], elements = v0..v7
380 # define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
381 static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
382 (v4), (v5), (v6), (v7) }
// uint32_t[8], all elements = val
383 # define OIIO_SIMD_UINT8_CONST(name,val) \
384 static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
385 (val), (val), (val), (val) }
// uint32_t[8], elements = v0..v7
386 # define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
387 static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
388 (v4), (v5), (v6), (v7) }
// 16-element constant tables. Each macro declares a `static const` array
// called `name`, qualified with OIIO_SIMD16_ALIGN. The *_CONST forms repeat
// one value in all sixteen elements; the *_CONST16 forms take one value per
// element (v0 is element 0). No trailing semicolon is supplied.
// Note the float variants are spelled VFLOAT16, unlike FLOAT4 / FLOAT8.

// float[16], all elements = val
390 # define OIIO_SIMD_VFLOAT16_CONST(name,val) \
391 static const OIIO_SIMD16_ALIGN float name[16] = { \
392 (val), (val), (val), (val), (val), (val), (val), (val), \
393 (val), (val), (val), (val), (val), (val), (val), (val) }
// float[16], elements = v0..v15
394 # define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
395 static const OIIO_SIMD16_ALIGN float name[16] = { \
396 (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
397 (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// int[16], all elements = val
398 # define OIIO_SIMD_INT16_CONST(name,val) \
399 static const OIIO_SIMD16_ALIGN int name[16] = { \
400 (val), (val), (val), (val), (val), (val), (val), (val), \
401 (val), (val), (val), (val), (val), (val), (val), (val) }
// int[16], elements = v0..v15
402 # define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
403 static const OIIO_SIMD16_ALIGN int name[16] = { \
404 (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
405 (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// uint32_t[16], all elements = val
406 # define OIIO_SIMD_UINT16_CONST(name,val) \
407 static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
408 (val), (val), (val), (val), (val), (val), (val), (val), \
409 (val), (val), (val), (val), (val), (val), (val), (val) }
// Declare `name` as a `static const` array of sixteen uint32_t values,
// qualified with OIIO_SIMD16_ALIGN and initialized element by element:
// element k receives argument vk (v0 is element 0, v15 is element 15).
// Every argument is parenthesized so arbitrary constant expressions are
// safe to pass. No trailing semicolon is supplied; the caller writes it.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// Element-wise loop helpers. They expand to bare statements that pick up
// names from the expansion site (`elements`, `paddedelements`, `m_val`, `v`)
// and introduce the loop index `i`. None of them is wrapped in do/while,
// so do not use one as the body of an unbraced if/else.

// Run statement `x` once for each i in [0, elements).
420 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
// Assign m_val[i] = (x) for each i in [0, elements).
421 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
// As SIMD_CONSTRUCT, then set m_val[i] to 0 for each i in
// [elements, paddedelements).
422 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
423 for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
// Declare `T r`, set r[i] = (x) for each i in [0, r.elements), and return r.
424 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
// Declare `T r = init`, execute `op` for each i in [0, v.elements), and
// return r. `op` is expected to fold element i of `v` into `r`.
425 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
465 explicit vbool4 (
const bool *
a);
475 load (
bool(a),
bool(b),
bool(c),
bool(d));
524 void load (
bool a,
bool b,
bool c,
bool d);
561 template<
int i0,
int i1,
int i2,
int i3> vbool4
shuffle (
const vbool4&
a);
564 template<
int i> vbool4
shuffle (
const vbool4&
a);
568 template<
int i>
bool extract (
const vbool4&
a);
571 template<
int i> vbool4
insert (
const vbool4&
a,
bool val);
578 bool all (
const vbool4&
v);
579 bool any (
const vbool4&
v);
580 bool none (
const vbool4&
v);
// Scalar overload of all(): for a single bool the result is the value
// itself, so code can call all() on either a bool or a vbool type.
583 inline bool all (
bool v) {
return v; }
608 vbool8 (
bool a,
bool b,
bool c,
bool d,
bool e,
bool f,
bool g,
bool h);
614 vbool8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
671 void load (
bool a,
bool b,
bool c,
bool d,
672 bool e,
bool f,
bool g,
bool h);
710 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
714 template<
int i> vbool8
shuffle (
const vbool8&
a);
718 template<
int i>
bool extract (
const vbool8&
a);
721 template<
int i> vbool8
insert (
const vbool8&
a,
bool val);
728 bool all (
const vbool8&
v);
729 bool any (
const vbool8&
v);
730 bool none (
const vbool8&
v);
758 vbool16 (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
759 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
765 vbool16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
766 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
821 void load (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
822 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
863 template<
int i>
bool extract (
const vbool16&
a);
866 template<
int i> vbool16
insert (
const vbool16&
a,
bool val);
873 bool all (
const vbool16&
v);
874 bool any (
const vbool16&
v);
875 bool none (
const vbool16&
v);
909 vint4 (
const int *vals);
912 explicit vint4 (
const unsigned short *vals);
915 explicit vint4 (
const short *vals);
918 explicit vint4 (
const unsigned char *vals);
921 explicit vint4 (
const char *vals);
988 void load (
int a,
int b,
int c,
int d);
1032 template<
int scale=4>
1035 template<
int scale=4>
1037 template<
int scale=4>
1041 template<
int scale=4>
1044 template<
int scale=4>
1046 template<
int scale=4>
1097 vint4
srl (
const vint4&
val,
const unsigned int bits);
1101 template<
int i0,
int i1,
int i2,
int i3> vint4
shuffle (
const vint4&
a);
1104 template<
int i> vint4
shuffle (
const vint4&
a);
1108 template<
int i>
int extract (
const vint4&
v);
1120 vint4
blend (
const vint4&
a,
const vint4&
b,
const vbool4&
mask);
1125 vint4
blend0 (
const vint4&
a,
const vbool4&
mask);
1135 vint4
select (
const vbool4&
mask,
const vint4&
a,
const vint4&
b);
1138 vint4
abs (
const vint4&
a);
1139 vint4
min (
const vint4&
a,
const vint4&
b);
1140 vint4
max (
const vint4&
a,
const vint4&
b);
1143 vint4
rotl (
const vint4&
x,
const int s);
1145 vint4
rotl32 (
const vint4&
x,
const unsigned int k);
1148 vint4
andnot (
const vint4&
a,
const vint4&
b);
1155 void transpose (vint4 &
a, vint4 &
b, vint4 &
c, vint4 &d);
1156 void transpose (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d,
1157 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1159 vint4
AxBxCxDx (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d);
1162 vint4
safe_mod (
const vint4&
a,
const vint4&
b);
1193 vint8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1196 vint8 (
const int *vals);
1199 explicit vint8 (
const unsigned short *vals);
1202 explicit vint8 (
const short *vals);
1205 explicit vint8 (
const unsigned char *vals);
1208 explicit vint8 (
const char *vals);
1284 void load (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1328 template<
int scale=4>
1331 template<
int scale=4>
1333 template<
int scale=4>
1337 template<
int scale=4>
1340 template<
int scale=4>
1342 template<
int scale=4>
1394 vint8
srl (
const vint8&
val,
const unsigned int bits);
1398 template<
int i0,
int i1,
int i2,
int i3,
1399 int i4,
int i5,
int i6,
int i7> vint8
shuffle (
const vint8&
a);
1402 template<
int i> vint8
shuffle (
const vint8&
a);
1406 template<
int i>
int extract (
const vint8&
v);
1409 template<
int i> vint8
insert (
const vint8&
a,
int val);
1421 vint8
blend (
const vint8&
a,
const vint8&
b,
const vbool8&
mask);
1426 vint8
blend0 (
const vint8&
a,
const vbool8&
mask);
1436 vint8
select (
const vbool8&
mask,
const vint8&
a,
const vint8&
b);
1439 vint8
abs (
const vint8&
a);
1440 vint8
min (
const vint8&
a,
const vint8&
b);
1441 vint8
max (
const vint8&
a,
const vint8&
b);
1444 vint8
rotl (
const vint8&
x,
const int s);
1446 vint8
rotl32 (
const vint8&
x,
const unsigned int k);
1449 vint8
andnot (
const vint8&
a,
const vint8&
b);
1457 vint8
safe_mod (
const vint8&
a,
const vint8&
b);
1486 vint16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1487 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1490 vint16 (
const int *vals);
1493 explicit vint16 (
const unsigned short *vals);
1496 explicit vint16 (
const short *vals);
1499 explicit vint16 (
const unsigned char *vals);
1502 explicit vint16 (
const char *vals);
1581 void load (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1582 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1626 template<
int scale=4>
1629 template<
int scale=4>
1631 template<
int scale=4>
1633 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
1637 template<
int scale=4>
1640 template<
int scale=4>
1642 template<
int scale=4>
1644 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
1696 vint16
srl (
const vint16&
val,
const unsigned int bits);
1699 template<
int i0,
int i1,
int i2,
int i3>
1703 template<
int i> vint16
shuffle4 (
const vint16&
a);
1706 template<
int i0,
int i1,
int i2,
int i3>
1710 template<
int i> vint16
shuffle (
const vint16&
a);
1714 template<
int i>
int extract (
const vint16&
v);
1717 template<
int i> vint16
insert (
const vint16&
a,
int val);
1729 vint16
blend (
const vint16&
a,
const vint16&
b,
const vbool16&
mask);
1734 vint16
blend0 (
const vint16&
a,
const vbool16&
mask);
1744 vint16
select (
const vbool16&
mask,
const vint16&
a,
const vint16&
b);
1747 vint16
abs (
const vint16&
a);
1748 vint16
min (
const vint16&
a,
const vint16&
b);
1749 vint16
max (
const vint16&
a,
const vint16&
b);
1752 vint16
rotl (
const vint16&
x,
const int s);
1754 vint16
rotl32 (
const vint16&
x,
const unsigned int k);
1757 vint16
andnot (
const vint16&
a,
const vint16&
b);
1765 vint16
safe_mod (
const vint16&
a,
const vint16&
b);
/// Four-scalar overload of load(). The last argument may be omitted, in
/// which case `d` defaults to 0.0f.
void load (float a, float b, float c, float d = 0.0f);
1942 template<
int scale=4>
1945 template<
int scale=4>
1947 template<
int scale=4>
1951 template<
int scale=4>
1954 template<
int scale=4>
1956 template<
int scale=4>
2012 template<
int i0,
int i1,
int i2,
int i3> vfloat4
shuffle (
const vfloat4&
a);
2015 template<
int i> vfloat4
shuffle (
const vfloat4&
a);
2019 template<
int i>
float extract (
const vfloat4&
a);
2022 template<
int i> vfloat4
insert (
const vfloat4&
a,
float val);
2031 vfloat4
vdot (
const vfloat4 &
a,
const vfloat4 &
b);
2034 float dot (
const vfloat4 &
a,
const vfloat4 &
b);
2038 vfloat4
vdot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2041 float dot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2045 vfloat4
blend (
const vfloat4&
a,
const vfloat4&
b,
const vbool4&
mask);
2050 vfloat4
blend0 (
const vfloat4&
a,
const vbool4&
mask);
2059 vfloat4
safe_div (
const vfloat4 &
a,
const vfloat4 &
b);
2062 vfloat3
hdiv (
const vfloat4 &
a);
2067 vfloat4
select (
const vbool4&
mask,
const vfloat4&
a,
const vfloat4&
b);
2070 vfloat4
abs (
const vfloat4&
a);
2071 vfloat4
sign (
const vfloat4&
a);
2072 vfloat4
ceil (
const vfloat4&
a);
2073 vfloat4
floor (
const vfloat4&
a);
2074 vint4
ifloor (
const vfloat4&
a);
2084 vfloat4
round (
const vfloat4&
a);
2091 vint4
rint (
const vfloat4&
a);
2094 vfloat4
sqrt (
const vfloat4 &
a);
2095 vfloat4
rsqrt (
const vfloat4 &
a);
2097 vfloat4
min (
const vfloat4&
a,
const vfloat4&
b);
2098 vfloat4
max (
const vfloat4&
a,
const vfloat4&
b);
2099 template <
typename T>
T exp (
const T&
v);
2100 template <
typename T>
T log (
const T&
v);
2103 vfloat4
andnot (
const vfloat4&
a,
const vfloat4&
b);
2106 vfloat4
madd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2107 vfloat4
msub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2108 vfloat4
nmadd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2109 vfloat4
nmsub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2114 void transpose (vfloat4 &
a, vfloat4 &
b, vfloat4 &
c, vfloat4 &d);
2115 void transpose (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c,
const vfloat4& d,
2116 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2119 vfloat4
AxBxCxDx (
const vfloat4&
a,
const vfloat4&
b,
2120 const vfloat4&
c,
const vfloat4& d);
2269 vfloat3
abs (
const vfloat3&
a);
2270 vfloat3
sign (
const vfloat3&
a);
2271 vfloat3
ceil (
const vfloat3&
a);
2272 vfloat3
floor (
const vfloat3&
a);
2273 vfloat3
round (
const vfloat3&
a);
2283 #ifndef OIIO_SIMD_SSE
2291 m_row[0].load (M[0]);
2292 m_row[1].load (M[1]);
2293 m_row[2].load (M[2]);
2294 m_row[3].load (M[3]);
2303 m_row[0].load (f+0);
2304 m_row[1].load (f+4);
2305 m_row[2].load (f+8);
2306 m_row[3].load (f+12);
2316 m_row[0] =
a; m_row[1] =
b; m_row[2] =
c; m_row[3] = d;
2326 const float *
c,
const float *d) {
2328 m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2330 memcpy (m_mat[0], a, 4*
sizeof(
float));
2331 memcpy (m_mat[1], b, 4*
sizeof(
float));
2332 memcpy (m_mat[2], c, 4*
sizeof(
float));
2333 memcpy (m_mat[3], d, 4*
sizeof(
float));
2339 float f10,
float f11,
float f12,
float f13,
2340 float f20,
float f21,
float f22,
float f23,
2341 float f30,
float f31,
float f32,
float f33)
2344 m_row[0].load (f00, f01, f02, f03);
2345 m_row[1].load (f10, f11, f12, f13);
2346 m_row[2].load (f20, f21, f22, f23);
2347 m_row[3].load (f30, f31, f32, f33);
2349 m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2350 m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2351 m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2352 m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2402 vfloat3
transformp (
const matrix44 &M,
const vfloat3 &V);
2406 vfloat3
transformv (
const matrix44 &M,
const vfloat3 &V);
2410 vfloat3
transformvT (
const matrix44 &M,
const vfloat3 &V);
2440 float e,
float f,
float g,
float h) {
load(a,b,c,d,e,f,g,h); }
2532 void load (
float a,
float b,
float c,
float d,
2533 float e,
float f,
float g,
float h);
2579 template<
int scale=4>
2581 template<
int scale=4>
2584 template<
int scale=4>
2588 template<
int scale=4>
2591 template<
int scale=4>
2593 template<
int scale=4>
2635 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
2636 vfloat8
shuffle (
const vfloat8&
a);
2639 template<
int i> vfloat8
shuffle (
const vfloat8&
a);
2643 template<
int i>
float extract (
const vfloat8&
a);
2646 template<
int i> vfloat8
insert (
const vfloat8&
a,
float val);
2655 vfloat8
vdot (
const vfloat8 &
a,
const vfloat8 &
b);
2658 float dot (
const vfloat8 &
a,
const vfloat8 &
b);
2662 vfloat8
vdot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2665 float dot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2669 vfloat8
blend (
const vfloat8&
a,
const vfloat8&
b,
const vbool8&
mask);
2674 vfloat8
blend0 (
const vfloat8&
a,
const vbool8&
mask);
2683 vfloat8
safe_div (
const vfloat8 &
a,
const vfloat8 &
b);
2688 vfloat8
select (
const vbool8&
mask,
const vfloat8&
a,
const vfloat8&
b);
2691 vfloat8
abs (
const vfloat8&
a);
2692 vfloat8
sign (
const vfloat8&
a);
2693 vfloat8
ceil (
const vfloat8&
a);
2694 vfloat8
floor (
const vfloat8&
a);
2695 vint8
ifloor (
const vfloat8&
a);
2705 vfloat8
round (
const vfloat8&
a);
2712 vint8
rint (
const vfloat8&
a);
2715 vfloat8
sqrt (
const vfloat8 &
a);
2716 vfloat8
rsqrt (
const vfloat8 &
a);
2718 vfloat8
min (
const vfloat8&
a,
const vfloat8&
b);
2719 vfloat8
max (
const vfloat8&
a,
const vfloat8&
b);
2724 vfloat8
andnot (
const vfloat8&
a,
const vfloat8&
b);
2727 vfloat8
madd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2728 vfloat8
msub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2729 vfloat8
nmadd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2730 vfloat8
nmsub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2758 float v4,
float v5,
float v6,
float v7,
2759 float v8,
float v9,
float v10,
float v11,
2760 float v12,
float v13,
float v14,
float v15);
2856 float v4,
float v5,
float v6,
float v7,
2857 float v8,
float v9,
float v10,
float v11,
2858 float v12,
float v13,
float v14,
float v15);
2904 template<
int scale=4>
2907 template<
int scale=4>
2909 template<
int scale=4>
2911 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
2915 template<
int scale=4>
2918 template<
int scale=4>
2920 template<
int scale=4>
2922 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
2963 template<
int i0,
int i1,
int i2,
int i3>
2967 template<
int i> vfloat16
shuffle4 (
const vfloat16&
a);
2970 template<
int i0,
int i1,
int i2,
int i3>
2971 vfloat16
shuffle (
const vfloat16&
a);
2974 template<
int i> vfloat16
shuffle (
const vfloat16&
a);
2978 template<
int i>
float extract (
const vfloat16&
a);
2981 template<
int i> vfloat16
insert (
const vfloat16&
a,
float val);
2991 vfloat16
blend (
const vfloat16&
a,
const vfloat16&
b,
const vbool4&
mask);
2996 vfloat16
blend0 (
const vfloat16&
a,
const vbool4&
mask);
3005 vfloat16
safe_div (
const vfloat16 &
a,
const vfloat16 &
b);
3010 vfloat16
select (
const vbool16&
mask,
const vfloat16&
a,
const vfloat16&
b);
3013 vfloat16
abs (
const vfloat16&
a);
3014 vfloat16
sign (
const vfloat16&
a);
3015 vfloat16
ceil (
const vfloat16&
a);
3016 vfloat16
floor (
const vfloat16&
a);
3017 vint16
ifloor (
const vfloat16&
a);
3027 vfloat16
round (
const vfloat16&
a);
3034 vint16
rint (
const vfloat16&
a);
3037 vfloat16
sqrt (
const vfloat16 &
a);
3038 vfloat16
rsqrt (
const vfloat16 &
a);
3040 vfloat16
min (
const vfloat16&
a,
const vfloat16&
b);
3041 vfloat16
max (
const vfloat16&
a,
const vfloat16&
b);
3046 vfloat16
andnot (
const vfloat16&
a,
const vfloat16&
b);
3049 vfloat16
madd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3050 vfloat16
msub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3051 vfloat16
nmadd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3052 vfloat16
nmsub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3062 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3063 _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
3073 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3074 _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
3082 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3083 return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
3090 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3091 return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
3129 return ((_mm_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3143 m_val[i] = value ? -1 : 0;
3149 for (
int i = 1; i < a.elements; ++i)
3150 cout <<
' ' << a[i];
3157 m_simd = _mm_castsi128_ps(_mm_set1_epi32(-
int(a)));
3158 #elif OIIO_SIMD_NEON
3159 m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3171 m_simd = _mm_castsi128_ps(_mm_set_epi32(-
int(d), -
int(c), -
int(b), -
int(a)));
3183 load (a[0], a[1], a[2], a[3]);
3194 return _mm_movemask_ps(
m_simd);
3214 m_simd = _mm_setzero_ps();
3223 return _mm_setzero_ps();
3232 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3233 __m128i anyval = _mm_undefined_si128();
3235 __m128i anyval = _mm_setzero_si128();
3237 return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3249 for (
int i = 0; i <
n; ++i)
3250 values[i] =
m_val[i] ?
true :
false;
3258 #elif OIIO_SIMD_NEON
3259 return vmvnq_u32(a.
simd());
3267 return _mm_and_ps (a.
simd(), b.
simd());
3268 #elif OIIO_SIMD_NEON
3269 return vandq_u32(a.
simd(), b.
simd());
3277 return _mm_or_ps (a.
simd(), b.
simd());
3278 #elif OIIO_SIMD_NEON
3279 return vorrq_u32(a.
simd(), b.
simd());
3287 return _mm_xor_ps (a.
simd(), b.
simd());
3288 #elif OIIO_SIMD_NEON
3289 return veorq_u32(a.
simd(), b.
simd());
3312 #elif OIIO_SIMD_NEON
3313 return vmvnq_u32(a.
m_simd);
3321 return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3322 #elif OIIO_SIMD_NEON
3331 return _mm_xor_ps (a, b);
3332 #elif OIIO_SIMD_NEON
3344 template<
int i0,
int i1,
int i2,
int i3>
3346 return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
3350 #if OIIO_SIMD_SSE >= 3
3353 return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(
a)));
3356 return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(
a)));
3359 return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(
a)));
3364 template<
int i0,
int i1,
int i2,
int i3>
3366 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
3370 #if OIIO_SIMD_SSE >= 3
3373 return _mm_moveldup_ps(a);
3376 return _mm_movehdup_ps(a);
3379 return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3386 template<
int i0,
int i1,
int i2,
int i3>
3389 return shuffle_sse<i0,i1,i2,i3> (a.
simd());
3391 return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3397 return shuffle<i,i,i,i>(
a);
3405 #if OIIO_SIMD_SSE >= 4
3406 return _mm_extract_epi32(_mm_castps_si128(a.
simd()), i);
3415 #if OIIO_SIMD_SSE >= 4
3416 int ival = -
int(val);
3417 return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3427 return _mm_testc_ps (v,
vbool4(
true)) != 0;
3429 return _mm_movemask_ps(v.
simd()) == 0xf;
3437 return ! _mm_testz_ps (v, v);
3439 return _mm_movemask_ps(v) != 0;
3458 return ((_mm256_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3466 m_val[i] = value ? -1 : 0;
3477 for (
int i = 1; i < a.elements; ++i)
3478 cout <<
' ' << a[i];
3485 m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-
int(a)));
3486 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3497 bool e,
bool f,
bool g,
bool h) {
3501 m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-
int(h), -
int(g), -
int(f), -
int(e),
3502 -
int(d), -
int(c), -
int(b), -
int(a)));
3503 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3519 bool e,
bool f,
bool g,
bool h) {
3520 load (a, b, c, d, e, f, g, h);
3524 int e,
int f,
int g,
int h) {
3525 load (
bool(a),
bool(b),
bool(c),
bool(d),
3526 bool(e),
bool(f),
bool(g),
bool(h));
3530 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3546 return _mm256_movemask_ps(
m_simd);
3562 m_simd = _mm256_setzero_ps();
3570 return _mm256_setzero_ps();
3579 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3581 __m256i anyval = _mm256_undefined_si256();
3582 return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3584 return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3598 for (
int i = 0; i <
n; ++i)
3599 values[i] =
m_val[i] ?
true :
false;
3605 return _mm256_castps256_ps128 (
simd());
3613 return _mm256_extractf128_ps (
simd(), 1);
3622 __m256
r = _mm256_castps128_ps256 (lo);
3623 m_simd = _mm256_insertf128_ps (r, hi, 1);
3642 return _mm256_and_ps (a.
simd(), b.
simd());
3650 return _mm256_or_ps (a.
simd(), b.
simd());
3658 return _mm256_xor_ps (a.
simd(), b.
simd());
3689 #if OIIO_SIMD_AVX >= 2
3690 return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3692 return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3700 return _mm256_xor_ps (a, b);
3707 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
3709 #if OIIO_SIMD_AVX >= 2
3710 vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3711 return _mm256_permutevar8x32_ps (a.
simd(), index.
simd());
3713 return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3718 return shuffle<i,i,i,i,i,i,i,i>(
a);
3724 #if OIIO_SIMD_AVX && !_WIN32
3725 return _mm256_extract_epi32(_mm256_castps_si256(a.
simd()), i);
3733 #if OIIO_SIMD_AVX && !_WIN32
3734 int ival = -
int(val);
3735 return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.
simd()), ival, i));
3746 return _mm256_testc_ps (v,
vbool8(
true)) != 0;
3755 return ! _mm256_testz_ps (v, v);
3775 #if OIIO_SIMD_AVX >= 512
3776 return (
int(
m_simd) >> i) & 1;
3778 return (
m_bits >> i) & 1;
3785 bits &= (0xffff ^ (1<<i));
3786 bits |= (
int(value)<<i);
3793 for (
int i = 1; i < a.elements; ++i)
3794 cout <<
' ' << a[i];
3810 bool v4,
bool v5,
bool v6,
bool v7,
3811 bool v8,
bool v9,
bool v10,
bool v11,
3812 bool v12,
bool v13,
bool v14,
bool v15) {
3832 bool v4,
bool v5,
bool v6,
bool v7,
3833 bool v8,
bool v9,
bool v10,
bool v11,
3834 bool v12,
bool v13,
bool v14,
bool v15) {
3835 load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3839 int v4,
int v5,
int v6,
int v7,
3840 int v8,
int v9,
int v10,
int v11,
3841 int v12,
int v13,
int v14,
int v15) {
3842 load (
bool(v0),
bool(v1),
bool(v2),
bool(v3),
3843 bool(v4),
bool(v5),
bool(v6),
bool(v7),
3844 bool(v8),
bool(v9),
bool(v10),
bool(v11),
3845 bool(v12),
bool(v13),
bool(v14),
bool(v15));
3853 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3854 a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3870 #if OIIO_SIMD_AVX >= 512
3898 for (
int i = 0; i <
n; ++i)
3899 values[i] =
m_bits & (1<<i);
3905 #if OIIO_SIMD_AVX >= 512
3906 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()&0xff, -1));
3913 #if OIIO_SIMD_AVX >= 512
3914 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()>>8, -1));
3922 #if OIIO_SIMD_AVX >= 512
3923 return _mm512_knot (a.
simd());
3930 #if OIIO_SIMD_AVX >= 512
3931 return _mm512_kand (a.
simd(), b.
simd());
3938 #if OIIO_SIMD_AVX >= 512
3939 return _mm512_kor (a.
simd(), b.
simd());
3946 #if OIIO_SIMD_AVX >= 512
3947 return _mm512_kxor (a.
simd(), b.
simd());
3973 #if OIIO_SIMD_AVX >= 512
3974 return _mm512_kxnor (a.
simd(), b.
simd());
3981 #if OIIO_SIMD_AVX >= 512
3982 return _mm512_kxor (a.
simd(), b.
simd());