HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
simd.h
Go to the documentation of this file.
1 // Copyright Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause and Apache-2.0
3 // https://github.com/AcademySoftwareFoundation/OpenImageIO
4 
5 /// @file simd.h
6 ///
7 /// @brief Classes for SIMD processing.
8 ///
9 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.):
10 /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
11 ///
12 /// Similar guide for ARM intrinsics:
13 /// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
14 ///
15 /// It helped me a lot to peruse the source of these packages:
16 /// Syrah: https://github.com/boulos/syrah
17 /// Embree: https://github.com/embree
18 /// Vectorial: https://github.com/scoopr/vectorial
19 /// Agner Fog: https://github.com/vectorclass/version2
20 ///
21 /// To find out which CPU features you have:
22 /// Linux: cat /proc/cpuinfo
23 /// OSX: sysctl machdep.cpu.features
24 ///
25 /// Additional web resources:
26 /// http://www.codersnotes.com/notes/maths-lib-2016/
27 /// https://www.agner.org/optimize/
28 /// https://www.corsix.org/content/converting-fp32-to-fp16
29 
30 // clang-format off
31 
32 #pragma once
33 #define OIIO_SIMD_H 1
34 
35 #include <algorithm>
36 #include <cmath>
37 #include <cstring>
38 
39 #ifdef OIIO_INTERNAL
40 # include <OpenImageIO/half.h>
41 #endif
42 
43 #include <OpenImageIO/dassert.h>
44 #include <OpenImageIO/platform.h>
45 #include <OpenImageIO/vecparam.h>
46 
47 #include <OpenImageIO/detail/fmt.h>
48 
49 
50 //////////////////////////////////////////////////////////////////////////
51 // Sort out which SIMD capabilities we have and set definitions
52 // appropriately. This is mostly for internal (within this file) use,
53 // but client applications using this header may find a few of the macros
54 // we define to be useful:
55 //
56 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD
57 // hardware is available, this will hold the width in number of
58 // float SIMD "lanes" of widest SIMD registers available. For
59 // example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are
60 // hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated,
61 // etc. Using SIMD classes wider than this should work (will be
62 // emulated with narrower SIMD or scalar operations), but is not
63 // expected to have high performance.
64 // OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
65 // specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
66 // higher (including AVX).
67 // OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
68 // specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
69 // OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
70 // OIIO_SIMD_MAX_SIZE : holds the width in bytes of the widest SIMD
71 // available (generally will be OIIO_SIMD*4).
72 // OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
73 // OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
74 // OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
75 // OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
76 // OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
77 // OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined
78 
79 #ifdef OIIO_NO_SIMD /* Request to disable all SIMD */
80 # define OIIO_NO_SSE 1
81 # define OIIO_NO_AVX 1
82 # define OIIO_NO_AVX2 1
83 # define OIIO_NO_NEON 1
84 #endif
85 
86 #if defined(_M_ARM64) || defined(__aarch64__) || defined(__aarch64)
87 # ifndef __ARM_NEON__
88 # define __ARM_NEON__
89 # endif
90 #endif
91 
92 // Disable Intel SIMD intrinsics on non-Intel architectures (including
93 // building for Cuda on an Intel host).
94 #if defined(_M_ARM64) || defined(__aarch64) || defined(__aarch64__) \
95  || defined(__CUDA_ARCH__)
96 # ifndef OIIO_NO_SSE
97 # define OIIO_NO_SSE 1
98 # endif
99 # ifndef OIIO_NO_AVX
100 # define OIIO_NO_AVX 1
101 # endif
102 # ifndef OIIO_NO_AVX2
103 # define OIIO_NO_AVX2 1
104 # endif
105 #endif
106 
107 #if !(defined(_M_ARM64) || defined(__aarch64) || defined(__aarch64__)) || defined(__CUDA_ARCH__)
108 # ifndef OIIO_NO_NEON
109 # define OIIO_NO_NEON 1
110 # endif
111 #endif
112 
113 #if defined(__CUDA_ARCH__)
114  // Cuda -- don't include any of these headers
115 #elif defined(_WIN32)
116 # include <intrin.h>
117 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
118 # include <x86intrin.h>
119 #elif defined(__GNUC__) && defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
120 # include <arm_neon.h>
121 #endif
122 
123 // Disable SSE for 32 bit Windows platforms, it's unreliable and hard for us
124 // to test thoroughly. We presume that anybody needing high performance
125 // badly enough to want SIMD also is on a 64 bit CPU.
126 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
127 #define OIIO_NO_SSE 1
128 #endif
129 
130 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
131 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
132 # define OIIO_SIMD_SSE 4
133  /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
134  * instructions specific to 4.2, but they are all related to string
135  * comparisons and CRCs, which don't currently seem relevant to OIIO,
136  * so for simplicity, we sweep this difference under the rug.
137  */
138 # elif defined(__SSSE3__)
139 # define OIIO_SIMD_SSE 3
140  /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
141  * there are a few older architectures that are SSE3 but not SSSE3,
142  * and this simplification means that these particular old platforms
143  * will only get SSE2 goodness out of our code. So be it. Anybody who
144  * cares about performance is probably using a 64 bit machine that's
145  * SSE 4.x or AVX by now.
146  */
147 # else
148 # define OIIO_SIMD_SSE 2
149 # endif
150 # define OIIO_SIMD 4
151 # define OIIO_SIMD_MAX_SIZE_BYTES 16
152 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
153 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
154 #else
155 # define OIIO_SIMD_SSE 0
156 #endif
157 
158 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
159  // N.B. Any machine with AVX will also have SSE
160 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
161 # define OIIO_SIMD_AVX 2
162 # else
163 # define OIIO_SIMD_AVX 1
164 # endif
165 # undef OIIO_SIMD
166 # define OIIO_SIMD 8
167 # undef OIIO_SIMD_MAX_SIZE_BYTES
168 # define OIIO_SIMD_MAX_SIZE_BYTES 32
169 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
170 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
171 # if defined(__AVX512F__)
172 # undef OIIO_SIMD_AVX
173 # define OIIO_SIMD_AVX 512
174 # undef OIIO_SIMD_MAX_SIZE_BYTES
175 # define OIIO_SIMD_MAX_SIZE_BYTES 64
176 # undef OIIO_SIMD
177 # define OIIO_SIMD 16
178 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
179 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
180 # define OIIO_AVX512F_ENABLED 1
181 # endif
182 # if defined(__AVX512DQ__)
183 # define OIIO_AVX512DQ_ENABLED 1 /* Doubleword and quadword */
184 # else
185 # define OIIO_AVX512DQ_ENABLED 0
186 # endif
187 # if defined(__AVX512PF__)
188 # define OIIO_AVX512PF_ENABLED 1 /* Prefetch */
189 # else
190 # define OIIO_AVX512PF_ENABLED 0
191 # endif
192 # if defined(__AVX512ER__)
193 # define OIIO_AVX512ER_ENABLED 1 /* Exponential & reciprocal */
194 # else
195 # define OIIO_AVX512ER_ENABLED 0
196 # endif
197 # if defined(__AVX512CD__)
198 # define OIIO_AVX512CD_ENABLED 1 /* Conflict detection */
199 # else
200 # define OIIO_AVX512CD_ENABLED 0
201 # endif
202 # if defined(__AVX512BW__)
203 # define OIIO_AVX512BW_ENABLED 1 /* Byte and word */
204 # else
205 # define OIIO_AVX512BW_ENABLED 0
206 # endif
207 # if defined(__AVX512VL__)
208 # define OIIO_AVX512VL_ENABLED 1 /* Vector length extensions */
209 # else
210 # define OIIO_AVX512VL_ENABLED 0
211 # endif
212 #else
213 # define OIIO_SIMD_AVX 0
214 # define OIIO_AVX512VL_ENABLED 0
215 # define OIIO_AVX512DQ_ENABLED 0
216 # define OIIO_AVX512PF_ENABLED 0
217 # define OIIO_AVX512ER_ENABLED 0
218 # define OIIO_AVX512CD_ENABLED 0
219 # define OIIO_AVX512BW_ENABLED 0
220 #endif
221 
222 #if defined(__FMA__)
223 # define OIIO_FMA_ENABLED 1
224 #else
225 # define OIIO_FMA_ENABLED 0
226 #endif
227 #if defined(__AVX512IFMA__)
228 # define OIIO_AVX512IFMA_ENABLED 1
229 #else
230 # define OIIO_AVX512IFMA_ENABLED 0
231 #endif
232 
233 #if defined(__F16C__)
234 # define OIIO_F16C_ENABLED 1
235 #else
236 # define OIIO_F16C_ENABLED 0
237 #endif
238 
239 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
240 # define OIIO_SIMD 4
241 # define OIIO_SIMD_NEON 1
242 # define OIIO_SIMD_MAX_SIZE_BYTES 16
243 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
244 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
245 #else
246 # define OIIO_SIMD_NEON 0
247 #endif
248 
249 #ifndef OIIO_SIMD
250  // No SIMD available
251 # define OIIO_SIMD 0
252 # define OIIO_SIMD4_ALIGN
253 # define OIIO_SIMD_MAX_SIZE_BYTES 16
254 #endif
255 
256 #ifndef OIIO_SIMD8_ALIGN
257 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
258 #endif
259 #ifndef OIIO_SIMD16_ALIGN
260 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
261 #endif
262 
263 
264 // General features that client apps may want to test for, for conditional
265 // compilation. Will add to this over time as needed. Note that just
266 // because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
267 // the vfloat8 class (and friends) are in this version of simd.h, but that's
268 // different from OIIO_SIMD >= 8, which means it's supported in hardware.
269 #define OIIO_SIMD_HAS_MATRIX4 1 /* matrix44 defined */
270 #define OIIO_SIMD_HAS_FLOAT8 1 /* DEPRECATED(1.8) */
271 #define OIIO_SIMD_HAS_SIMD8 1 /* vfloat8, vint8, vbool8 defined */
272 #define OIIO_SIMD_HAS_SIMD16 1 /* vfloat16, vint16, vbool16 defined */
273 
274 
275 // Embarrassing hack: Xlib.h #define's True and False!
276 #ifdef True
277 # undef True
278 #endif
279 #ifdef False
280 # undef False
281 #endif
282 
283 
285 
286 namespace simd {
287 
288 //////////////////////////////////////////////////////////////////////////
289 // Forward declarations of our main SIMD classes (defined later in this
// header). Narrow (4-wide) through wide (16-wide) bool/int/float vectors,
// plus the padded 3-float vector and a 4x4 matrix.
290 
291 class vbool4;
292 class vint4;
293 class vfloat4;
294 class vfloat3;
295 class matrix44;
296 class vbool8;
297 class vint8;
298 class vfloat8;
299 class vbool16;
300 class vint16;
301 class vfloat16;
302 
303 #if OIIO_DISABLE_DEPRECATED < OIIO_MAKE_VERSION(1,9,0) && !defined(OIIO_INTERNAL)
304 // Deprecated names -- remove these in 1.9
305 // These are removed from visibility for the OIIO codebase itself, or for any
306 // downstream project that defines OIIO_DISABLE_DEPRECATED to exclude
307 // declarations deprecated as of version 1.9 or later.
308 typedef vbool4 mask4; // old name
309 typedef vbool4 bool4;
310 typedef vbool8 bool8;
311 typedef vint4 int4;
312 typedef vint8 int8;
313 typedef vfloat3 float3;
314 typedef vfloat4 float4;
315 typedef vfloat8 float8;
316 #endif
317 
318 } // namespace simd
319 
320 
321 // Force has_subscript_N to understand that our simd::vfloat3 counts as a
322 // 3-vector, even though its padding to 4 values makes it look the wrong size.
323 template<> struct has_subscript_N<simd::vfloat3, float, 3> : public std::true_type { };
324 
325 
326 
327 namespace simd {
328 
329 //////////////////////////////////////////////////////////////////////////
330 // Template magic to determine the raw SIMD types involved, and other
331 // things helpful for metaprogramming.
332 
// simd_raw_t<T,N> maps a (scalar type, lane count) pair to the underlying
// hardware register type when the target ISA provides one; the unspecialized
// fallback is a plain struct wrapping a T[N] array. simd_bool_t<N> does the
// same for boolean/mask lanes.
333 template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
334 template <int N> struct simd_bool_t { struct type { int val[N]; }; };
335 
336 #if OIIO_SIMD_SSE
337 template<> struct simd_raw_t<int,4> { typedef __m128i type; };
338 template<> struct simd_raw_t<float,4> { typedef __m128 type; };
339 template<> struct simd_bool_t<4> { typedef __m128 type; };
340 #endif
341 
342 #if OIIO_SIMD_AVX
343 template<> struct simd_raw_t<int,8> { typedef __m256i type; };
344 template<> struct simd_raw_t<float,8> { typedef __m256 type; };
345 template<> struct simd_bool_t<8> { typedef __m256 type; };
346 #endif
347 
348 #if OIIO_SIMD_AVX >= 512
349 template<> struct simd_raw_t<int,16> { typedef __m512i type; };
350 template<> struct simd_raw_t<float,16> { typedef __m512 type; };
351 template<> struct simd_bool_t<16> { typedef __mmask16 type; };
352 #else
353 // Note: change in strategy for 16-wide SIMD: instead of int[16] for
354 // vbool16, it's just a plain old bitmask, and __mmask16 for actual HW.
355 template<> struct simd_bool_t<16> { typedef uint16_t type; };
356 #endif
357 
358 #if OIIO_SIMD_NEON
359 template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
360 template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
361 template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
362 #endif
363 
364 
365 /// Template to retrieve the vector type from the scalar. For example,
366 /// simd::VecType<int,4>::type will be vint4.
367 template<typename T,int elements> struct VecType {};
368 template<> struct VecType<int,1> { typedef int type; };
369 template<> struct VecType<float,1> { typedef float type; };
370 template<> struct VecType<int,4> { typedef vint4 type; };
371 template<> struct VecType<float,4> { typedef vfloat4 type; };
372 template<> struct VecType<float,3> { typedef vfloat3 type; };
373 template<> struct VecType<bool,4> { typedef vbool4 type; };
374 template<> struct VecType<int,8> { typedef vint8 type; };
375 template<> struct VecType<float,8> { typedef vfloat8 type; };
376 template<> struct VecType<bool,8> { typedef vbool8 type; };
377 template<> struct VecType<int,16> { typedef vint16 type; };
378 template<> struct VecType<float,16> { typedef vfloat16 type; };
379 template<> struct VecType<bool,16> { typedef vbool16 type; };
380 
381 /// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
382 /// anything but our SIMD types.
/// Note: vfloat3 reports 4 here (its padded storage width); SimdElements
/// below reports its logical element count of 3.
383 template<typename T> struct SimdSize { static const int size = 1; };
384 template<> struct SimdSize<vint4> { static const int size = 4; };
385 template<> struct SimdSize<vfloat4> { static const int size = 4; };
386 template<> struct SimdSize<vfloat3> { static const int size = 4; };
387 template<> struct SimdSize<vbool4> { static const int size = 4; };
388 template<> struct SimdSize<vint8> { static const int size = 8; };
389 template<> struct SimdSize<vfloat8> { static const int size = 8; };
390 template<> struct SimdSize<vbool8> { static const int size = 8; };
391 template<> struct SimdSize<vint16> { static const int size = 16; };
392 template<> struct SimdSize<vfloat16> { static const int size = 16; };
393 template<> struct SimdSize<vbool16> { static const int size = 16; };
394 
395 /// Template to retrieve the logical number of elements of a SIMD type.
396 /// Rigged to be 1 for anything but our SIMD types. This differs from
/// SimdSize only for vfloat3, which holds 3 logical elements in 4 padded
/// storage slots.
397 template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
398 template<> struct SimdElements<vfloat3> { static const int size = 3; };
399 
400 /// Template giving a printable name for each type
401 template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
402 template<> struct SimdTypeName<vfloat4> { static const char *name() { return "vfloat4"; } };
403 template<> struct SimdTypeName<vint4> { static const char *name() { return "vint4"; } };
404 template<> struct SimdTypeName<vbool4> { static const char *name() { return "vbool4"; } };
405 template<> struct SimdTypeName<vfloat8> { static const char *name() { return "vfloat8"; } };
406 template<> struct SimdTypeName<vint8> { static const char *name() { return "vint8"; } };
407 template<> struct SimdTypeName<vbool8> { static const char *name() { return "vbool8"; } };
408 template<> struct SimdTypeName<vfloat16> { static const char *name() { return "vfloat16"; } };
409 template<> struct SimdTypeName<vint16> { static const char *name() { return "vint16"; } };
410 template<> struct SimdTypeName<vbool16> { static const char *name() { return "vbool16"; } };
411 
412 /// Is a type T one of our SIMD-based types? (An integral_constant-style
/// trait: false_type for everything, true_type for each SIMD class.
/// Note that matrix44 counts as a SIMD type here.)
413 template<typename T> struct is_simd : std::false_type {};
414 template<> struct is_simd<vint4> : std::true_type {};
415 template<> struct is_simd<vfloat4> : std::true_type {};
416 template<> struct is_simd<vfloat3> : std::true_type {};
417 template<> struct is_simd<vbool4> : std::true_type {};
418 template<> struct is_simd<vint8> : std::true_type {};
419 template<> struct is_simd<vfloat8> : std::true_type {};
420 template<> struct is_simd<vbool8> : std::true_type {};
421 template<> struct is_simd<vint16> : std::true_type {};
422 template<> struct is_simd<vfloat16> : std::true_type {};
423 template<> struct is_simd<vbool16> : std::true_type {};
424 template<> struct is_simd<matrix44> : std::true_type {};
425 
426 
427 //////////////////////////////////////////////////////////////////////////
428 // Macros helpful for making static constants in code.
429 
430 # define OIIO_SIMD_FLOAT4_CONST(name,val) \
431  static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
432 # define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
433  static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
434 # define OIIO_SIMD_INT4_CONST(name,val) \
435  static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
436 # define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
437  static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
438 # define OIIO_SIMD_UINT4_CONST(name,val) \
439  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
440 # define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
441  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
442 
443 # define OIIO_SIMD_FLOAT8_CONST(name,val) \
444  static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
445  (val), (val), (val), (val) }
446 # define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
447  static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
448  (v4), (v5), (v6), (v7) }
449 # define OIIO_SIMD_INT8_CONST(name,val) \
450  static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
451  (val), (val), (val), (val) }
452 # define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
453  static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
454  (v4), (v5), (v6), (v7) }
455 # define OIIO_SIMD_UINT8_CONST(name,val) \
456  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
457  (val), (val), (val), (val) }
458 # define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
459  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
460  (v4), (v5), (v6), (v7) }
461 
462 # define OIIO_SIMD_VFLOAT16_CONST(name,val) \
463  static const OIIO_SIMD16_ALIGN float name[16] = { \
464  (val), (val), (val), (val), (val), (val), (val), (val), \
465  (val), (val), (val), (val), (val), (val), (val), (val) }
466 # define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
467  static const OIIO_SIMD16_ALIGN float name[16] = { \
468  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
469  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
470 # define OIIO_SIMD_INT16_CONST(name,val) \
471  static const OIIO_SIMD16_ALIGN int name[16] = { \
472  (val), (val), (val), (val), (val), (val), (val), (val), \
473  (val), (val), (val), (val), (val), (val), (val), (val) }
474 # define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
475  static const OIIO_SIMD16_ALIGN int name[16] = { \
476  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
477  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
478 # define OIIO_SIMD_UINT16_CONST(name,val) \
479  static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
480  (val), (val), (val), (val), (val), (val), (val), (val), \
481  (val), (val), (val), (val), (val), (val), (val), (val) }
// OIIO_SIMD_UINT16_CONST16: declare a static uint32_t[16] constant with
// 16-wide SIMD alignment, initialized element-by-element from the sixteen
// arguments.
// BUG FIX: the expansion previously repeated `(val)` sixteen times, but
// `val` is not a parameter of this macro -- the v0..v15 arguments were
// ignored and any use could not compile unless an unrelated `val` happened
// to be in scope. Compare the correct OIIO_SIMD_INT16_CONST16 above.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = {                \
        (v0), (v1), (v2),  (v3),  (v4),  (v5),  (v6),  (v7),           \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
486 
487 
488 //////////////////////////////////////////////////////////////////////////
489 // Some macros just for use in this file (#undef-ed at the end) making
490 // it more succinct to express per-element operations.
491 
492 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
493 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
494 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
495  for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
496 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
497 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
498 
499 
500 
501 //////////////////////////////////////////////////////////////////////////
502 //////////////////////////////////////////////////////////////////////////
503 // The public declarations of the main SIMD classes follow: boolN, intN,
504 // floatN, matrix44.
505 //
506 // These class declarations are intended to be brief and self-documenting,
507 // and give all the information that users or client applications need to
508 // know to use these classes.
509 //
510 // No implementations are given inline except for the briefest, completely
511 // generic methods that don't have any architecture-specific overloads.
512 // After the class definitions, there will be an immense pile of full
513 // implementation definitions, which casual users are not expected to
514 // understand.
515 //////////////////////////////////////////////////////////////////////////
516 //////////////////////////////////////////////////////////////////////////
517 
518 
519 /// vbool4: A 4-vector whose elements act mostly like bools, accelerated by
520 /// SIMD instructions when available. This is what is naturally produced by
521 /// SIMD comparison operators on the vfloat4 and vint4 types.
522 class vbool4 {
523 public:
524  static const char* type_name() { return "vbool4"; }
525  typedef bool value_t; ///< Underlying equivalent scalar value type
526  enum { elements = 4 }; ///< Number of scalar elements
527  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
528  enum { bits = elements*32 }; ///< Total number of bits
529  typedef simd_bool_t<4>::type simd_t; ///< the native SIMD type used
530  static constexpr size_t size() noexcept { return elements; }
531 
532  /// Default constructor (contents undefined)
533  vbool4 () { }
534 
535  /// Construct from a single value (store it in all slots)
536  vbool4 (bool a) { load(a); }
537 
 /// Construct from a pointer to 4 bool values
538  explicit vbool4 (const bool *a);
539 
540  /// Construct from 4 bool values
541  vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
542 
543  /// Copy construct from another vbool4
544  vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
545 
546  /// Construct from 4 int values
547  vbool4 (int a, int b, int c, int d) {
548  load (bool(a), bool(b), bool(c), bool(d));
549  }
550 
551  /// Construct from a SIMD int (is each element nonzero?)
552  vbool4 (const vint4 &i);
553 
554  /// Construct from the underlying SIMD type
555  vbool4 (const simd_t& m) : m_simd(m) { }
556 
557  /// Return the raw SIMD type
558  operator simd_t () const { return m_simd; }
559  simd_t simd () const { return m_simd; }
560  simd_t& simd () { return m_simd; }
561 
562  /// Extract the bitmask
563  int bitmask () const;
564 
565  /// Convert from integer bitmask to a true vbool4
566  static vbool4 from_bitmask (int bitmask);
567 
568  /// Set all components to false
569  void clear ();
570 
571  /// Return a vbool4 that is 'false' for all values
572  static const vbool4 False ();
573 
574  /// Return a vbool4 that is 'true' for all values
575  static const vbool4 True ();
576 
577  /// Assign one value to all components
578  const vbool4 & operator= (bool a) { load(a); return *this; }
579 
580  /// Assignment of another vbool4
581  const vbool4 & operator= (const vbool4 & other);
582 
583  /// Component access (get)
584  int operator[] (int i) const;
585 
586  /// Component access (set).
587  void setcomp (int i, bool value);
588 
589  /// Component access (set).
590  /// NOTE: avoid this unsafe construct. It will go away some day.
591  int& operator[] (int i);
592 
593  /// Helper: load a single value into all components.
594  void load (bool a);
595 
596  /// Helper: load separate values into each component.
597  void load (bool a, bool b, bool c, bool d);
598 
599  /// Helper: store the values into memory as bools.
600  void store (bool *values) const;
601 
602  /// Store the first n values into memory.
603  void store (bool *values, int n) const;
604 
605  /// Logical/bitwise operators, component-by-component
606  friend vbool4 operator! (const vbool4& a);
607  friend vbool4 operator& (const vbool4& a, const vbool4& b);
608  friend vbool4 operator| (const vbool4& a, const vbool4& b);
609  friend vbool4 operator^ (const vbool4& a, const vbool4& b);
610  friend vbool4 operator~ (const vbool4& a);
611  friend const vbool4& operator&= (vbool4& a, const vbool4& b);
612  friend const vbool4& operator|= (vbool4& a, const vbool4& b);
613  friend const vbool4& operator^= (vbool4& a, const vbool4& b);
614 
615  /// Comparison operators, component by component
616  friend vbool4 operator== (const vbool4& a, const vbool4& b);
617  friend vbool4 operator!= (const vbool4& a, const vbool4& b);
618 
619  /// Stream output
620  friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);
621 
622 private:
623  // The actual data representation
 // NOTE(review): the union's members are not visible in this excerpt --
 // confirm against the full header before relying on the layout.
624  union {
627  };
628 };
629 
630 
631 
632 /// Helper: shuffle/swizzle with constant (templated) indices.
633 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
634 template<int i0, int i1, int i2, int i3>
635 OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
636 
637 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
638 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
639 
640 /// Helper: as rapid as possible extraction of one component, when the
641 /// index is fixed.
642 template<int i> OIIO_FORCEINLINE bool extract (const vbool4& a);
643 
644 /// Helper: substitute val for a[i], returning the result (a is unchanged)
645 template<int i> OIIO_FORCEINLINE vbool4 insert (const vbool4& a, bool val);
646 
647 /// Logical reduction across all components.
 /// (reduce_and: AND of all components; reduce_or: OR of all components.)
648 bool reduce_and (const vbool4& v);
649 bool reduce_or (const vbool4& v);
650 
651 // Are all/any/no components true?
652 bool all (const vbool4& v);
653 bool any (const vbool4& v);
654 bool none (const vbool4& v);
655 
656 // It's handy to have this defined for regular bool as well
657 inline bool all (bool v) { return v; }
658 
659 
660 
661 /// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
662 /// SIMD instructions when available. This is what is naturally produced by
663 /// SIMD comparison operators on the vfloat8 and vint8 types.
664 class vbool8 {
665 public:
666  static const char* type_name() { return "vbool8"; }
667  typedef bool value_t; ///< Underlying equivalent scalar value type
668  enum { elements = 8 }; ///< Number of scalar elements
669  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
670  enum { bits = elements*32 }; ///< Total number of bits
671  typedef simd_bool_t<8>::type simd_t; ///< the native SIMD type used
672  static constexpr size_t size() noexcept { return elements; }
673 
674  /// Default constructor (contents undefined)
675  vbool8 () { }
676 
677  /// Construct from a single value (store it in all slots)
678  vbool8 (bool a) { load (a); }
679 
 /// Construct from a pointer to 8 bool values
680  explicit vbool8 (const bool *values);
681 
682  /// Construct from 8 bool values
683  vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
684 
685  /// Copy construct from another vbool8
686  vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
687 
688  /// Construct from 8 int values
689  vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);
690 
691  /// Construct from a SIMD int (is each element nonzero?)
692  vbool8 (const vint8 &i);
693 
694  /// Construct from two vbool4's
695  vbool8 (const vbool4 &lo, const vbool4 &hi);
696 
697  /// Construct from the underlying SIMD type
698  vbool8 (const simd_t& m) : m_simd(m) { }
699 
700  /// Return the raw SIMD type
701  operator simd_t () const { return m_simd; }
702  simd_t simd () const { return m_simd; }
703  simd_t& simd () { return m_simd; }
704 
705  /// Extract the bitmask
706  int bitmask () const;
707 
708  /// Convert from integer bitmask to a true vbool8
709  static vbool8 from_bitmask (int bitmask);
710 
711  /// Set all components to false
712  void clear ();
713 
714  /// Return a vbool8 that is 'false' for all values
715  static const vbool8 False ();
716 
717  /// Return a vbool8 that is 'true' for all values
718  static const vbool8 True ();
719 
720  /// Assign one value to all components
721  const vbool8 & operator= (bool a);
722 
723  /// Assignment of another vbool8
724  const vbool8 & operator= (const vbool8 & other);
725 
726  /// Component access (get)
727  int operator[] (int i) const;
728 
729  /// Component access (set).
730  void setcomp (int i, bool value);
731 
732  /// Component access (set).
733  /// NOTE: avoid this unsafe construct. It will go away some day.
734  int& operator[] (int i);
735 
736  /// Extract the lower half as a vbool4
737  vbool4 lo () const;
738 
739  /// Extract the upper half as a vbool4
740  vbool4 hi () const;
741 
742  /// Helper: load a single value into all components.
743  void load (bool a);
744 
745  /// Helper: load separate values into each component.
746  void load (bool a, bool b, bool c, bool d,
747  bool e, bool f, bool g, bool h);
748 
749  /// Helper: store the values into memory as bools.
750  void store (bool *values) const;
751 
752  /// Store the first n values into memory.
753  void store (bool *values, int n) const;
754 
755  /// Logical/bitwise operators, component-by-component
756  friend vbool8 operator! (const vbool8& a);
757  friend vbool8 operator& (const vbool8& a, const vbool8& b);
758  friend vbool8 operator| (const vbool8& a, const vbool8& b);
759  friend vbool8 operator^ (const vbool8& a, const vbool8& b);
760  friend vbool8 operator~ (const vbool8& a);
761  friend const vbool8& operator&= (vbool8& a, const vbool8& b);
762  friend const vbool8& operator|= (vbool8& a, const vbool8& b);
763  friend const vbool8& operator^= (vbool8& a, const vbool8& b);
764 
765  /// Comparison operators, component by component
766  friend vbool8 operator== (const vbool8& a, const vbool8& b);
767  friend vbool8 operator!= (const vbool8& a, const vbool8& b);
768 
769  /// Stream output
770  friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);
771 
772 private:
773  // The actual data representation
 // NOTE(review): the union's members are not visible in this excerpt --
 // confirm against the full header before relying on the layout.
774  union {
778  };
779 };
780 
781 
782 
783 /// Helper: shuffle/swizzle with constant (templated) indices.
784 /// Example: shuffle<1,1,2,2,5,5,6,6>(vbool8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
785 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
786 OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
787 
788 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
789 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
790 
791 /// Helper: as rapid as possible extraction of one component, when the
792 /// index is fixed.
793 template<int i> OIIO_FORCEINLINE bool extract (const vbool8& a);
794 
795 /// Helper: substitute val for a[i], returning the result (a is unchanged)
796 template<int i> OIIO_FORCEINLINE vbool8 insert (const vbool8& a, bool val);
797 
798 /// Logical reduction across all components.
 /// (reduce_and: AND of all components; reduce_or: OR of all components.)
799 bool reduce_and (const vbool8& v);
800 bool reduce_or (const vbool8& v);
801 
802 // Are all/any/no components true?
803 bool all (const vbool8& v);
804 bool any (const vbool8& v);
805 bool none (const vbool8& v);
806 
807 
808 
809 
810 /// vbool16: An 16-vector whose elements act mostly like bools, accelerated
811 /// by SIMD instructions when available. This is what is naturally produced
812 /// by SIMD comparison operators on the vfloat16 and vint16 types.
813 class vbool16 {
814 public:
815  static const char* type_name() { return "vbool16"; }
816  typedef bool value_t; ///< Underlying equivalent scalar value type
817  enum { elements = 16 }; ///< Number of scalar elements
818  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
819  enum { bits = 16 }; ///< Total number of bits
820  typedef simd_bool_t<16>::type simd_t; ///< the native SIMD type used
821  static constexpr size_t size() noexcept { return elements; }
822 
823  /// Default constructor (contents undefined)
824  vbool16 () { }
825 
826  /// Construct from a single value (store it in all slots)
827  vbool16 (bool a) { load (a); }
828 
829  explicit vbool16 (int bitmask) { load_bitmask (bitmask); }
830 
831  explicit vbool16 (const bool *values);
832 
833  /// Construct from 16 bool values
834  vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
835  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
836 
837  /// Copy construct from another vbool16
838  vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
839 
840  /// Construct from 16 int values
841  vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
842  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
843 
844  /// Construct from a SIMD int (is each element nonzero?)
845  vbool16 (const vint16 &i);
846 
847  /// Construct from two vbool8's
848  vbool16 (const vbool8 &lo, const vbool8 &hi);
849 
850  /// Construct from four vbool4's
851  vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);
852 
853  /// Construct from the underlying SIMD type
854  vbool16 (const simd_t& m) : m_simd(m) { }
855 
856  /// Return the raw SIMD type
857  operator simd_t () const { return m_simd; }
858  simd_t simd () const { return m_simd; }
859  simd_t& simd () { return m_simd; }
860 
861  int bitmask () const;
862 
863  /// Convert from integer bitmask to a true vbool16
864  static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }
865 
866  /// Set all components to false
867  void clear ();
868 
869  /// Return a vbool16 the is 'false' for all values
870  static const vbool16 False ();
871 
872  /// Return a vbool16 the is 'true' for all values
873  static const vbool16 True ();
874 
875  /// Assign one value to all components
876  const vbool16 & operator= (bool a);
877 
878  /// Assignment of another vbool16
879  const vbool16 & operator= (const vbool16 & other);
880 
881  /// Component access (get)
882  int operator[] (int i) const;
883 
884  /// Component access (set).
885  void setcomp (int i, bool value);
886 
887  /// Extract the lower precision vbool8
888  vbool8 lo () const;
889 
890  /// Extract the higher precision vbool8
891  vbool8 hi () const;
892 
893  /// Helper: load a single value into all components.
894  void load (bool a);
895 
896  /// Helper: load separate values into each component.
897  void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
898  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
899 
900  /// Helper: load all components from a bitmask in an int.
901  void load_bitmask (int a);
902 
903  /// Helper: store the values into memory as bools.
904  void store (bool *values) const;
905 
906  /// Store the first n values into memory.
907  void store (bool *values, int n) const;
908 
909  /// Logical/bitwise operators, component-by-component
910  friend vbool4 operator! (const vbool4& a);
911  friend vbool16 operator! (const vbool16& a);
912  friend vbool16 operator& (const vbool16& a, const vbool16& b);
913  friend vbool16 operator| (const vbool16& a, const vbool16& b);
914  friend vbool16 operator^ (const vbool16& a, const vbool16& b);
915  friend vbool16 operator~ (const vbool16& a);
916  friend const vbool16& operator&= (vbool16& a, const vbool16& b);
917  friend const vbool16& operator|= (vbool16& a, const vbool16& b);
918  friend const vbool16& operator^= (vbool16& a, const vbool16& b);
919 
920  /// Comparison operators, component by component
921  friend vbool16 operator== (const vbool16& a, const vbool16& b);
922  friend vbool16 operator!= (const vbool16& a, const vbool16& b);
923 
924  /// Stream output
925  friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);
926 
927 private:
928  // The actual data representation
929  union {
931  uint16_t m_bits;
932  };
933 };
934 
935 
936 
/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed (a compile-time constant).
template<int i> OIIO_FORCEINLINE bool extract (const vbool16& a);

/// Helper: substitute val for a[i], returning the modified vector.
template<int i> OIIO_FORCEINLINE vbool16 insert (const vbool16& a, bool val);

/// Logical reduction across all components.
bool reduce_and (const vbool16& v);   // true iff ALL components are true
bool reduce_or (const vbool16& v);    // true iff ANY component is true

// Are all/any/no components true?
bool all (const vbool16& v);
bool any (const vbool16& v);
bool none (const vbool16& v);
952 
953 
954 
955 
956 
957 /// Integer 4-vector, accelerated by SIMD instructions when available.
958 class vint4 {
959 public:
960  static const char* type_name() { return "vint4"; }
961  typedef int value_t; ///< Underlying equivalent scalar value type
962  enum { elements = 4 }; ///< Number of scalar elements
963  enum { paddedelements =4 }; ///< Number of scalar elements for full pad
964  enum { bits = 128 }; ///< Total number of bits
965  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
966  typedef vbool4 vbool_t; ///< bool type of the same length
967  typedef vfloat4 vfloat_t; ///< float type of the same length
968  typedef vint4 vint_t; ///< int type of the same length
969  OIIO_DEPRECATED("use vbool_t (1.8)")
970  typedef vbool4 bool_t; // old name (deprecated 1.8)
971  OIIO_DEPRECATED("use vfloat_t (1.8)")
972  typedef vfloat4 float_t; // old name (deprecated 1.8)
973  static constexpr size_t size() noexcept { return elements; }
974 
975  /// Default constructor (contents undefined)
976  vint4 () { }
977 
978  /// Construct from a single value (store it in all slots)
979  vint4 (int a);
980 
981  /// Construct from 2 values -- (a,a,b,b)
982  vint4 (int a, int b);
983 
984  /// Construct from 4 values
985  vint4 (int a, int b, int c, int d);
986 
987  /// Construct from a pointer to values
988  vint4 (const int *vals);
989 
990  /// Construct from a pointer to unsigned short values
991  explicit vint4 (const unsigned short *vals);
992 
993  /// Construct from a pointer to signed short values
994  explicit vint4 (const short *vals);
995 
996  /// Construct from a pointer to unsigned char values (0 - 255)
997  explicit vint4 (const unsigned char *vals);
998 
999  /// Construct from a pointer to signed char values (-128 - 127)
1000  explicit vint4 (const char *vals);
1001 
1002  /// Copy construct from another vint4
1003  vint4 (const vint4 & other) { m_simd = other.m_simd; }
1004 
1005  /// Convert a vfloat to an vint. Equivalent to i = (int)f;
1006  explicit vint4 (const vfloat4& f); // implementation below
1007 
1008  /// Construct from the underlying SIMD type
1009  vint4 (const simd_t& m) : m_simd(m) { }
1010 
1011  /// Return the raw SIMD type
1012  operator simd_t () const { return m_simd; }
1013  simd_t simd () const { return m_simd; }
1014  simd_t& simd () { return m_simd; }
1015 
1016  /// Return a pointer to the underlying scalar type
1017  const value_t* data () const { return (const value_t*)this; }
1018  value_t* data () { return (value_t*)this; }
1019 
1020  /// Sset all components to 0
1021  void clear () ;
1022 
1023  /// Return an vint4 with all components set to 0
1024  static const vint4 Zero ();
1025 
1026  /// Return an vint4 with all components set to 1
1027  static const vint4 One ();
1028 
1029  /// Return an vint4 with all components set to -1 (aka 0xffffffff)
1030  static const vint4 NegOne ();
1031 
1032  /// Return an vint4 with incremented components (e.g., 0,1,2,3).
1033  /// Optional arguments can give a non-zero starting point and step size.
1034  static const vint4 Iota (int start=0, int step=1);
1035 
1036  /// Return an vint4 with "geometric" iota: (1, 2, 4, 8).
1037  static const vint4 Giota ();
1038 
1039  /// Assign one value to all components.
1040  const vint4 & operator= (int a);
1041 
1042  /// Assignment from another vint4
1043  const vint4 & operator= (const vint4& other) ;
1044 
1045  /// Component access (get)
1046  int operator[] (int i) const;
1047 
1048  /// Component access (set)
1049  int& operator[] (int i);
1050 
1051  /// Component access (set).
1052  void setcomp (int i, int value);
1053 
1054  value_t x () const;
1055  value_t y () const;
1056  value_t z () const;
1057  value_t w () const;
1058  void set_x (value_t val);
1059  void set_y (value_t val);
1060  void set_z (value_t val);
1061  void set_w (value_t val);
1062 
1063  /// Helper: load a single int into all components
1064  void load (int a);
1065 
1066  /// Helper: load separate values into each component.
1067  void load (int a, int b, int c, int d);
1068 
1069  /// Load from an array of 4 values
1070  void load (const int *values);
1071 
1072  void load (const int *values, int n) ;
1073 
1074  /// Load from an array of 4 unsigned short values, convert to vint4
1075  void load (const unsigned short *values) ;
1076 
1077  /// Load from an array of 4 unsigned short values, convert to vint4
1078  void load (const short *values);
1079 
1080  /// Load from an array of 4 unsigned char values, convert to vint4
1081  void load (const unsigned char *values);
1082 
1083  /// Load from an array of 4 unsigned char values, convert to vint4
1084  void load (const char *values);
1085 
1086  /// Store the values into memory
1087  void store (int *values) const;
1088 
1089  /// Store the first n values into memory
1090  void store (int *values, int n) const;
1091 
1092  /// Store the least significant 16 bits of each element into adjacent
1093  /// unsigned shorts.
1094  void store (unsigned short *values) const;
1095 
1096  /// Store the least significant 8 bits of each element into adjacent
1097  /// unsigned chars.
1098  void store (unsigned char *values) const;
1099 
1100  /// Masked load -- read from values[] where mask is 1, load zero where
1101  /// mask is 0.
1102  void load_mask (int mask, const value_t *values);
1103  void load_mask (const vbool_t& mask, const value_t *values);
1104 
1105  /// Masked store -- write to values[] where mask is enabled, don't
1106  /// touch values[] where it's not.
1107  void store_mask (int mask, value_t *values) const;
1108  void store_mask (const vbool_t& mask, value_t *values) const;
1109 
1110  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1111  template<int scale=4>
1112  void gather (const value_t *baseptr, const vint_t& vindex);
1113  /// Gather elements defined by the mask, leave others unchanged.
1114  template<int scale=4>
1115  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1116  template<int scale=4>
1117  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1118 
1119  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1120  template<int scale=4>
1121  void scatter (value_t *baseptr, const vint_t& vindex) const;
1122  /// Scatter elements defined by the mask
1123  template<int scale=4>
1124  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1125  template<int scale=4>
1126  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1127 
1128  // Arithmetic operators (component-by-component)
1129  friend vint4 operator+ (const vint4& a, const vint4& b);
1130  friend vint4 operator- (const vint4& a);
1131  friend vint4 operator- (const vint4& a, const vint4& b);
1132  friend vint4 operator* (const vint4& a, const vint4& b);
1133  friend vint4 operator/ (const vint4& a, const vint4& b);
1134  friend vint4 operator% (const vint4& a, const vint4& b);
1135  friend const vint4 & operator+= (vint4& a, const vint4& b);
1136  friend const vint4 & operator-= (vint4& a, const vint4& b);
1137  friend const vint4 & operator*= (vint4& a, const vint4& b);
1138  friend const vint4 & operator/= (vint4& a, const vint4& b);
1139  friend const vint4 & operator%= (vint4& a, const vint4& b);
1140  // Bitwise operators (component-by-component)
1141  friend vint4 operator& (const vint4& a, const vint4& b);
1142  friend vint4 operator| (const vint4& a, const vint4& b);
1143  friend vint4 operator^ (const vint4& a, const vint4& b);
1144  friend const vint4& operator&= (vint4& a, const vint4& b);
1145  friend const vint4& operator|= (vint4& a, const vint4& b);
1146  friend const vint4& operator^= (vint4& a, const vint4& b);
1147  friend vint4 operator~ (const vint4& a);
1148  friend vint4 operator<< (const vint4& a, unsigned int bits);
1149  friend vint4 operator>> (const vint4& a, unsigned int bits);
1150  friend const vint4& operator<<= (vint4& a, unsigned int bits);
1151  friend const vint4& operator>>= (vint4& a, unsigned int bits);
1152  // Comparison operators (component-by-component)
1153  friend vbool4 operator== (const vint4& a, const vint4& b);
1154  friend vbool4 operator!= (const vint4& a, const vint4& b);
1155  friend vbool4 operator< (const vint4& a, const vint4& b);
1156  friend vbool4 operator> (const vint4& a, const vint4& b);
1157  friend vbool4 operator>= (const vint4& a, const vint4& b);
1158  friend vbool4 operator<= (const vint4& a, const vint4& b);
1159 
1160  /// Stream output
1161  friend std::ostream& operator<< (std::ostream& cout, const vint4 & a);
1162 
1163 private:
1164  // The actual data representation
1165  union {
1168  };
1169 };
1170 
1171 
1172 
// Shift right logical -- unsigned shift. This differs from operator>>
// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
// srl((1<<31),1) == 1<<30.
vint4 srl (const vint4& val, const unsigned int bits);

/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2>(vint4(a,b,c,d)) returns (b,b,c,c)
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vint4 shuffle (const vint4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed (a compile-time constant).
template<int i> OIIO_FORCEINLINE int extract (const vint4& v);

/// The sum of all components, returned in all components.
vint4 vreduce_add (const vint4& v);

// Reduction across all components: sum, bitwise-and, bitwise-or.
int reduce_add (const vint4& v);
int reduce_and (const vint4& v);
int reduce_or (const vint4& v);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vint4 blend (const vint4& a, const vint4& b, const vbool4& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vint4 blend0 (const vint4& a, const vbool4& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vint4 blend0not (const vint4& a, const vbool4& mask);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vint4 select (const vbool4& mask, const vint4& a, const vint4& b);

// Per-element math
vint4 abs (const vint4& a);                  // per-element absolute value
vint4 min (const vint4& a, const vint4& b);  // per-element minimum
vint4 max (const vint4& a, const vint4& b);  // per-element maximum

/// Circular bit rotate by s bits, for N values at once.
vint4 rotl (const vint4& x, const int s);
// DEPRECATED(2.1) -- use rotl instead.
vint4 rotl32 (const vint4& x, const unsigned int k);

/// andnot(a,b) returns ((~a) & b)
vint4 andnot (const vint4& a, const vint4& b);

/// Bitcast back and forth to intN (not a convert -- move the bits!)
vint4 bitcast_to_int (const vbool4& x);
vint4 bitcast_to_int (const vfloat4& x);
vfloat4 bitcast_to_float (const vint4& x);

// Transpose the 4x4 matrix of ints whose rows are a,b,c,d -- first form
// in place, second form writing the result to r0..r3.
void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d);
void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
                vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);

// Assemble a vector from the first ("x") component of each of a, b, c, d.
// NOTE(review): behavior inferred from the name; confirm in implementation.
vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d);

// safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
vint4 safe_mod (const vint4& a, const vint4& b);
vint4 safe_mod (const vint4& a, int b);
1244 
1245 
1246 
1247 
1248 /// Integer 8-vector, accelerated by SIMD instructions when available.
1249 class vint8 {
1250 public:
1251  static const char* type_name() { return "vint8"; }
1252  typedef int value_t; ///< Underlying equivalent scalar value type
1253  enum { elements = 8 }; ///< Number of scalar elements
1254  enum { paddedelements =8 }; ///< Number of scalar elements for full pad
1255  enum { bits = elements*32 }; ///< Total number of bits
1256  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1257  typedef vbool8 vbool_t; ///< bool type of the same length
1258  typedef vfloat8 vfloat_t; ///< float type of the same length
1259  typedef vint8 vint_t; ///< int type of the same length
1260  OIIO_DEPRECATED("use vbool_t (1.8)")
1261  typedef vbool8 bool_t; // old name (deprecated 1.8)
1262  OIIO_DEPRECATED("use vfloat_t (1.8)")
1263  typedef vfloat8 float_t; // old name (deprecated 1.8)
1264  static constexpr size_t size() noexcept { return elements; }
1265 
1266  /// Default constructor (contents undefined)
1267  vint8 () { }
1268 
1269  /// Construct from a single value (store it in all slots)
1270  vint8 (int a);
1271 
1272  /// Construct from 2 values -- (a,a,b,b)
1273  vint8 (int a, int b);
1274 
1275  /// Construct from 8 values (won't work for vint8)
1276  vint8 (int a, int b, int c, int d, int e, int f, int g, int h);
1277 
1278  /// Construct from a pointer to values
1279  vint8 (const int *vals);
1280 
1281  /// Construct from a pointer to unsigned short values
1282  explicit vint8 (const unsigned short *vals);
1283 
1284  /// Construct from a pointer to signed short values
1285  explicit vint8 (const short *vals);
1286 
1287  /// Construct from a pointer to unsigned char values (0 - 255)
1288  explicit vint8 (const unsigned char *vals);
1289 
1290  /// Construct from a pointer to signed char values (-128 - 127)
1291  explicit vint8 (const char *vals);
1292 
1293  /// Copy construct from another vint8
1294  vint8 (const vint8 & other) { m_simd = other.m_simd; }
1295 
1296  /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
1297  explicit vint8 (const vfloat8& f); // implementation below
1298 
1299  /// Construct from two vint4's
1300  vint8 (const vint4 &lo, const vint4 &hi);
1301 
1302  /// Construct from the underlying SIMD type
1303  vint8 (const simd_t& m) : m_simd(m) { }
1304 
1305  /// Return the raw SIMD type
1306  operator simd_t () const { return m_simd; }
1307  simd_t simd () const { return m_simd; }
1308  simd_t& simd () { return m_simd; }
1309 
1310  /// Return a pointer to the underlying scalar type
1311  const value_t* data () const { return (const value_t*)this; }
1312  value_t* data () { return (value_t*)this; }
1313 
1314  /// Sset all components to 0
1315  void clear () ;
1316 
1317  /// Return an vint8 with all components set to 0
1318  static const vint8 Zero ();
1319 
1320  /// Return an vint8 with all components set to 1
1321  static const vint8 One ();
1322 
1323  /// Return an vint8 with all components set to -1 (aka 0xffffffff)
1324  static const vint8 NegOne ();
1325 
1326  /// Return an vint8 with incremented components (e.g., 0,1,2,3).
1327  /// Optional arguments can give a non-zero starting point and step size.
1328  static const vint8 Iota (int start=0, int step=1);
1329 
1330  /// Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
1331  static const vint8 Giota ();
1332 
1333  /// Assign one value to all components.
1334  const vint8 & operator= (int a);
1335 
1336  /// Assignment from another vint8
1337  const vint8 & operator= (const vint8& other) ;
1338 
1339  /// Component access (get)
1340  int operator[] (int i) const;
1341 
1342  /// Component access (set)
1343  int& operator[] (int i);
1344 
1345  /// Component access (set).
1346  void setcomp (int i, int value);
1347 
1348  value_t x () const;
1349  value_t y () const;
1350  value_t z () const;
1351  value_t w () const;
1352  void set_x (value_t val);
1353  void set_y (value_t val);
1354  void set_z (value_t val);
1355  void set_w (value_t val);
1356 
1357  /// Extract the lower precision vint4
1358  vint4 lo () const;
1359 
1360  /// Extract the higher precision vint4
1361  vint4 hi () const;
1362 
1363  /// Helper: load a single int into all components
1364  void load (int a);
1365 
1366  /// Load separate values into each component.
1367  void load (int a, int b, int c, int d, int e, int f, int g, int h);
1368 
1369  /// Load from an array of 8 values
1370  void load (const int *values);
1371 
1372  void load (const int *values, int n) ;
1373 
1374  /// Load from an array of 8 unsigned short values, convert to vint8
1375  void load (const unsigned short *values) ;
1376 
1377  /// Load from an array of 8 unsigned short values, convert to vint8
1378  void load (const short *values);
1379 
1380  /// Load from an array of 8 unsigned char values, convert to vint8
1381  void load (const unsigned char *values);
1382 
1383  /// Load from an array of 8 unsigned char values, convert to vint8
1384  void load (const char *values);
1385 
1386  /// Store the values into memory
1387  void store (int *values) const;
1388 
1389  /// Store the first n values into memory
1390  void store (int *values, int n) const;
1391 
1392  /// Store the least significant 16 bits of each element into adjacent
1393  /// unsigned shorts.
1394  void store (unsigned short *values) const;
1395 
1396  /// Store the least significant 8 bits of each element into adjacent
1397  /// unsigned chars.
1398  void store (unsigned char *values) const;
1399 
1400  /// Masked load -- read from values[] where mask is 1, load zero where
1401  /// mask is 0.
1402  void load_mask (int mask, const value_t *values);
1403  void load_mask (const vbool_t& mask, const value_t *values);
1404 
1405  /// Masked store -- write to values[] where mask is enabled, don't
1406  /// touch values[] where it's not.
1407  void store_mask (int mask, value_t *values) const;
1408  void store_mask (const vbool_t& mask, value_t *values) const;
1409 
1410  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1411  template<int scale=4>
1412  void gather (const value_t *baseptr, const vint_t& vindex);
1413  /// Gather elements defined by the mask, leave others unchanged.
1414  template<int scale=4>
1415  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1416  template<int scale=4>
1417  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1418 
1419  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1420  template<int scale=4>
1421  void scatter (value_t *baseptr, const vint_t& vindex) const;
1422  /// Scatter elements defined by the mask
1423  template<int scale=4>
1424  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1425  template<int scale=4>
1426  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1427 
1428  // Arithmetic operators (component-by-component)
1429  friend vint8 operator+ (const vint8& a, const vint8& b);
1430  friend vint8 operator- (const vint8& a);
1431  friend vint8 operator- (const vint8& a, const vint8& b);
1432  friend vint8 operator* (const vint8& a, const vint8& b);
1433  friend vint8 operator/ (const vint8& a, const vint8& b);
1434  friend vint8 operator% (const vint8& a, const vint8& b);
1435  friend const vint8 & operator+= (vint8& a, const vint8& b);
1436  friend const vint8 & operator-= (vint8& a, const vint8& b);
1437  friend const vint8 & operator*= (vint8& a, const vint8& b);
1438  friend const vint8 & operator/= (vint8& a, const vint8& b);
1439  friend const vint8 & operator%= (vint8& a, const vint8& b);
1440  // Bitwise operators (component-by-component)
1441  friend vint8 operator& (const vint8& a, const vint8& b);
1442  friend vint8 operator| (const vint8& a, const vint8& b);
1443  friend vint8 operator^ (const vint8& a, const vint8& b);
1444  friend const vint8& operator&= (vint8& a, const vint8& b);
1445  friend const vint8& operator|= (vint8& a, const vint8& b);
1446  friend const vint8& operator^= (vint8& a, const vint8& b);
1447  friend vint8 operator~ (const vint8& a);
1448  friend vint8 operator<< (const vint8& a, unsigned int bits);
1449  friend vint8 operator>> (const vint8& a, unsigned int bits);
1450  friend const vint8& operator<<= (vint8& a, unsigned int bits);
1451  friend const vint8& operator>>= (vint8& a, unsigned int bits);
1452  // Comparison operators (component-by-component)
1453  friend vbool8 operator== (const vint8& a, const vint8& b);
1454  friend vbool8 operator!= (const vint8& a, const vint8& b);
1455  friend vbool8 operator< (const vint8& a, const vint8& b);
1456  friend vbool8 operator> (const vint8& a, const vint8& b);
1457  friend vbool8 operator>= (const vint8& a, const vint8& b);
1458  friend vbool8 operator<= (const vint8& a, const vint8& b);
1459 
1460  /// Stream output
1461  friend std::ostream& operator<< (std::ostream& cout, const vint8& a);
1462 
1463 private:
1464  // The actual data representation
1465  union {
1469  };
1470 };
1471 
1472 
1473 
// Shift right logical -- unsigned shift. This differs from operator>>
// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
// srl((1<<31),1) == 1<<30.
vint8 srl (const vint8& val, const unsigned int bits);

/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2,5,5,6,6>(vint8(a,b,c,d,e,f,g,h))
/// returns (b,b,c,c,f,f,g,g).
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vint8 shuffle (const vint8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed (a compile-time constant).
template<int i> OIIO_FORCEINLINE int extract (const vint8& v);

/// Helper: substitute val for a[i], returning the modified vector.
template<int i> OIIO_FORCEINLINE vint8 insert (const vint8& a, int val);

/// The sum of all components, returned in all components.
vint8 vreduce_add (const vint8& v);

// Reduction across all components: sum, bitwise-and, bitwise-or.
int reduce_add (const vint8& v);
int reduce_and (const vint8& v);
int reduce_or (const vint8& v);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vint8 blend0 (const vint8& a, const vbool8& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vint8 blend0not (const vint8& a, const vbool8& mask);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vint8 select (const vbool8& mask, const vint8& a, const vint8& b);

// Per-element math
vint8 abs (const vint8& a);                  // per-element absolute value
vint8 min (const vint8& a, const vint8& b);  // per-element minimum
vint8 max (const vint8& a, const vint8& b);  // per-element maximum

/// Circular bit rotate by s bits, for N values at once.
vint8 rotl (const vint8& x, const int s);
// DEPRECATED(2.1) -- use rotl instead.
vint8 rotl32 (const vint8& x, const unsigned int k);

/// andnot(a,b) returns ((~a) & b)
vint8 andnot (const vint8& a, const vint8& b);

/// Bitcast back and forth to intN (not a convert -- move the bits!)
vint8 bitcast_to_int (const vbool8& x);
vint8 bitcast_to_int (const vfloat8& x);
vfloat8 bitcast_to_float (const vint8& x);

// safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
vint8 safe_mod (const vint8& a, const vint8& b);
vint8 safe_mod (const vint8& a, int b);
1542 
1543 
1544 
1545 
1546 
1547 /// Integer 16-vector, accelerated by SIMD instructions when available.
1548 class vint16 {
1549 public:
1550  static const char* type_name() { return "vint16"; }
1551  typedef int value_t; ///< Underlying equivalent scalar value type
1552  enum { elements = 16 }; ///< Number of scalar elements
1553  enum { paddedelements =16 }; ///< Number of scalar elements for full pad
1554  enum { bits = 128 }; ///< Total number of bits. NOTE(review): 16 elements x 32 bits suggests 512, not 128 -- confirm against upstream.
1555  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1556  typedef vbool16 vbool_t; ///< bool type of the same length
1557  typedef vfloat16 vfloat_t; ///< float type of the same length
1558  typedef vint16 vint_t; ///< int type of the same length
1559  OIIO_DEPRECATED("use vbool_t (1.8)")
1560  typedef vbool16 bool_t; // old name (deprecated 1.8)
1561  OIIO_DEPRECATED("use vfloat_t (1.8)")
1562  typedef vfloat16 float_t; // old name (deprecated 1.8)
1563  static constexpr size_t size() noexcept { return elements; }
1564 
1565  /// Default constructor (contents undefined)
1566  vint16 () { }
1567 
1568  /// Construct from a single value (store it in all slots)
1569  vint16 (int a);
1570 
1571  /// Construct from 16 individual values, one per element
1572  vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1573  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1574 
1575  /// Construct from a pointer to values
1576  vint16 (const int *vals);
1577 
1578  /// Construct from a pointer to unsigned short values
1579  explicit vint16 (const unsigned short *vals);
1580 
1581  /// Construct from a pointer to signed short values
1582  explicit vint16 (const short *vals);
1583 
1584  /// Construct from a pointer to unsigned char values (0 - 255)
1585  explicit vint16 (const unsigned char *vals);
1586 
1587  /// Construct from a pointer to signed char values (-128 - 127)
1588  explicit vint16 (const char *vals);
1589 
1590  /// Copy construct from another vint16
1591  vint16 (const vint16 & other) { m_simd = other.m_simd; }
1592 
1593  /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
1594  explicit vint16 (const vfloat16& f); // implementation below
1595 
1596  /// Construct from two vint8's
1597  vint16 (const vint8 &lo, const vint8 &hi);
1598 
1599  /// Construct from four vint4's
1600  vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d);
1601 
1602  /// Construct from the underlying SIMD type
1603  vint16 (const simd_t& m) : m_simd(m) { }
1604 
1605  /// Return the raw SIMD type
1606  operator simd_t () const { return m_simd; }
1607  simd_t simd () const { return m_simd; }
1608  simd_t& simd () { return m_simd; }
1609 
1610  /// Return a pointer to the underlying scalar type
1611  const value_t* data () const { return (const value_t*)this; }
1612  value_t* data () { return (value_t*)this; }
1613 
1614  /// Set all components to 0
1615  void clear () ;
1616 
1617  /// Return an vint16 with all components set to 0
1618  static const vint16 Zero ();
1619 
1620  /// Return an vint16 with all components set to 1
1621  static const vint16 One ();
1622 
1623  /// Return an vint16 with all components set to -1 (aka 0xffffffff)
1624  static const vint16 NegOne ();
1625 
1626  /// Return an vint16 with incremented components (e.g., 0,1,2,3).
1627  /// Optional arguments can give a non-zero starting point and step size.
1628  static const vint16 Iota (int start=0, int step=1);
1629 
1630  /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
1631  static const vint16 Giota ();
1632 
1633  /// Assign one value to all components.
1634  const vint16 & operator= (int a);
1635 
1636  /// Assignment from another vint16
1637  const vint16 & operator= (const vint16& other) ;
1638 
1639  /// Component access (get)
1640  int operator[] (int i) const;
1641 
1642  /// Component access (set)
1643  int& operator[] (int i);
1644 
1645  /// Component access (set).
1646  void setcomp (int i, int value);
1647 
1648  value_t x () const; ///< Get component 0
1649  value_t y () const; ///< Get component 1
1650  value_t z () const; ///< Get component 2
1651  value_t w () const; ///< Get component 3
1652  void set_x (value_t val); ///< Set component 0
1653  void set_y (value_t val); ///< Set component 1
1654  void set_z (value_t val); ///< Set component 2
1655  void set_w (value_t val); ///< Set component 3
1656 
1657  /// Extract the lower half (elements 0-7) as a vint8
1658  vint8 lo () const;
1659 
1660  /// Extract the upper half (elements 8-15) as a vint8
1661  vint8 hi () const;
1662 
1663  /// Helper: load a single int into all components
1664  void load (int a);
1665 
1666  /// Load separate values into each component.
1667  void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1668  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1669 
1670  /// Load from an array of 16 values
1671  void load (const int *values);
1672 
1673  /// Load only the first n values from the array
1674  void load (const int *values, int n) ;
1675 
1676  /// Load from an array of 16 unsigned short values, convert to vint16
1677  void load (const unsigned short *values) ;
1678 
1679  /// Load from an array of 16 short values, convert to vint16
1680  void load (const short *values);
1681 
1682  /// Load from an array of 16 unsigned char values, convert to vint16
1683  void load (const unsigned char *values);
1684 
1685  /// Load from an array of 16 char values, convert to vint16
1686  void load (const char *values);
1687 
1688  /// Store the values into memory
1689  void store (int *values) const;
1690 
1691  /// Store the first n values into memory
1692  void store (int *values, int n) const;
1693 
1694  /// Store the least significant 16 bits of each element into adjacent
1695  /// unsigned shorts.
1696  void store (unsigned short *values) const;
1697 
1698  /// Store the least significant 8 bits of each element into adjacent
1699  /// unsigned chars.
1700  void store (unsigned char *values) const;
1701 
1702  /// Masked load -- read from values[] where mask is 1, load zero where
1703  /// mask is 0.
1704  void load_mask (const vbool_t &mask, const value_t *values);
1705  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
1706 
1707  /// Masked store -- write to values[] where mask is enabled, don't
1708  /// touch values[] where it's not.
1709  void store_mask (const vbool_t &mask, value_t *values) const;
1710  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
1711 
1712  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1713  template<int scale=4>
1714  void gather (const value_t *baseptr, const vint_t& vindex);
1715  /// Gather elements defined by the mask, leave others unchanged.
1716  template<int scale=4>
1717  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1718  template<int scale=4>
1719  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
1720  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
1721  }
1722 
1723  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1724  template<int scale=4>
1725  void scatter (value_t *baseptr, const vint_t& vindex) const;
1726  /// Scatter elements defined by the mask
1727  template<int scale=4>
1728  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1729  template<int scale=4>
1730  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
1731  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
1732  }
1733 
1734  // Arithmetic operators (component-by-component)
1735  friend vint16 operator+ (const vint16& a, const vint16& b);
1736  friend vint16 operator- (const vint16& a);
1737  friend vint16 operator- (const vint16& a, const vint16& b);
1738  friend vint16 operator* (const vint16& a, const vint16& b);
1739  friend vint16 operator/ (const vint16& a, const vint16& b);
1740  friend vint16 operator% (const vint16& a, const vint16& b);
1741  friend const vint16 & operator+= (vint16& a, const vint16& b);
1742  friend const vint16 & operator-= (vint16& a, const vint16& b);
1743  friend const vint16 & operator*= (vint16& a, const vint16& b);
1744  friend const vint16 & operator/= (vint16& a, const vint16& b);
1745  friend const vint16 & operator%= (vint16& a, const vint16& b);
1746  // Bitwise operators (component-by-component)
1747  friend vint16 operator& (const vint16& a, const vint16& b);
1748  friend vint16 operator| (const vint16& a, const vint16& b);
1749  friend vint16 operator^ (const vint16& a, const vint16& b);
1750  friend const vint16& operator&= (vint16& a, const vint16& b);
1751  friend const vint16& operator|= (vint16& a, const vint16& b);
1752  friend const vint16& operator^= (vint16& a, const vint16& b);
1753  friend vint16 operator~ (const vint16& a);
1754  friend vint16 operator<< (const vint16& a, unsigned int bits);
1755  friend vint16 operator>> (const vint16& a, unsigned int bits);
1756  friend const vint16& operator<<= (vint16& a, unsigned int bits);
1757  friend const vint16& operator>>= (vint16& a, unsigned int bits);
1758  // Comparison operators (component-by-component)
1759  friend vbool16 operator== (const vint16& a, const vint16& b);
1760  friend vbool16 operator!= (const vint16& a, const vint16& b);
1761  friend vbool16 operator< (const vint16& a, const vint16& b);
1762  friend vbool16 operator> (const vint16& a, const vint16& b);
1763  friend vbool16 operator>= (const vint16& a, const vint16& b);
1764  friend vbool16 operator<= (const vint16& a, const vint16& b);
1765 
1766  /// Stream output
1767  friend std::ostream& operator<< (std::ostream& cout, const vint16& a);
1768 
1769 private:
1770  // The actual data representation
1771  union { // NOTE(review): union members (doxygen lines 1771-1773, presumably the native simd_t and a scalar lane array) are missing from this extraction -- confirm against upstream.
1774  };
1775 };
1776 
1777 
1778 
1779 /// Shift right logical -- unsigned shift. This differs from operator>>
1780 /// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1781 /// srl((1<<31),1) == 1<<30.
1782 vint16 srl (const vint16& val, const unsigned int bits);
1783 
1784 /// Shuffle groups of 4
1785 template<int i0, int i1, int i2, int i3>
1786 vint16 shuffle4 (const vint16& a);
1787 
1788 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
1789 template<int i> vint16 shuffle4 (const vint16& a);
1790 
1791 /// Shuffle within each group of 4
1792 template<int i0, int i1, int i2, int i3>
1793 vint16 shuffle (const vint16& a);
1794 
1795 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1796 template<int i> vint16 shuffle (const vint16& a);
1797 
1798 /// Helper: as rapid as possible extraction of one component, when the
1799 /// index is fixed.
1800 template<int i> OIIO_FORCEINLINE int extract (const vint16& v);
1801 
1802 /// Helper: substitute val for a[i]
1803 template<int i> OIIO_FORCEINLINE vint16 insert (const vint16& a, int val);
1804 
1805 /// The sum of all components, returned in all components.
1806 vint16 vreduce_add (const vint16& v);
1807 
1808 // Reduction across all components
1809 int reduce_add (const vint16& v); ///< Sum of all components, as a scalar
1810 int reduce_and (const vint16& v); ///< Bitwise AND of all components
1811 int reduce_or (const vint16& v); ///< Bitwise OR of all components
1812 
1813 /// Use a bool mask to select between components of a (if mask[i] is false)
1814 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1815 vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);
1816 
1817 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1818 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1819 /// blend(0,a,mask).
1820 vint16 blend0 (const vint16& a, const vbool16& mask);
1821 
1822 /// Use a bool mask to select between components of a (if mask[i] is false)
1823 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1824 /// blend(0,a,!mask), or blend(a,0,mask).
1825 vint16 blend0not (const vint16& a, const vbool16& mask);
1826 
1827 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1828 /// synonym for blend with arguments rearranged, but this is more clear
1829 /// because the arguments are symmetric to scalar (cond ? a : b).
1830 vint16 select (const vbool16& mask, const vint16& a, const vint16& b);
1831 
1832 // Per-element math
1833 vint16 abs (const vint16& a); ///< Per-element absolute value
1834 vint16 min (const vint16& a, const vint16& b); ///< Per-element minimum
1835 vint16 max (const vint16& a, const vint16& b); ///< Per-element maximum
1836 
1837 /// Circular bit rotate by s bits, for N values at once.
1838 vint16 rotl (const vint16& x, const int s);
1839 // DEPRECATED(2.1) -- use rotl instead
1840 vint16 rotl32 (const vint16& x, const unsigned int k);
1841 
1842 /// andnot(a,b) returns ((~a) & b)
1843 vint16 andnot (const vint16& a, const vint16& b);
1844 
1845 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1846 vint16 bitcast_to_int (const vbool16& x);
1847 vint16 bitcast_to_int (const vfloat16& x);
1848 vfloat16 bitcast_to_float (const vint16& x);
1849 
1850 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1851 vint16 safe_mod (const vint16& a, const vint16& b);
1852 vint16 safe_mod (const vint16& a, int b);
1853 
1854 
1855 
1856 
1857 
1858 /// Floating point 4-vector, accelerated by SIMD instructions when
1859 /// available.
1860 class vfloat4 {
1861 public:
1862  static const char* type_name() { return "vfloat4"; }
1863  typedef float value_t; ///< Underlying equivalent scalar value type
1864  enum { elements = 4 }; ///< Number of scalar elements
1865  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
1866  enum { bits = elements*32 }; ///< Total number of bits
1867  typedef simd_raw_t<float,4>::type simd_t; ///< the native SIMD type used
1868  typedef vfloat4 vfloat_t; ///< SIMD float type
1869  typedef vint4 vint_t; ///< SIMD int type
1870  typedef vbool4 vbool_t; ///< SIMD bool type
1871  OIIO_DEPRECATED("use vbool_t (1.8)") // NOTE(review): message says vbool_t but the typedef below is int_t; probably should read "use vint_t (1.8)" -- confirm upstream
1872  typedef vint4 int_t; // old name (deprecated 1.8)
1873  OIIO_DEPRECATED("use vfloat_t (1.8)") // NOTE(review): message says vfloat_t but the typedef below is bool_t; probably should read "use vbool_t (1.8)" -- confirm upstream
1874  typedef vbool4 bool_t; // old name (deprecated 1.8)
1875  static constexpr size_t size() noexcept { return elements; }
1876 
1877  /// Default constructor (contents undefined)
1878  vfloat4 () { }
1879 
1880  /// Construct from a single value (store it in all slots)
1881  vfloat4 (float a) { load(a); }
1882 
1883  /// Construct from 3 or 4 values
1884  vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }
1885 
1886  /// Construct from a pointer to 4 values
1887  vfloat4 (const float *f) { load (f); }
1888 
1889  /// Copy construct from another vfloat4
1890  vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
1891 
1892  /// Construct from an vint4 (promoting all components to float)
1893  explicit vfloat4 (const vint4& ival);
1894 
1895  /// Construct from the underlying SIMD type
1896  vfloat4 (const simd_t& m) : m_simd(m) { }
1897 
1898  /// Construct from a generic subscripted 3-vector, including Imath::V3f.
1899  // NOTE(review): the template<...> header for this constructor (doxygen lines 1899-1900) is missing from this extraction -- confirm against upstream.
1901  explicit vfloat4 (const V& v) { load (v[0], v[1], v[2]); }
1902 
1903  /// Construct from a generic subscripted or xyzw 4-vector, including
1904  /// Imath::V4f.
1905  // NOTE(review): the template<...> header for this constructor (doxygen lines 1905-1907) is missing from this extraction -- confirm against upstream.
1908  explicit vfloat4(const V& v) { load ((const value_t *)&v); }
1909 
1910  /// Return the raw SIMD type
1911  operator simd_t () const { return m_simd; }
1912  simd_t simd () const { return m_simd; }
1913  simd_t& simd () { return m_simd; }
1914 
1915  /// Return a pointer to the underlying scalar type
1916  const value_t* data () const { return (const value_t*)this; }
1917  value_t* data () { return (value_t*)this; }
1918 
1919 #ifdef INCLUDED_IMATHVEC_H
1920  /// Cast to a Imath::V3f
1921  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
1922 
1923  /// Cast to a Imath::V4f
1924  const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }
1925 #endif
1926 
1927  /// Construct from a pointer to 4 unsigned short values
1928  explicit vfloat4 (const unsigned short *vals) { load(vals); }
1929 
1930  /// Construct from a pointer to 4 short values
1931  explicit vfloat4 (const short *vals) { load(vals); }
1932 
1933  /// Construct from a pointer to 4 unsigned char values
1934  explicit vfloat4 (const unsigned char *vals) { load(vals); }
1935 
1936  /// Construct from a pointer to 4 char values
1937  explicit vfloat4 (const char *vals) { load(vals); }
1938 
1939 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1940  /// Construct from a pointer to 4 half (16 bit float) values
1941  explicit vfloat4 (const half *vals) { load(vals); }
1942 #endif
1943 
1944  /// Assign a single value to all components
1945  const vfloat4 & operator= (float a) { load(a); return *this; }
1946 
1947  /// Assign a vfloat4
1948  const vfloat4 & operator= (vfloat4 other) {
1949  m_simd = other.m_simd;
1950  return *this;
1951  }
1952 
1953  /// Return a vfloat4 with all components set to 0.0
1954  static const vfloat4 Zero ();
1955 
1956  /// Return a vfloat4 with all components set to 1.0
1957  static const vfloat4 One ();
1958 
1959  /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
1960  /// Optional argument can give a non-zero starting point and non-1 step.
1961  static const vfloat4 Iota (float start=0.0f, float step=1.0f);
1962 
1963  /// Set all components to 0.0
1964  void clear ();
1965 
1966  /// Assign from a generic subscripted or xyzw 4-vector, including an
1967  /// Imath::V4f.
1968  // NOTE(review): the template<...> header for this operator (doxygen lines 1968-1970) is missing from this extraction -- confirm against upstream.
1971  const vfloat4 & operator= (const V& v) {
1972  load ((const float *)&v);
1973  return *this;
1974  }
1975 
1976  /// Assign from a generic subscripted 3-vector, including an
1977  /// Imath::V3f.
1978  // NOTE(review): the template<...> header for this operator (doxygen lines 1978-1979) is missing from this extraction -- confirm against upstream.
1980  const vfloat4 & operator= (const V& v) {
1981  load (v[0], v[1], v[2], 0.0f);
1982  return *this;
1983  }
1984 
1985  /// Component access (get)
1986  float operator[] (int i) const;
1987  /// Component access (set)
1988  float& operator[] (int i);
1989 
1990  /// Component access (set).
1991  void setcomp (int i, float value);
1992 
1993  value_t x () const; ///< Get component 0
1994  value_t y () const; ///< Get component 1
1995  value_t z () const; ///< Get component 2
1996  value_t w () const; ///< Get component 3
1997  void set_x (value_t val); ///< Set component 0
1998  void set_y (value_t val); ///< Set component 1
1999  void set_z (value_t val); ///< Set component 2
2000  void set_w (value_t val); ///< Set component 3
2001 
2002  /// Helper: load a single value into all components
2003  void load (float val);
2004 
2005  /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
2006  void load (float a, float b, float c, float d=0.0f);
2007 
2008  /// Load from an array of 4 values
2009  void load (const float *values);
2010 
2011  /// Load from a partial array of <=4 values. Unassigned values are
2012  /// undefined.
2013  void load (const float *values, int n);
2014 
2015  /// Load from an array of 4 unsigned short values, convert to float
2016  void load (const unsigned short *values);
2017 
2018  /// Load from an array of 4 short values, convert to float
2019  void load (const short *values);
2020 
2021  /// Load from an array of 4 unsigned char values, convert to float
2022  void load (const unsigned char *values);
2023 
2024  /// Load from an array of 4 char values, convert to float
2025  void load (const char *values);
2026 
2027 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2028  /// Load from an array of 4 half values, convert to float
2029  void load (const half *values);
2030 #endif /* _HALF_H_ or IMATH_HALF_H_ */
2031 
2032  /// Load the first 2 elements from lo[0..1] and the second two elements
2033  /// from hi[0..1].
2034  void load_pairs(const float* lo, const float* hi);
2035 
2036  /// Store the 4 values into memory
2037  void store (float *values) const;
2038 
2039  /// Store the first n values into memory
2040  void store (float *values, int n) const;
2041 
2042 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2043  /// Store as 4 half values (converting from float)
2044  void store (half *values) const;
2045 #endif
2046 
2047  /// Masked load -- read from values[] where mask is 1, load zero where
2048  /// mask is 0.
2049  void load_mask (int mask, const value_t *values);
2050  void load_mask (const vbool_t& mask, const value_t *values);
2051 
2052  /// Masked store -- write to values[] where mask is enabled, don't
2053  /// touch values[] where it's not.
2054  void store_mask (int mask, value_t *values) const;
2055  void store_mask (const vbool_t& mask, value_t *values) const;
2056 
2057  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2058  template<int scale=4>
2059  void gather (const value_t *baseptr, const vint_t& vindex);
2060  /// Gather elements defined by the mask, leave others unchanged.
2061  template<int scale=4>
2062  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
2063  template<int scale=4>
2064  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
2065 
2066  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2067  template<int scale=4>
2068  void scatter (value_t *baseptr, const vint_t& vindex) const;
2069  /// Scatter elements defined by the mask
2070  template<int scale=4>
2071  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2072  template<int scale=4>
2073  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
2074 
2075  // Arithmetic operators (component-by-component)
2076  friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
2077  const vfloat4 & operator+= (const vfloat4& a);
2078  vfloat4 operator- () const;
2079  friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
2080  const vfloat4 & operator-= (const vfloat4& a);
2081  friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
2082  friend vfloat4 operator* (const vfloat4& a, float b);
2083  friend vfloat4 operator* (float a, const vfloat4& b);
2084  const vfloat4 & operator*= (const vfloat4& a);
2085  const vfloat4 & operator*= (float val);
2086  friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
2087  const vfloat4 & operator/= (const vfloat4& a);
2088  const vfloat4 & operator/= (float val);
2089 
2090  // Comparison operations (component-by-component)
2091  friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
2092  friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
2093  friend vbool4 operator< (const vfloat4& a, const vfloat4& b);
2094  friend vbool4 operator> (const vfloat4& a, const vfloat4& b);
2095  friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
2096  friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);
2097 
2098  // Some oddball items that are handy
2099 
2100  /// Combine the first two components of A with the first two components
2101  /// of B.
2102  friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);
2103 
2104  /// Combine the first two components of A with the first two components
2105  /// of B, but interleaved.
2106  friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);
2107 
2108  /// Return xyz components, plus 0 for w
2109  vfloat4 xyz0 () const;
2110 
2111  /// Return xyz components, plus 1 for w
2112  vfloat4 xyz1 () const;
2113 
2114  /// Stream output
2115  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);
2116 
2117 protected:
2118  // The actual data representation
2119  union { // NOTE(review): union members (doxygen lines 2118-2119, presumably the native simd_t and a scalar float array) are missing from this extraction -- confirm against upstream.
2120  };
2121 };
2122 
2123 
2124 /// Helper: shuffle/swizzle with constant (templated) indices.
2125 /// Example: shuffle<1,1,2,2>(vfloat4(a,b,c,d)) returns (b,b,c,c)
2126 template<int i0, int i1, int i2, int i3>
2127 OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2128 
2129 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2130 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2131 
2132 /// Return { a[i0], a[i1], b[i2], b[i3] }, where i0..i3 are the extracted
2133 /// 2-bit indices packed into the template parameter i (going from the low
2134 /// 2-bit pair to the high 2-bit pair).
2135 template<int i> OIIO_FORCEINLINE vfloat4
2136 shuffle(const vfloat4& a, const vfloat4& b);
2137 
2138 /// Helper: as rapid as possible extraction of one component, when the
2139 /// index is fixed.
2140 template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
2141 
2142 /// Helper: substitute val for a[i]
2143 template<int i> OIIO_FORCEINLINE vfloat4 insert (const vfloat4& a, float val);
2144 
2145 /// The sum of all components, returned in all components.
2146 vfloat4 vreduce_add (const vfloat4& v);
2147 
2148 /// The sum of all components, returned as a scalar.
2149 float reduce_add (const vfloat4& v);
2150 
2151 /// Return the float dot (inner) product of a and b in every component.
2152 vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);
2153 
2154 /// Return the float dot (inner) product of a and b.
2155 float dot (const vfloat4 &a, const vfloat4 &b);
2156 
2157 /// Return the float 3-component dot (inner) product of a and b in
2158 /// all components.
2159 vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);
2160 
2161 /// Return the float 3-component dot (inner) product of a and b.
2162 float dot3 (const vfloat4 &a, const vfloat4 &b);
2163 
2164 /// Use a bool mask to select between components of a (if mask[i] is false)
2165 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2166 vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);
2167 
2168 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2169 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2170 /// blend(0,a,mask).
2171 vfloat4 blend0 (const vfloat4& a, const vbool4& mask);
2172 
2173 /// Use a bool mask to select between components of a (if mask[i] is false)
2174 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2175 /// blend(0,a,!mask), or blend(a,0,mask).
2176 vfloat4 blend0not (const vfloat4& a, const vbool4& mask);
2177 
2178 /// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
2179 /// that is 0, return 0 rather than Inf.
2180 vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);
2181 
2182 /// Homogeneous divide to turn a vfloat4 into a vfloat3.
2183 vfloat3 hdiv (const vfloat4 &a);
2184 
2185 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2186 /// synonym for blend with arguments rearranged, but this is more clear
2187 /// because the arguments are symmetric to scalar (cond ? a : b).
2188 vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);
2189 
2190 // Per-element math
2191 vfloat4 abs (const vfloat4& a); ///< absolute value (float)
2192 vfloat4 sign (const vfloat4& a); ///< 1.0 when value >= 0, -1 when negative
2193 vfloat4 ceil (const vfloat4& a); ///< Per-element round up to integer
2194 vfloat4 floor (const vfloat4& a); ///< Per-element round down to integer
2195 vint4 ifloor (const vfloat4& a); ///< (int)floor
2196 OIIO_DEPRECATED("use ifloor (1.8)")
2197 inline vint4 floori (const vfloat4& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2198 
2199 /// Per-element round to nearest integer.
2200 /// CAVEAT: the rounding when mid-way between integers may differ depending
2201 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
2202 /// integer) but std::round() says to round away from 0 regardless of
2203 /// current rounding mode (but that is multiple instructions on x64).
2204 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2205 /// match std::round().
2206 vfloat4 round (const vfloat4& a);
2207 
2208 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2209 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2210 /// C++ std::rint() which says to use the current rounding mode.
2211 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2212 /// match std::rint().
2213 vint4 rint (const vfloat4& a);
2214 
2215 vfloat4 rcp_fast (const vfloat4 &a); ///< Fast, approximate 1/a
2216 vfloat4 sqrt (const vfloat4 &a); ///< Per-element square root
2217 vfloat4 rsqrt (const vfloat4 &a); ///< Fully accurate 1/sqrt
2218 vfloat4 rsqrt_fast (const vfloat4 &a); ///< Fast, approximate 1/sqrt
2219 vfloat4 min (const vfloat4& a, const vfloat4& b); ///< Per-element min
2220 vfloat4 max (const vfloat4& a, const vfloat4& b); ///< Per-element max
2221 template <typename T> OIIO_FORCEINLINE T exp (const T& v); // template for all SIMD variants
2222 template <typename T> OIIO_FORCEINLINE T log (const T& v);
2223 
2224 /// andnot(a,b) returns ((~a) & b)
2225 vfloat4 andnot (const vfloat4& a, const vfloat4& b);
2226 
2227 // Fused multiply and add (or subtract):
2228 vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
2229 vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
2230 vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
2231 vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c
2232 
2233 /// Transpose the rows and columns of the 4x4 matrix [a b c d].
2234 /// In the end, a will have the original (a[0], b[0], c[0], d[0]),
2235 /// b will have the original (a[1], b[1], c[1], d[1]), and so on.
2236 void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
2237 void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
2238  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2239 
2240 /// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
2241 vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
2242  const vfloat4& c, const vfloat4& d);
2243 
2244 
2245 
2246 /// Floating point 3-vector, aligned to be internally identical to a vfloat4.
2247 /// The way it differs from vfloat4 is that all of the load functions only
2248 /// load three values, and all the stores only store 3 values. The vast
2249 /// majority of ops just fall back to the vfloat4 version, and so will
2250 /// operate on the 4th component, but we won't care about those results.
2251 class vfloat3 : public vfloat4 {
2252 public:
2253  static const char* type_name() { return "vfloat3"; }
2254  enum { elements = 3 }; ///< Number of scalar elements
2255  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
2256  static constexpr size_t size() noexcept { return elements; }
2257 
2258  /// Default constructor (contents undefined)
2259  vfloat3 () { }
2260 
2261  /// Construct from a single value (store it in all slots)
2262  vfloat3 (float a) { load(a); }
2263 
2264  /// Construct from 3 values
2265  vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }
2266 
2267  /// Construct from a pointer to 3 values
2268  vfloat3 (const float *f) { load (f); }
2269 
2270  /// Construct from something that looks like a generic 3-vector class,
2271  /// having .x, .y, .z float elements and nothing more. This should be able
2272  /// to capture from an Imath::V3f or an OIIO::V3fParam.
2273  // NOTE(review): the template<...> header for this constructor (doxygen line 2273) is missing from this extraction -- confirm against upstream.
2274  vfloat3(const V& v) : vfloat3(v.x, v.y, v.z) { }
2275 
2276  /// Construct from something that looks like a generic 3-vector class,
2277  /// having an operator[] that returns a float and is the size of 3 floats.
2278  // NOTE(review): the template<...> header for this constructor (doxygen lines 2278-2279) is missing from this extraction -- confirm against upstream.
2280  vfloat3(const V& v) : vfloat3(v[0], v[1], v[2]) { }
2281 
2282  /// Copy construct from another vfloat3
2283  vfloat3 (const vfloat3 &other);
2284 
2285  /// Construct from a vfloat4. Note: it will not zero out the internal
2286  /// 4th component, but rather accept on faith that the vfloat4 you are
2287  /// giving it is a valid vfloat3. Be careful!
2288  explicit vfloat3 (const vfloat4 &other);
2289 
2290 #if OIIO_SIMD
2291  /// Construct from the underlying SIMD type. Note: it will not zero out
2292  /// the internal 4th component, but rather accept on faith that the
2293  /// vfloat4 you are giving it is a valid vfloat3. Be careful!
2294  explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
2295 #endif
2296 
2297 #ifdef INCLUDED_IMATHVEC_H
2298  /// Cast to a Imath::V3f
2299  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
2300 #endif
2301 
2302  /// Construct from a pointer to 3 unsigned short values
2303  explicit vfloat3 (const unsigned short *vals) { load(vals); }
2304 
2305  /// Construct from a pointer to 3 short values
2306  explicit vfloat3 (const short *vals) { load(vals); }
2307 
2308  /// Construct from a pointer to 3 unsigned char values
2309  explicit vfloat3 (const unsigned char *vals) { load(vals); }
2310 
2311  /// Construct from a pointer to 3 char values
2312  explicit vfloat3 (const char *vals) { load(vals); }
2313 
2314 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2315  /// Construct from a pointer to 3 half (16 bit float) values
2316  explicit vfloat3 (const half *vals) { load(vals); }
2317 #endif
2318 
2319  /// Assign a single value to all components
2320  const vfloat3 & operator= (float a) { load(a); return *this; }
2321 
2322  /// Return a vfloat3 with all components set to 0.0
2323  static const vfloat3 Zero ();
2324 
2325  /// Return a vfloat3 with all components set to 1.0
2326  static const vfloat3 One ();
2327 
2328  /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
2329  /// Optional argument can give a non-zero starting point and non-1 step.
2330  static const vfloat3 Iota (float start=0.0f, float step=1.0f);
2331 
2332  /// Helper: load a single value into all components
2333  void load (float val);
2334 
2335  /// Load from an array of 3 values
2336  void load (const float *values);
2337 
2338  /// Load from a partial array of <=3 values
2339  void load (const float *values, int n);
2340 
2341  /// Load from an array of 3 unsigned short values, convert to float
2342  void load (const unsigned short *values);
2343 
2344  /// Load from an array of 3 short values, convert to float
2345  void load (const short *values);
2346 
2347  /// Load from an array of 3 unsigned char values, convert to float
2348  void load (const unsigned char *values);
2349 
2350  /// Load from an array of 3 char values, convert to float
2351  void load (const char *values);
2352 
2353 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2354  /// Load from an array of 3 half values, convert to float
2355  void load (const half *values);
2356 #endif /* _HALF_H_ or IMATH_HALF_H_ */
2357 
2358  /// Store the 3 values into memory
2359  void store (float *values) const;
2360 
2361  /// Store the first n values into memory
2362  void store (float *values, int n) const;
2363 
2364 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2365  /// Store as 3 half values (converting from float)
2366  void store (half *values) const;
2367 #endif
2368 
2369  /// Store into a generic subscripted or xyz 3-vector, including Imath::V3f.
2370  // NOTE(review): the template<...> header for this method (doxygen lines 2367-2369) is missing from this extraction -- confirm against upstream.
2370  void store(V& vec) const {
2371  store((value_t *)&vec);
2372  }
2373 
2374  // Math operators -- define in terms of vfloat3.
2375  friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
2376  const vfloat3 & operator+= (const vfloat3& a);
2377  vfloat3 operator- () const;
2378  friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
2379  const vfloat3 & operator-= (const vfloat3& a);
2380  friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
2381  friend vfloat3 operator* (const vfloat3& a, float b);
2382  friend vfloat3 operator* (float a, const vfloat3& b);
2383  const vfloat3 & operator*= (const vfloat3& a);
2384  const vfloat3 & operator*= (float a);
2385  friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
2386  const vfloat3 & operator/= (const vfloat3& a);
2387  const vfloat3 & operator/= (float a);
2388 
2389  /// Square of the length of the vector
2390  float length2() const;
2391  /// Length of the vector
2392  float length() const;
2393 
2394  /// Return a normalized version of the vector.
2395  vfloat3 normalized () const;
2396  /// Return a fast, approximate normalized version of the vector.
2397  vfloat3 normalized_fast () const;
2398  /// Normalize in place.
2399  void normalize() { *this = normalized(); }
2400 
2401  /// Stream output
2402  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
2403 };
2404 
2405 
2406 
// Per-element math on float3 (each operation applies to all 3 components)
vfloat3 abs (const vfloat3& a);    ///< Per-element absolute value
vfloat3 sign (const vfloat3& a);   ///< Per-element: 1.0 when value >= 0, -1 when negative
vfloat3 ceil (const vfloat3& a);   ///< Per-element ceiling
vfloat3 floor (const vfloat3& a);  ///< Per-element floor
vfloat3 round (const vfloat3& a);  ///< Per-element round to nearest integer
2413 
2414 
2415 
2416 /// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
2417 /// not in registers) isomorphic to Imath::M44f.
2418 class matrix44 {
2419 public:
2420  static const char* type_name() { return "matrix44"; }
2421  typedef float value_t; ///< Underlying equivalent scalar value type
2422  enum { rows = 4, cols = 4 };
2423  static constexpr int elements = 16;
2424 
2425  // Uninitialized
2427 
2428  /// Copy constructor
2430  m_row[0] = M[0];
2431  m_row[1] = M[1];
2432  m_row[2] = M[2];
2433  m_row[3] = M[3];
2434  }
2435 
2436  /// Construct from a float array
2437  OIIO_FORCEINLINE explicit matrix44 (const float *f) {
2438  m_row[0].load (f+0);
2439  m_row[1].load (f+4);
2440  m_row[2].load (f+8);
2441  m_row[3].load (f+12);
2442  }
2443 
2444  /// Construct from an OIIO::M44fParam (including an Imath::M44f)
2445  OIIO_FORCEINLINE matrix44(M44fParam M) : matrix44(M.data()) { }
2446 
2447  /// Construct from 4 vfloat4 rows
2448  OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
2449  const vfloat4& c, const vfloat4& d) {
2450  m_row[0] = a;
2451  m_row[1] = b;
2452  m_row[2] = c;
2453  m_row[3] = d;
2454  }
2455  /// Construct from 4 float[4] rows
2456  OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
2457  const float *c, const float *d) {
2458  m_row[0].load(a);
2459  m_row[1].load(b);
2460  m_row[2].load(c);
2461  m_row[3].load(d);
2462  }
2463 
2464  /// Construct from 16 floats
2465  OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
2466  float f10, float f11, float f12, float f13,
2467  float f20, float f21, float f22, float f23,
2468  float f30, float f31, float f32, float f33)
2469  {
2470  m_row[0].load (f00, f01, f02, f03);
2471  m_row[1].load (f10, f11, f12, f13);
2472  m_row[2].load (f20, f21, f22, f23);
2473  m_row[3].load (f30, f31, f32, f33);
2474  }
2475 
2476 #ifdef INCLUDED_IMATHMATRIX_H
2477  /// Present as an Imath::M44f
2478  const Imath::M44f& M44f() const;
2479 #endif
2480 
2481  /// Return one row
2482  const vfloat4& operator[] (int i) const;
2483 
2484  /// Assignment
2485  const matrix44& operator= (const matrix44& m);
2486 
2487  /// Return the transposed matrix
2488  matrix44 transposed () const;
2489 
2490  /// Transform 3-point V by 4x4 matrix M.
2491  vfloat3 transformp (const vfloat3 &V) const;
2492 
2493  /// Transform 3-vector V by 4x4 matrix M.
2494  vfloat3 transformv (const vfloat3 &V) const;
2495 
2496  /// Transform 3-vector V by the transpose of 4x4 matrix M.
2497  vfloat3 transformvT (const vfloat3 &V) const;
2498 
2499  friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
2500  friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);
2501 
2502  bool operator== (const matrix44& m) const;
2503 
2504  bool operator!= (const matrix44& m) const;
2505 
2506  bool operator== (M44fParam m) const ;
2507  friend bool operator== (M44fParam a, const matrix44 &b);
2508  bool operator!= (M44fParam m) const;
2509  friend bool operator!= (M44fParam a, const matrix44 &b);
2510 
2511  /// Return the inverse of the matrix.
2512  matrix44 inverse() const;
2513 
2514  /// Stream output
2515  friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);
2516 
2517  const float* data() const { return m_vals[0]; }
2518 
2519 private:
2520  union {
2523  };
2524 };
2525 
/// Transform 3-point V by 4x4 matrix M.
vfloat3 transformp (const matrix44 &M, const vfloat3 &V);

/// Transform 3-vector V by 4x4 matrix M.
vfloat3 transformv (const matrix44 &M, const vfloat3 &V);

/// Transform 3-vector V by the transpose of 4x4 matrix M.
vfloat3 transformvT (const matrix44 &M, const vfloat3 &V);

/// Versions of the above transforms that take the matrix as an
/// M44fParam (e.g., an Imath::M44f) rather than a simd::matrix44.
vfloat3 transformp (M44fParam M, const vfloat3 &V);
vfloat3 transformv (M44fParam M, const vfloat3 &V);
vfloat3 transformvT (M44fParam M, const vfloat3 &V);
2538 
2539 
2540 
2541 /// Floating point 8-vector, accelerated by SIMD instructions when
2542 /// available.
2543 class vfloat8 {
2544 public:
2545  static const char* type_name() { return "vfloat8"; }
2546  typedef float value_t; ///< Underlying equivalent scalar value type
2547  enum { elements = 8 }; ///< Number of scalar elements
2548  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
2549  enum { bits = elements*32 }; ///< Total number of bits
2550  typedef simd_raw_t<float,8>::type simd_t; ///< the native SIMD type used
2551  typedef vfloat8 vfloat_t; ///< SIMD int type
2552  typedef vint8 vint_t; ///< SIMD int type
2553  typedef vbool8 vbool_t; ///< SIMD bool type
2554  OIIO_DEPRECATED("use vint_t (1.8)")
2555  typedef vint8 int_t; // old name (deprecated 1.8)
2556  OIIO_DEPRECATED("use vbool_t (1.8)")
2557  typedef vbool8 bool_t; // old name (deprecated 1.8)
2558  static constexpr size_t size() noexcept { return elements; }
2559 
2560  /// Default constructor (contents undefined)
2561  vfloat8 () { }
2562 
2563  /// Construct from a single value (store it in all slots)
2564  vfloat8 (float a) { load(a); }
2565 
2566  /// Construct from 8 values
2567  vfloat8 (float a, float b, float c, float d,
2568  float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); }
2569 
2570  /// Construct from a pointer to 8 values
2571  vfloat8 (const float *f) { load (f); }
2572 
2573  /// Copy construct from another vfloat8
2574  vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }
2575 
2576  /// Construct from an int vector (promoting all components to float)
2577  explicit vfloat8 (const vint8& ival);
2578 
2579  /// Construct from two vfloat4's
2580  vfloat8 (const vfloat4 &lo, const vfloat4 &hi);
2581 
2582  /// Construct from the underlying SIMD type
2583  vfloat8 (const simd_t& m) : m_simd(m) { }
2584 
2585  /// Return the raw SIMD type
2586  operator simd_t () const { return m_simd; }
2587  simd_t simd () const { return m_simd; }
2588  simd_t& simd () { return m_simd; }
2589 
2590  /// Return a pointer to the underlying scalar type
2591  const value_t* data () const { return (const value_t*)this; }
2592  value_t* data () { return (value_t*)this; }
2593 
2594  /// Construct from a pointer to unsigned short values
2595  explicit vfloat8 (const unsigned short *vals) { load(vals); }
2596 
2597  /// Construct from a pointer to short values
2598  explicit vfloat8 (const short *vals) { load(vals); }
2599 
2600  /// Construct from a pointer to unsigned char values
2601  explicit vfloat8 (const unsigned char *vals) { load(vals); }
2602 
2603  /// Construct from a pointer to char values
2604  explicit vfloat8 (const char *vals) { load(vals); }
2605 
2606 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2607  /// Construct from a pointer to half (16 bit float) values
2608  explicit vfloat8 (const half *vals) { load(vals); }
2609 #endif
2610 
2611  /// Assign a single value to all components
2612  const vfloat8& operator= (float a) { load(a); return *this; }
2613 
2614  /// Assign a vfloat8
2615  const vfloat8& operator= (vfloat8 other) {
2616  m_simd = other.m_simd;
2617  return *this;
2618  }
2619 
2620  /// Return a vfloat8 with all components set to 0.0
2621  static const vfloat8 Zero ();
2622 
2623  /// Return a vfloat8 with all components set to 1.0
2624  static const vfloat8 One ();
2625 
2626  /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...)
2627  /// Optional argument can give a non-zero starting point and non-1 step.
2628  static const vfloat8 Iota (float start=0.0f, float step=1.0f);
2629 
2630  /// Set all components to 0.0
2631  void clear ();
2632 
2633  /// Component access (get)
2634  float operator[] (int i) const;
2635  /// Component access (set)
2636  float& operator[] (int i);
2637 
2638  /// Component access (set).
2639  void setcomp (int i, float value);
2640 
2641  value_t x () const;
2642  value_t y () const;
2643  value_t z () const;
2644  value_t w () const;
2645  void set_x (value_t val);
2646  void set_y (value_t val);
2647  void set_z (value_t val);
2648  void set_w (value_t val);
2649 
2650  /// Extract the lower precision vfloat4
2651  vfloat4 lo () const;
2652 
2653  /// Extract the higher precision vfloat4
2654  vfloat4 hi () const;
2655 
2656  /// Helper: load a single value into all components
2657  void load (float val);
2658 
2659  /// Helper: load 8 values
2660  void load (float a, float b, float c, float d,
2661  float e, float f, float g, float h);
2662 
2663  /// Load from an array of values
2664  void load (const float *values);
2665 
2666  /// Load from a partial array of <=8 values. Unassigned values are
2667  /// undefined.
2668  void load (const float *values, int n);
2669 
2670  /// Load from an array of 8 unsigned short values, convert to float
2671  void load (const unsigned short *values);
2672 
2673  /// Load from an array of 8 short values, convert to float
2674  void load (const short *values);
2675 
2676  /// Load from an array of 8 unsigned char values, convert to float
2677  void load (const unsigned char *values);
2678 
2679  /// Load from an array of 8 char values, convert to float
2680  void load (const char *values);
2681 
2682 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2683  /// Load from an array of 8 half values, convert to float
2684  void load (const half *values);
2685 #endif /* _HALF_H_ or _IMATH_H_ */
2686 
2687  void store (float *values) const;
2688 
2689  /// Store the first n values into memory
2690  void store (float *values, int n) const;
2691 
2692 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2693  void store (half *values) const;
2694 #endif
2695 
2696  /// Masked load -- read from values[] where mask is 1, load zero where
2697  /// mask is 0.
2698  void load_mask (int mask, const value_t *values);
2699  void load_mask (const vbool_t& mask, const value_t *values);
2700 
2701  /// Masked store -- write to values[] where mask is enabled, don't
2702  /// touch values[] where it's not.
2703  void store_mask (int mask, value_t *values) const;
2704  void store_mask (const vbool_t& mask, value_t *values) const;
2705 
2706  /// Load values from addresses (char*)basepatr + vindex[i]*scale
2707  template<int scale=4>
2708  void gather (const value_t *baseptr, const vint_t& vindex);
2709  template<int scale=4>
2710  // Fastest way to fill with all 1 bits is to cmp any value to itself.
2711  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
2712  template<int scale=4>
2713  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
2714 
2715  /// Store values at addresses (char*)basepatr + vindex[i]*scale
2716  template<int scale=4>
2717  void scatter (value_t *baseptr, const vint_t& vindex) const;
2718  /// Scatter elements defined by the mask
2719  template<int scale=4>
2720  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2721  template<int scale=4>
2722  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
2723 
2724  // Arithmetic operators (component-by-component)
2725  friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
2726  friend vfloat8 operator- (const vfloat8& a);
2727  friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
2728  friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
2729  friend vfloat8 operator* (const vfloat8& a, float b);
2730  friend vfloat8 operator* (float a, const vfloat8& b);
2731  friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
2732  friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
2733  friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
2734  friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
2735  friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
2736  friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);
2737 
2738  // Comparison operations
2739  friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
2740  friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
2741  friend vbool8 operator< (const vfloat8& a, const vfloat8& b);
2742  friend vbool8 operator> (const vfloat8& a, const vfloat8& b);
2743  friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
2744  friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);
2745 
2746  // Some oddball items that are handy
2747 
2748  /// Stream output
2749  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);
2750 
2751 protected:
2752  // The actual data representation
2753  union {
2757  };
2758 };
2759 
2760 
/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2,5,5,6,6>(vfloat8(a,b,c,d,e,f,g,h)) returns
/// (b,b,c,c,f,f,g,g).
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> OIIO_FORCEINLINE float extract (const vfloat8& a);

/// Helper: substitute val for a[i]
template<int i> OIIO_FORCEINLINE vfloat8 insert (const vfloat8& a, float val);

/// The sum of all components, returned in all components.
vfloat8 vreduce_add (const vfloat8& v);

/// The sum of all components, returned as a scalar.
float reduce_add (const vfloat8& v);

/// Return the float dot (inner) product of a and b in every component.
vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);

/// Return the float dot (inner) product of a and b.
float dot (const vfloat8 &a, const vfloat8 &b);

/// Return the float 3-component dot (inner) product of a and b in
/// all components.
vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);

/// Return the float 3-component dot (inner) product of a and b.
float dot3 (const vfloat8 &a, const vfloat8 &b);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vfloat8 blend0 (const vfloat8& a, const vbool8& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vfloat8 blend0not (const vfloat8& a, const vbool8& mask);

/// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
/// that is 0, return 0 rather than Inf.
vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);

// Per-element math
vfloat8 abs (const vfloat8& a);   ///< absolute value (float)
vfloat8 sign (const vfloat8& a);  ///< 1.0 when value >= 0, -1 when negative
vfloat8 ceil (const vfloat8& a);  ///< Per-element ceiling
vfloat8 floor (const vfloat8& a); ///< Per-element floor
2823 vint8 ifloor (const vfloat8& a); ///< (int)floor
2824 inline vint8 floori (const vfloat8& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2825 
/// Per-element round to nearest integer.
/// CAVEAT: the rounding when mid-way between integers may differ depending
/// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
/// integer) but std::round() says to round away from 0 regardless of
/// current rounding mode (but that is multiple instructions on x64).
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::round().
vfloat8 round (const vfloat8& a);

/// Per-element round to nearest integer (equivalent to vint(round(a))).
/// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
/// C++ std::rint() which says to use the current rounding mode.
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::rint().
vint8 rint (const vfloat8& a);

vfloat8 rcp_fast (const vfloat8 &a);    ///< Fast, approximate 1/a
vfloat8 sqrt (const vfloat8 &a);        ///< Per-element square root
vfloat8 rsqrt (const vfloat8 &a);       ///< Fully accurate 1/sqrt
vfloat8 rsqrt_fast (const vfloat8 &a);  ///< Fast, approximate 1/sqrt
vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min
vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max
// vfloat8 exp (const vfloat8& v); // See template with vfloat4
// vfloat8 log (const vfloat8& v); // See template with vfloat4

/// andnot(a,b) returns ((~a) & b)
vfloat8 andnot (const vfloat8& a, const vfloat8& b);

// Fused multiply and add (or subtract):
vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b + c
vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b - c
vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c
vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c
2861 
2862 /// Floating point 16-vector, accelerated by SIMD instructions when
2863 /// available.
2864 class vfloat16 {
2865 public:
2866  static const char* type_name() { return "vfloat16"; }
2867  typedef float value_t; ///< Underlying equivalent scalar value type
2868  enum { elements = 16 }; ///< Number of scalar elements
2869  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
2870  enum { bits = elements*32 }; ///< Total number of bits
2871  typedef simd_raw_t<float,16>::type simd_t; ///< the native SIMD type used
2872  typedef vfloat16 vfloat_t; ///< SIMD int type
2873  typedef vint16 vint_t; ///< SIMD int type
2874  typedef vbool16 vbool_t; ///< SIMD bool type
2875  OIIO_DEPRECATED("use vint_t (1.8)")
2876  typedef vint16 int_t; // old name (deprecated 1.8)
2877  OIIO_DEPRECATED("use vbool_t (1.8)")
2878  typedef vbool16 bool_t; // old name (deprecated 1.8)
2879  static constexpr size_t size() noexcept { return elements; }
2880 
2881  /// Default constructor (contents undefined)
2882  vfloat16 () { }
2883 
2884  /// Construct from a single value (store it in all slots)
2885  vfloat16 (float a) { load(a); }
2886 
2887  /// Construct from 16 values
2888  vfloat16 (float v0, float v1, float v2, float v3,
2889  float v4, float v5, float v6, float v7,
2890  float v8, float v9, float v10, float v11,
2891  float v12, float v13, float v14, float v15);
2892 
2893  /// Construct from a pointer to 16 values
2894  vfloat16 (const float *f) { load (f); }
2895 
2896  /// Copy construct from another vfloat16
2897  vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }
2898 
2899  /// Construct from an int vector (promoting all components to float)
2900  explicit vfloat16 (const vint16& ival);
2901 
2902  /// Construct from two vfloat8's
2903  vfloat16 (const vfloat8 &lo, const vfloat8 &hi);
2904 
2905  /// Construct from four vfloat4's
2906  vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d);
2907 
2908  /// Construct from the underlying SIMD type
2909  vfloat16 (const simd_t& m) : m_simd(m) { }
2910 
2911  /// Return the raw SIMD type
2912  operator simd_t () const { return m_simd; }
2913  simd_t simd () const { return m_simd; }
2914  simd_t& simd () { return m_simd; }
2915 
2916  /// Return a pointer to the underlying scalar type
2917  const value_t* data () const { return (const value_t*)this; }
2918  value_t* data () { return (value_t*)this; }
2919 
2920  /// Construct from a pointer to unsigned short values
2921  explicit vfloat16 (const unsigned short *vals) { load(vals); }
2922 
2923  /// Construct from a pointer to short values
2924  explicit vfloat16 (const short *vals) { load(vals); }
2925 
2926  /// Construct from a pointer to unsigned char values
2927  explicit vfloat16 (const unsigned char *vals) { load(vals); }
2928 
2929  /// Construct from a pointer to char values
2930  explicit vfloat16 (const char *vals) { load(vals); }
2931 
2932 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2933  /// Construct from a pointer to half (16 bit float) values
2934  explicit vfloat16 (const half *vals) { load(vals); }
2935 #endif
2936 
2937  /// Assign a single value to all components
2938  const vfloat16& operator= (float a) { load(a); return *this; }
2939 
2940  /// Assign a vfloat16
2941  const vfloat16& operator= (vfloat16 other) {
2942  m_simd = other.m_simd;
2943  return *this;
2944  }
2945 
2946  /// Return a vfloat16 with all components set to 0.0
2947  static const vfloat16 Zero ();
2948 
2949  /// Return a vfloat16 with all components set to 1.0
2950  static const vfloat16 One ();
2951 
2952  /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...)
2953  /// Optional argument can give a non-zero starting point and non-1 step.
2954  static const vfloat16 Iota (float start=0.0f, float step=1.0f);
2955 
2956  /// Set all components to 0.0
2957  void clear ();
2958 
2959  /// Component access (get)
2960  float operator[] (int i) const;
2961  /// Component access (set)
2962  float& operator[] (int i);
2963 
2964  /// Component access (set).
2965  void setcomp (int i, float value);
2966 
2967  value_t x () const;
2968  value_t y () const;
2969  value_t z () const;
2970  value_t w () const;
2971  void set_x (value_t val);
2972  void set_y (value_t val);
2973  void set_z (value_t val);
2974  void set_w (value_t val);
2975 
2976  /// Extract the lower precision vfloat8
2977  vfloat8 lo () const;
2978 
2979  /// Extract the higher precision vfloat8
2980  vfloat8 hi () const;
2981 
2982  /// Helper: load a single value into all components
2983  void load (float val);
2984 
2985  /// Load separate values into each component.
2986  void load (float v0, float v1, float v2, float v3,
2987  float v4, float v5, float v6, float v7,
2988  float v8, float v9, float v10, float v11,
2989  float v12, float v13, float v14, float v15);
2990 
2991  /// Load from an array of values
2992  void load (const float *values);
2993 
2994  /// Load from a partial array of <=16 values. Unassigned values are
2995  /// undefined.
2996  void load (const float *values, int n);
2997 
2998  /// Load from an array of 16 unsigned short values, convert to float
2999  void load (const unsigned short *values);
3000 
3001  /// Load from an array of 16 short values, convert to float
3002  void load (const short *values);
3003 
3004  /// Load from an array of 16 unsigned char values, convert to float
3005  void load (const unsigned char *values);
3006 
3007  /// Load from an array of 16 char values, convert to float
3008  void load (const char *values);
3009 
3010 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
3011  /// Load from an array of 16 half values, convert to float
3012  void load (const half *values);
3013 #endif /* _HALF_H_ or _IMATH_H_ */
3014 
3015  void store (float *values) const;
3016 
3017  /// Store the first n values into memory
3018  void store (float *values, int n) const;
3019 
3020 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
3021  void store (half *values) const;
3022 #endif
3023 
3024  /// Masked load -- read from values[] where mask is 1, load zero where
3025  /// mask is 0.
3026  void load_mask (const vbool_t &mask, const value_t *values);
3027  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
3028 
3029  /// Masked store -- write to values[] where mask is enabled, don't
3030  /// touch values[] where it's not.
3031  void store_mask (const vbool_t &mask, value_t *values) const;
3032  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
3033 
3034  /// Load values from addresses (char*)basepatr + vindex[i]*scale
3035  template<int scale=4>
3036  void gather (const value_t *baseptr, const vint_t& vindex);
3037  /// Gather elements defined by the mask, leave others unchanged.
3038  template<int scale=4>
3039  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
3040  template<int scale=4>
3041  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
3042  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
3043  }
3044 
3045  /// Store values at addresses (char*)basepatr + vindex[i]*scale
3046  template<int scale=4>
3047  void scatter (value_t *baseptr, const vint_t& vindex) const;
3048  /// Scatter elements defined by the mask
3049  template<int scale=4>
3050  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
3051  template<int scale=4>
3052  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
3053  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
3054  }
3055 
3056  // Arithmetic operators (component-by-component)
3057  friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b);
3058  friend vfloat16 operator- (const vfloat16& a);
3059  friend vfloat16 operator- (const vfloat16& a, const vfloat16& b);
3060  friend vfloat16 operator* (const vfloat16& a, const vfloat16& b);
3061  friend vfloat16 operator* (const vfloat16& a, float b);
3062  friend vfloat16 operator* (float a, const vfloat16& b);
3063  friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b);
3064  friend vfloat16 operator% (const vfloat16& a, const vfloat16& b);
3065  friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b);
3066  friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b);
3067  friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b);
3068  friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b);
3069 
3070  // Comparison operations
3071  friend vbool16 operator== (const vfloat16& a, const vfloat16& b);
3072  friend vbool16 operator!= (const vfloat16& a, const vfloat16& b);
3073  friend vbool16 operator< (const vfloat16& a, const vfloat16& b);
3074  friend vbool16 operator> (const vfloat16& a, const vfloat16& b);
3075  friend vbool16 operator>= (const vfloat16& a, const vfloat16& b);
3076  friend vbool16 operator<= (const vfloat16& a, const vfloat16& b);
3077 
3078  // Some oddball items that are handy
3079 
3080  /// Stream output
3081  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val);
3082 
3083 protected:
3084  // The actual data representation
3085  union {
3089  };
3090 };
3091 
3092 
/// Shuffle groups of 4: rearrange the four 4-float groups of a vfloat16.
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);

/// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);

/// Shuffle within each group of 4
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vfloat16 shuffle (const vfloat16& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> OIIO_FORCEINLINE float extract (const vfloat16& a);

/// Helper: substitute val for a[i]
template<int i> OIIO_FORCEINLINE vfloat16 insert (const vfloat16& a, float val);

/// The sum of all components, returned in all components.
vfloat16 vreduce_add (const vfloat16& v);

/// The sum of all components, returned as a scalar.
float reduce_add (const vfloat16& v);
3119 
3120 /// Use a bool mask to select between components of a (if mask[i] is false)
3121 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
3122 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool4& mask);
3123 
3124 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
3125 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
3126 /// blend(0,a,mask).
3127 vfloat16 blend0 (const vfloat16& a, const vbool4& mask);
3128 
3129 /// Use a bool mask to select between components of a (if mask[i] is false)
3130 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
3131 /// blend(0,a,!mask), or blend(a,0,mask).
3132 vfloat16 blend0not (const vfloat16& a, const vbool4& mask);
3133 
3134 /// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor
3135 /// that is 0, return 0 rather than Inf.
3136 vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b);
3137 
3138 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
3139 /// synonym for blend with arguments rearranged, but this is more clear
3140 /// because the arguments are symmetric to scalar (cond ? a : b).
3141 vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);
3142 
3143 // Per-element math
3144 vfloat16 abs (const vfloat16& a); ///< absolute value (float)
3145 vfloat16 sign (const vfloat16& a); ///< 1.0 when value >= 0, -1 when negative
3146 vfloat16 ceil (const vfloat16& a);
3147 vfloat16 floor (const vfloat16& a);
3148 vint16 ifloor (const vfloat16& a); ///< (int)floor
3149 OIIO_DEPRECATED("use ifloor (1.8)")
3150 inline vint16 floori (const vfloat16& a) { return ifloor(a); } // DEPRECATED(1.8) alias
3151 
3152 /// Per-element round to nearest integer.
3153 /// CAVEAT: the rounding when mid-way between integers may differ depending
3154 /// on hardware. Intel SSE/AVX does "banker's founding" (to nearest even
3155 /// integer) but std::round() says to round away from 0 regardless of
3156 /// current rounding mode (but that is multiple instructions on x64).
3157 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3158 /// match std::round().
3159 vfloat16 round (const vfloat16& a);
3160 
3161 /// Per-element round to nearest integer (equivalent to vint(round(a))).
3162 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
3163 /// C++ std::rint() which says to use the current rounding mode.
3164 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3165 /// match std::rint().
3166 vint16 rint (const vfloat16& a);
3167 
3168 vfloat16 rcp_fast (const vfloat16 &a); ///< Fast, approximate 1/a
3169 vfloat16 sqrt (const vfloat16 &a);
3170 vfloat16 rsqrt (const vfloat16 &a); ///< Fully accurate 1/sqrt
3171 vfloat16 rsqrt_fast (const vfloat16 &a); ///< Fast, approximate 1/sqrt
3172 vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
3173 vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
3174 // vfloat16 exp (const vfloat16& v); // See template with vfloat4
3175 // vfloat16 log (const vfloat16& v); // See template with vfloat4
3176 
3177 /// andnot(a,b) returns ((~a) & b)
3178 vfloat16 andnot (const vfloat16& a, const vfloat16& b);
3179 
3180 // Fused multiply and add (or subtract):
3181 vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b + c
3182 vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b - c
3183 vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
3184 vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c
3185 
3186 
3187 
3188 // Odds and ends, other CPU hardware tricks
3189 
3190 // Try to set the flush_zero_mode CPU flag on x86. Return true if we are
3191 // able, otherwise false (because it's not available on that platform).
inline bool set_flush_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    // Set or clear the MXCSR flush-to-zero (FTZ) bit: when on, denormal
    // SIMD float *results* are flushed to zero.
    _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
    return true;
#endif
    // Non-x86 host (or CUDA device compile): the mode is unavailable.
    return false;
}
3199 
3200 // Try to set the denorms_zero_mode CPU flag on x86. Return true if we are
3201 // able, otherwise false (because it's not available on that platform).
inline bool set_denorms_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    // Set or clear the MXCSR denormals-are-zero (DAZ) bit: when on, denormal
    // SIMD float *inputs* are treated as zero.
    _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
    return true;
#endif
    // Non-x86 host (or CUDA device compile): the mode is unavailable.
    return false;
}
3209 
3210 // Get the flush_zero_mode CPU flag on x86.
inline bool get_flush_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    // Query the MXCSR flush-to-zero bit.
    return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
#endif
    // Non-x86: report "off" since the mode does not exist here.
    return false;
}
3217 
3218 // Get the denorms_zero_mode CPU flag on x86.
inline bool get_denorms_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    // Query the MXCSR denormals-are-zero bit.
    return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
#endif
    // Non-x86: report "off" since the mode does not exist here.
    return false;
}
3225 
3226 
3227 
3228 
3229 
3230 
3231 //////////////////////////////////////////////////////////////////////////
3232 //////////////////////////////////////////////////////////////////////////
3233 //
3234 // Gory implementation details follow.
3235 //
3236 // ^^^ All declarations and documentation is above ^^^
3237 //
3238 // vvv Below is the implementation, often considerably cluttered with
3239 // #if's for each architecture, and unapologetic use of intrinsics and
3240 // every manner of dirty trick we can think of to make things fast.
3241 // Some of this isn't pretty. We won't recapitulate comments or
3242 // documentation of what the functions are supposed to do, please
3243 // consult the declarations above for that.
3244 //
3245 // Here be dragons.
3246 //
3247 //////////////////////////////////////////////////////////////////////////
3248 //////////////////////////////////////////////////////////////////////////
3249 
3250 
3251 
3252 //////////////////////////////////////////////////////////////////////
3253 // vbool4 implementation
3254 
3255 
3257  OIIO_DASSERT(i >= 0 && i < elements);
3258 #if OIIO_SIMD_SSE
3259  return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3260 #else
3261  return m_val[i];
3262 #endif
3263 }
3264 
3266  OIIO_DASSERT(i >= 0 && i < elements);
3267  return m_val[i];
3268 }
3269 
3270 
3272  OIIO_DASSERT(i >= 0 && i < elements);
3273  m_val[i] = value ? -1 : 0;
3274 }
3275 
3276 
3277 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) {
3278  cout << a[0];
3279  for (int i = 1; i < a.elements; ++i)
3280  cout << ' ' << a[i];
3281  return cout;
3282 }
3283 
3284 
3286 #if OIIO_SIMD_SSE
3287  m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a)));
3288 #elif OIIO_SIMD_NEON
3289  m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3290 #else
3291  int val = -int(a);
3292  SIMD_CONSTRUCT (val);
3293 #endif
3294 }
3295 
3296 
// Load four bools into the four lanes; each true lane becomes all-1-bits
// (-1 as int), each false lane becomes 0.
OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
#if OIIO_SIMD_SSE
    // N.B. -- we need to reverse the order because of our convention
    // of storing a,b,c,d in the same order in memory.
    // (_mm_set_epi32 takes its arguments high-lane-first.)
    m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
#elif OIIO_SIMD_NEON
    int values[4] = { -int(a), -int(b), -int(c), -int(d) };
    m_simd = vld1q_u32((const uint32_t*)values);
    // this if we were using int:
    // m_simd = vld1q_s32(values);
#else
    // Scalar fallback: one int per lane.
    m_val[0] = -int(a);
    m_val[1] = -int(b);
    m_val[2] = -int(c);
    m_val[3] = -int(d);
#endif
}
3314 
3316  load (a[0], a[1], a[2], a[3]);
3317 }
3318 
3320  m_simd = other.m_simd;
3321  return *this;
3322 }
3323 
3324 
3326 #if OIIO_SIMD_SSE
3327  return _mm_movemask_ps(m_simd);
3328 #elif OIIO_SIMD_NEON && defined(__aarch64__)
3329  const int shifts[4] { 0, 1, 2, 3 };
3330  const int32x4_t shift = vld1q_s32(shifts);
3331  uint32x4_t t = vshrq_n_u32(m_simd, 31);
3332  return vaddvq_u32(vshlq_u32(t, shift));
3333 #else
3334  int r = 0;
3335  for (int i = 0; i < elements; ++i)
3336  if (m_val[i])
3337  r |= 1<<i;
3338  return r;
3339 #endif
3340 }
3341 
3342 
3344 vbool4::from_bitmask (int bitmask) {
3345  // I think this is a fast conversion from int bitmask to vbool4
3346  return (vint4::Giota() & vint4(bitmask)) != vint4::Zero();
3347 }
3348 
3349 
3351 #if OIIO_SIMD_SSE
3352  m_simd = _mm_setzero_ps();
3353 #else
3354  *this = false;
3355 #endif
3356 }
3357 
3358 
3360 #if OIIO_SIMD_SSE
3361  return _mm_setzero_ps();
3362 #else
3363  return false;
3364 #endif
3365 }
3366 
3368  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3369 #if OIIO_SIMD_SSE
3370 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3371  __m128i anyval = _mm_undefined_si128();
3372 # else
3373  __m128i anyval = _mm_setzero_si128();
3374 # endif
3375  return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3376 #else
3377  return true;
3378 #endif
3379 }
3380 
3382  SIMD_DO (values[i] = m_val[i] ? true : false);
3383 }
3384 
3385 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const {
3386  OIIO_DASSERT (n >= 0 && n <= elements);
3387  for (int i = 0; i < n; ++i)
3388  values[i] = m_val[i] ? true : false;
3389 }
3390 
3391 
3392 
3394 #if OIIO_SIMD_SSE
3395  return _mm_xor_ps (a.simd(), vbool4::True());
3396 #elif OIIO_SIMD_NEON
3397  return vmvnq_u32(a.simd());
3398 #else
3399  SIMD_RETURN (vbool4, a[i] ^ (-1));
3400 #endif
3401 }
3402 
3404 #if OIIO_SIMD_SSE
3405  return _mm_and_ps (a.simd(), b.simd());
3406 #elif OIIO_SIMD_NEON
3407  return vandq_u32(a.simd(), b.simd());
3408 #else
3409  SIMD_RETURN (vbool4, a[i] & b[i]);
3410 #endif
3411 }
3412 
3414 #if OIIO_SIMD_SSE
3415  return _mm_or_ps (a.simd(), b.simd());
3416 #elif OIIO_SIMD_NEON
3417  return vorrq_u32(a.simd(), b.simd());
3418 #else
3419  SIMD_RETURN (vbool4, a[i] | b[i]);
3420 #endif
3421 }
3422 
3424 #if OIIO_SIMD_SSE
3425  return _mm_xor_ps (a.simd(), b.simd());
3426 #elif OIIO_SIMD_NEON
3427  return veorq_u32(a.simd(), b.simd());
3428 #else
3429  SIMD_RETURN (vbool4, a[i] ^ b[i]);
3430 #endif
3431 }
3432 
3433 
3435  return a = a & b;
3436 }
3437 
3439  return a = a | b;
3440 }
3441 
3443  return a = a ^ b;
3444 }
3445 
3447 #if OIIO_SIMD_SSE
3448  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3449  return _mm_xor_ps (a.simd(), vbool4::True());
3450 #elif OIIO_SIMD_NEON
3451  return vmvnq_u32(a.m_simd);
3452 #else
3453  SIMD_RETURN (vbool4, ~a[i]);
3454 #endif
3455 }
3456 
3458 #if OIIO_SIMD_SSE
3459  return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3460 #elif OIIO_SIMD_NEON
3461  return vceqq_u32 (a.m_simd, b.m_simd);
3462 #else
3463  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
3464 #endif
3465 }
3466 
3468 #if OIIO_SIMD_SSE
3469  return _mm_xor_ps (a, b);
3470 #elif OIIO_SIMD_NEON
3471  return !(a == b);
3472 #else
3473  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
3474 #endif
3475 }
3476 
3477 
3478 
3479 
3480 #if OIIO_SIMD_SSE
3481 // Shuffling. Use like this: x = shuffle<3,2,1,0>(b)
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) {
    // Arbitrary 4-lane permute of 32-bit ints. _MM_SHUFFLE takes its
    // indices high-lane-first, so reverse to yield (v[i0],v[i1],v[i2],v[i3]).
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
}
3486 #endif
3487 
3488 #if OIIO_SIMD_SSE >= 3
3489 // SSE3 has intrinsics for a few special cases
3490 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) {
3491  return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a)));
3492 }
3493 template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) {
3494  return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a)));
3495 }
3496 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) {
3497  return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(a)));
3498 }
3499 #endif
3500 
3501 #if OIIO_SIMD_SSE
3502 template<int i0, int i1, int i2, int i3>
3503 OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) {
3504  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
3505 }
3506 #endif
3507 
3508 #if OIIO_SIMD_SSE >= 3
3509 // SSE3 has intrinsics for a few special cases
3510 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 0, 2, 2> (__m128 a) {
3511  return _mm_moveldup_ps(a);
3512 }
3513 template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) {
3514  return _mm_movehdup_ps(a);
3515 }
3516 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) {
3517  return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3518 }
3519 #endif
3520 
3521 
3522 /// Helper: shuffle/swizzle with constant (templated) indices.
3523 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
3524 template<int i0, int i1, int i2, int i3>
3526 #if OIIO_SIMD_SSE
3527  return shuffle_sse<i0,i1,i2,i3> (a.simd());
3528 #else
3529  return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3530 #endif
3531 }
3532 
3533 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
    // Broadcast: replicate lane i into all four lanes.
    return shuffle<i,i,i,i>(a);
}
3537 
3538 
3539 /// Helper: as rapid as possible extraction of one component, when the
3540 /// index is fixed.
3541 template<int i>
3543 #if OIIO_SIMD_SSE >= 4
3544  return _mm_extract_epi32(_mm_castps_si128(a.simd()), i); // SSE4.1 only
3545 #elif OIIO_SIMD_NEON
3546  return vgetq_lane_u32(a, i);
3547  // this if we were using int:
3548  // return vgetq_lane_s32(a, i);
3549 #else
3550  return a[i];
3551 #endif
3552 }
3553 
3554 /// Helper: substitute val for a[i]
3555 template<int i>
3557 #if OIIO_SIMD_SSE >= 4
3558  int ival = -int(val);
3559  return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3560 #elif OIIO_SIMD_NEON
3561  uint32_t ival = uint32_t(val ? -1 : 0);
3562  return vld1q_lane_u32(&ival, a, i);
3563  // this if we were using int:
3564  // int ival = -int(val);
3565  // return vld1q_lane_s32(&ival, a, i);
3566 #else
3567  vbool4 tmp = a;
3568  tmp[i] = -int(val);
3569  return tmp;
3570 #endif
3571 }
3572 
3574 #if OIIO_SIMD_AVX
3575  return _mm_testc_ps (v, vbool4(true)) != 0;
3576 #elif OIIO_SIMD_SSE
3577  return _mm_movemask_ps(v.simd()) == 0xf;
3578 #elif OIIO_SIMD_NEON && defined(__aarch64__)
3579  uint32x4_t t = vshrq_n_u32(v.simd(), 31);
3580  return vaddvq_u32(t) == 4;
3581 #else
3582  SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0));
3583 #endif
3584 }
3585 
3587 #if OIIO_SIMD_AVX
3588  return ! _mm_testz_ps (v, v);
3589 #elif OIIO_SIMD_SSE
3590  return _mm_movemask_ps(v) != 0;
3591 #elif OIIO_SIMD_NEON && defined(__aarch64__)
3592  uint32x4_t t = vshrq_n_u32(v.simd(), 31);
3593  return vaddvq_u32(t) != 0;
3594 #else
3595  SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0));
3596 #endif
3597 }
3598 
3599 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; }
3600 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; }
3601 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; }
3602 
3603 
3604 
3605 //////////////////////////////////////////////////////////////////////
3606 // vbool8 implementation
3607 
3608 
3610  OIIO_DASSERT(i >= 0 && i < elements);
3611 #if OIIO_SIMD_AVX
3612  return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3613 #else
3614  return m_val[i];
3615 #endif
3616 }
3617 
3619  OIIO_DASSERT(i >= 0 && i < elements);
3620  m_val[i] = value ? -1 : 0;
3621 }
3622 
3624  OIIO_DASSERT(i >= 0 && i < elements);
3625  return m_val[i];
3626 }
3627 
3628 
3629 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) {
3630  cout << a[0];
3631  for (int i = 1; i < a.elements; ++i)
3632  cout << ' ' << a[i];
3633  return cout;
3634 }
3635 
3636 
3638 #if OIIO_SIMD_AVX
3639  m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a)));
3640 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3641  m_4[0].load(a);
3642  m_4[1].load(a);
3643 #else
3644  int val = -int(a);
3645  SIMD_CONSTRUCT (val);
3646 #endif
3647 }
3648 
3649 
// Load eight bools into the eight lanes; each true lane becomes all-1-bits
// (-1 as int), each false lane becomes 0.
OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d,
                                    bool e, bool f, bool g, bool h) {
#if OIIO_SIMD_AVX
    // N.B. -- we need to reverse the order because of our convention
    // of storing a,b,c,d in the same order in memory.
    // (_mm256_set_epi32 takes its arguments high-lane-first.)
    m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e),
                                                  -int(d), -int(c), -int(b), -int(a)));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    // No 8-wide hardware: build from the two 4-wide halves.
    m_4[0].load(a, b, c, d);
    m_4[1].load(e, f, g, h);
#else
    // Scalar fallback: one int per lane.
    m_val[0] = -int(a);
    m_val[1] = -int(b);
    m_val[2] = -int(c);
    m_val[3] = -int(d);
    m_val[4] = -int(e);
    m_val[5] = -int(f);
    m_val[6] = -int(g);
    m_val[7] = -int(h);
#endif
}
3671 
3672 OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d,
3673  bool e, bool f, bool g, bool h) {
3674  load (a, b, c, d, e, f, g, h);
3675 }
3676 
3677 OIIO_FORCEINLINE vbool8::vbool8 (int a, int b, int c, int d,
3678  int e, int f, int g, int h) {
3679  load (bool(a), bool(b), bool(c), bool(d),
3680  bool(e), bool(f), bool(g), bool(h));
3681 }
3682 
3684  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3685 }
3686 
3687 
3689  load(a);
3690  return *this;
3691 }
3692 
3694  m_simd = other.m_simd;
3695  return *this;
3696 }
3697 
3699 #if OIIO_SIMD_AVX
3700  return _mm256_movemask_ps(m_simd);
3701 #else
3702  return lo().bitmask() | (hi().bitmask() << 4);
3703 #endif
3704 }
3705 
3706 
3708 vbool8::from_bitmask (int bitmask) {
3709  // I think this is a fast conversion from int bitmask to vbool8
3710  return (vint8::Giota() & vint8(bitmask)) != vint8::Zero();
3711 }
3712 
3713 
3715 #if OIIO_SIMD_AVX
3716  m_simd = _mm256_setzero_ps();
3717 #else
3718  *this = false;
3719 #endif
3720 }
3721 
3723 #if OIIO_SIMD_AVX
3724  return _mm256_setzero_ps();
3725 #else
3726  return false;
3727 #endif
3728 }
3729 
3730 
3732 #if OIIO_SIMD_AVX
3733 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3734  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3735  __m256i anyval = _mm256_undefined_si256();
3736  return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3737 # else
3738  return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3739 # endif
3740 #else
3741  return true;
3742 #endif
3743 }
3744 
3745 
3747  SIMD_DO (values[i] = m_val[i] ? true : false);
3748 }
3749 
3750 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const {
3751  OIIO_DASSERT (n >= 0 && n <= elements);
3752  for (int i = 0; i < n; ++i)
3753  values[i] = m_val[i] ? true : false;
3754 }
3755 
3756 
3758 #if OIIO_SIMD_AVX
3759  return _mm256_castps256_ps128 (simd());
3760 #else
3761  return m_4[0];
3762 #endif
3763 }
3764 
3766 #if OIIO_SIMD_AVX
3767  return _mm256_extractf128_ps (simd(), 1);
3768 #else
3769  return m_4[1];
3770 #endif
3771 }
3772 
3773 
3775 #if OIIO_SIMD_AVX
3776  __m256 r = _mm256_castps128_ps256 (lo);
3777  m_simd = _mm256_insertf128_ps (r, hi, 1);
3778  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
3779 #else
3780  m_4[0] = lo;
3781  m_4[1] = hi;
3782 #endif
3783 }
3784 
3785 
3787 #if OIIO_SIMD_AVX
3788  return _mm256_xor_ps (a.simd(), vbool8::True());
3789 #else
3790  SIMD_RETURN (vbool8, a[i] ^ (-1));
3791 #endif
3792 }
3793 
3795 #if OIIO_SIMD_AVX
3796  return _mm256_and_ps (a.simd(), b.simd());
3797 #else
3798  SIMD_RETURN (vbool8, a[i] & b[i]);
3799 #endif
3800 }
3801 
3803 #if OIIO_SIMD_AVX
3804  return _mm256_or_ps (a.simd(), b.simd());
3805 #else
3806  SIMD_RETURN (vbool8, a[i] | b[i]);
3807 #endif
3808 }
3809 
3811 #if OIIO_SIMD_AVX
3812  return _mm256_xor_ps (a.simd(), b.simd());
3813 #else
3814  SIMD_RETURN (vbool8, a[i] ^ b[i]);
3815 #endif
3816 }
3817 
3818 
3820  return a = a & b;
3821 }
3822 
3824  return a = a | b;
3825 }
3826 
3828  return a = a ^ b;
3829 }
3830 
3831 
3833 #if OIIO_SIMD_AVX
3834  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3835  return _mm256_xor_ps (a.simd(), vbool8::True());
3836 #else
3837  SIMD_RETURN (vbool8, ~a[i]);
3838 #endif
3839 }
3840 
3841 
3843 #if OIIO_SIMD_AVX >= 2
3844  return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3845 #elif OIIO_SIMD_AVX
3846  return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3847 #else
3848  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
3849 #endif
3850 }
3851 
3853 #if OIIO_SIMD_AVX
3854  return _mm256_xor_ps (a, b);
3855 #else
3856  SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0);
3857 #endif
3858 }
3859 
3860 
3861 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
3863 #if OIIO_SIMD_AVX >= 2
3864  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3865  return _mm256_permutevar8x32_ps (a.simd(), index.simd());
3866 #else
3867  return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3868 #endif
3869 }
3870 
3871 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3872  return shuffle<i,i,i,i,i,i,i,i>(a);
3873 }
3874 
3875 
3876 template<int i>
3878 #if OIIO_SIMD_AVX && !_WIN32
3879  return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i); // SSE4.1 only
3880 #else
3881  return a[i];
3882 #endif
3883 }
3884 
3885 template<int i>
3887 #if OIIO_SIMD_AVX && !_WIN32
3888  int ival = -int(val);
3889  return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i));
3890 #else
3891  vbool8 tmp = a;
3892  tmp[i] = -int(val);
3893  return tmp;
3894 #endif
3895 }
3896 
3897 
3899 #if OIIO_SIMD_AVX
3900  return _mm256_testc_ps (v, vbool8(true)) != 0;
3901  // return _mm256_movemask_ps(v.simd()) == 0xff;
3902 #else
3903  SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i]));
3904 #endif
3905 }
3906 
3908 #if OIIO_SIMD_AVX
3909  return ! _mm256_testz_ps (v, v); // FIXME? Not in all immintrin.h !
3910  // return _mm256_movemask_ps(v) != 0;
3911 #else
3912  SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i]));
3913 #endif
3914 }
3915 
3916 
3917 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; }
3918 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; }
3919 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; }
3920 
3921 
3922 
3923 //////////////////////////////////////////////////////////////////////
3924 // vbool16 implementation
3925 
3926 
3928  OIIO_DASSERT(i >= 0 && i < elements);
3929 #if OIIO_SIMD_AVX >= 512
3930  return (int(m_simd) >> i) & 1;
3931 #else
3932  return (m_bits >> i) & 1;
3933 #endif
3934 }
3935 
3937  OIIO_DASSERT(i >= 0 && i < elements);
3938  int bits = m_bits;
3939  bits &= (0xffff ^ (1<<i));
3940  bits |= (int(value)<<i);
3941  m_bits = bits;
3942 }
3943 
3944 
3945 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) {
3946  cout << a[0];
3947  for (int i = 1; i < a.elements; ++i)
3948  cout << ' ' << a[i];
3949  return cout;
3950 }
3951 
3952 
3954  m_simd = a ? 0xffff : 0;
3955 }
3956 
3957 
3959  m_simd = simd_t(a);
3960 }
3961 
3962 
// Load sixteen bools by packing them into the 16-bit lane mask, one bit
// per lane (bit i set iff lane i is true).
OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3,
                                     bool v4, bool v5, bool v6, bool v7,
                                     bool v8, bool v9, bool v10, bool v11,
                                     bool v12, bool v13, bool v14, bool v15) {
    m_simd = simd_t((int(v0) << 0) |
                    (int(v1) << 1) |
                    (int(v2) << 2) |
                    (int(v3) << 3) |
                    (int(v4) << 4) |
                    (int(v5) << 5) |
                    (int(v6) << 6) |
                    (int(v7) << 7) |
                    (int(v8) << 8) |
                    (int(v9) << 9) |
                    (int(v10) << 10) |
                    (int(v11) << 11) |
                    (int(v12) << 12) |
                    (int(v13) << 13) |
                    (int(v14) << 14) |
                    (int(v15) << 15));
}
3984 
3985 OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3,
3986  bool v4, bool v5, bool v6, bool v7,
3987  bool v8, bool v9, bool v10, bool v11,
3988  bool v12, bool v13, bool v14, bool v15) {
3989  load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3990 }
3991 
3993  int v4, int v5, int v6, int v7,
3994  int v8, int v9, int v10, int v11,
3995  int v12, int v13, int v14, int v15) {
3996  load (bool(v0), bool(v1), bool(v2), bool(v3),
3997  bool(v4), bool(v5), bool(v6), bool(v7),
3998  bool(v8), bool(v9), bool(v10), bool(v11),
3999  bool(v12), bool(v13), bool(v14), bool(v15));
4000 }
4001 
4003  load_bitmask (a.bitmask() | (b.bitmask() << 8));
4004 }
4005 
4007  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
4008  a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
4009 }
4010 
4011 
4013  load(a);
4014  return *this;
4015 }
4016 
4018  m_simd = other.m_simd;
4019  return *this;
4020 }
4021 
4022 
4024 #if OIIO_SIMD_AVX >= 512
4025  return int(m_simd);
4026 #else
4027  return int(m_bits);
4028 #endif
4029 }
4030 
4031 
4033  m_simd = simd_t(0);
4034 }
4035 
4037  return simd_t(0);
4038 }
4039 
4040 
4042  return simd_t(0xffff);
4043 }
4044 
4045 
4047  SIMD_DO (values[i] = m_bits & (1<<i));
4048 }
4049 
4050 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const {
4051  OIIO_DASSERT (n >= 0 && n <= elements);
4052  for (int i = 0; i < n; ++i)
4053  values[i] = m_bits & (1<<i);
4054 }
4055 
4056 
4057 
4059 #if OIIO_SIMD_AVX >= 512
4060  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1));
4061 #else
4062  SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0);
4063 #endif
4064 }
4065 
4067 #if OIIO_SIMD_AVX >= 512
4068  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1));
4069 #else
4070  SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0);
4071 #endif
4072 }
4073 
4074 
4076 #if OIIO_SIMD_AVX >= 512
4077  return _mm512_knot (a.simd());
4078 #else
4079  return vbool16 (a.m_bits ^ 0xffff);
4080 #endif
4081 }
4082 
4084 #if OIIO_SIMD_AVX >= 512
4085  return _mm512_kand (a.simd(), b.simd());
4086 #else
4087  return vbool16 (a.m_bits & b.m_bits);
4088 #endif
4089 }
4090 
4092 #if OIIO_SIMD_AVX >= 512
4093  return _mm512_kor (a.simd(), b.simd());
4094 #else
4095  return vbool16 (a.m_bits | b.m_bits);
4096 #endif
4097 }
4098 
4100 #if OIIO_SIMD_AVX >= 512
4101  return _mm512_kxor (a.simd(), b.simd());
4102 #else
4103  return vbool16 (a.m_bits ^ b.m_bits);
4104 #endif
4105 }
4106 
4107 
4109  return a = a & b;
4110 }
4111 
4113  return a = a | b;
4114 }
4115 
4117  return a = a ^ b;
4118 }
4119 
4120 
4122  return a ^ vbool16::True();
4123 }
4124 
4125 
4127 #if OIIO_SIMD_AVX >= 512
4128  return _mm512_kxnor (a.simd(), b.simd());
4129 #else
4130  return vbool16 (!(a.m_bits ^ b.m_bits));
4131 #endif
4132 }
4133 
4135 #if OIIO_SIMD_AVX >= 512
4136  return _mm512_kxor (a.simd(), b.simd());
4137 #else
4138  return vbool16 (a.m_bits ^ b.m_bits);
4139 #endif
4140 }
4141 
4142 
4143 template<int i>
4145  return a[i];
4146 }
4147 
4148 template<int i>
4150  vbool16 tmp = a;
4151  tmp.setcomp (i, val);
4152  return tmp;
4153 }
4154 
4155 
4157  return v.bitmask() == 0xffff;
4158 }
4159 
4161  return v.bitmask() != 0;
4162 }
4163 
4164 
4165 OIIO_FORCEINLINE bool all (const vbool16& v) { return reduce_and(v) == true; }
4166 OIIO_FORCEINLINE bool any (const vbool16& v) { return reduce_or(v) == true; }
4167 OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; }
4168 
4169 
4170 
4171 
4172 
4173 
4174 //////////////////////////////////////////////////////////////////////
4175 // vint4 implementation
4176 
4178  m_simd = other.m_simd;
4179  return *this;
4180 }
4181 
4184  return m_val[i];
4185 }
4186 
4189  return m_val[i];
4190 }
4191 
4194  m_val[i] = val;
4195 }
4196 
4197 
4199 #if OIIO_SIMD_SSE
4200  m_simd = _mm_set1_epi32 (a);
4201 #elif OIIO_SIMD_NEON
4202  m_simd = vdupq_n_s32 (a);
4203 #else
4204  SIMD_CONSTRUCT (a);
4205 #endif
4206 }
4207 
4208 
4209 
// Load four ints into the four lanes, in order a,b,c,d.
OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d) {
#if OIIO_SIMD_SSE
    // _mm_set_epi32 takes arguments high-lane-first, so reverse them to
    // keep a,b,c,d in memory order.
    m_simd = _mm_set_epi32 (d, c, b, a);
#elif OIIO_SIMD_NEON
    int values[4] = { a, b, c, d };
    m_simd = vld1q_s32 (values);
#else
    // Scalar fallback.
    m_val[0] = a;
    m_val[1] = b;
    m_val[2] = c;
    m_val[3] = d;
#endif
}
4223 
4224 
4225 // OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d,
4226 // int e, int f, int g, int h) {
4227 // load (a, b, c, d);
4228 // }
4229 
4230 
4231 
4233 #if OIIO_SIMD_SSE
4234  m_simd = _mm_loadu_si128 ((const simd_t *)values);
4235 #elif OIIO_SIMD_NEON
4236  m_simd = vld1q_s32 (values);
4237 #else
4238  SIMD_CONSTRUCT (values[i]);
4239 #endif
4240 }
4241 
4242 
// Load only the first n ints from values (n in [0,4]); remaining lanes
// are set to zero on every code path below.
OIIO_FORCEINLINE void vint4::load (const int *values, int n)
{
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // Masked load: ~(0xf << n) has exactly the low n of the 4 lane bits
    // set, so lanes >= n are zeroed by the maskz load.
    m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values);
#elif OIIO_SIMD_SSE
    switch (n) {
    case 1:
        // load_ss fills lane 0 and zeroes lanes 1-3.
        m_simd = _mm_castps_si128 (_mm_load_ss ((const float *)values));
        break;
    case 2:
        // Trickery: load one double worth of bits! (2 ints in lanes 0-1,
        // lanes 2-3 zeroed.)
        m_simd = _mm_castpd_si128 (_mm_load_sd ((const double*)values));
        break;
    case 3:
        // Trickery: load one double worth of bits, then a float,
        // and combine, casting to ints. (load_ss zeroes the top lane.)
        m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((const double*)values)),
                                                _mm_load_ss ((const float *)values + 2)));
        break;
    case 4:
        m_simd = _mm_loadu_si128 ((const simd_t *)values);
        break;
    default:
        // n == 0: all lanes zero.
        clear ();
        break;
    }
#else
    // Scalar fallback: copy the first n, zero the rest.
    for (int i = 0; i < n; ++i)
        m_val[i] = values[i];
    for (int i = n; i < elements; ++i)
        m_val[i] = 0;
#endif
}
4277 
4278 
4279 OIIO_FORCEINLINE void vint4::load (const unsigned short *values) {
4280 #if OIIO_SIMD_SSE >= 4
4281  // Trickery: load one double worth of bits = 4 ushorts!
4282  simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
4283  m_simd = _mm_cvtepu16_epi32 (a);
4284 #else
4285  SIMD_CONSTRUCT (values[i]);
4286 #endif
4287 }
4288 
4289 
4290 OIIO_FORCEINLINE void vint4::load (const short *values) {
4291 #if OIIO_SIMD_SSE >= 4
4292  // Trickery: load one double worth of bits = 4 shorts!
4293  simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
4294  m_simd = _mm_cvtepi16_epi32 (a);
4295 #else
4296  SIMD_CONSTRUCT (values[i]);
4297 #endif
4298 }
4299 
4300 
// Load four unsigned chars and zero-extend each to a 32-bit lane.
OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {
#if OIIO_SIMD_SSE >= 4
    // Trickery: load one float worth of bits = 4 uchars!
    simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
    m_simd = _mm_cvtepu8_epi32 (a);
#elif OIIO_SIMD_SSE >= 2
    // Trickery: load one float worth of bits = 4 uchars!
    // No cvtepu8 before SSE4.1: widen 8->16->32 bits by interleaving
    // with zeros twice.
    simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
    a = _mm_unpacklo_epi8(a, _mm_setzero_si128());
    m_simd = _mm_unpacklo_epi16(a, _mm_setzero_si128());
#else
    // Scalar fallback (int assignment zero-extends the uchar).
    SIMD_CONSTRUCT (values[i]);
#endif
}
4315 
4316 
4318 #if OIIO_SIMD_SSE >= 4
4319  // Trickery: load one float worth of bits = 4 chars!
4320  simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
4321  m_simd = _mm_cvtepi8_epi32 (a);
4322 #else
4323  SIMD_CONSTRUCT (values[i]);
4324 #endif
4325 }
4326 
4327 
4329 
4330 OIIO_FORCEINLINE vint4::vint4 (int a, int b) { load(a,a,b,b); }
4331 
4332 OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d) { load(a,b,c,d); }
4333 
4334 // OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d,
4335 // int e, int f, int g, int h) {
4336 // load(a,b,c,d,e,f,g,h);
4337 // }
4338 
4339 OIIO_FORCEINLINE vint4::vint4 (const int *vals) { load (vals); }
4340 OIIO_FORCEINLINE vint4::vint4 (const unsigned short *vals) { load(vals); }
4341 OIIO_FORCEINLINE vint4::vint4 (const short *vals) { load(vals); }
4342 OIIO_FORCEINLINE vint4::vint4 (const unsigned char *vals) { load(vals); }
4343 OIIO_FORCEINLINE vint4::vint4 (const char *vals) { load(vals); }
4344 
4345 OIIO_FORCEINLINE const vint4 & vint4::operator= (int a) { load(a); return *this; }
4346 
4347 
4349 #if OIIO_SIMD_SSE
4350  // Use an unaligned store -- it's just as fast when the memory turns
4351  // out to be aligned, nearly as fast even when unaligned. Not worth
4352  // the headache of using stores that require alignment.
4353  _mm_storeu_si128 ((simd_t *)values, m_simd);
4354 #elif OIIO_SIMD_NEON
4355  vst1q_s32(values, m_simd);
4356 #else
4357  SIMD_DO (values[i] = m_val[i]);
4358 #endif
4359 }
4360 
4361 
4363 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4364  m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values);
4365 #elif OIIO_SIMD_AVX >= 2
4366  m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
4367 #else
4368  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0);
4369 #endif
4370 }
4371 
4372 
4374 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4375  m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values);
4376 #elif OIIO_SIMD_AVX >= 2
4377  m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(mask));
4378 #else
4379  SIMD_CONSTRUCT (mask[i] ? values[i] : 0);
4380 #endif
4381 }
4382 
4383 
4385 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4386  _mm_mask_storeu_epi32 (values, __mmask8(mask), m_simd);
4387 #elif OIIO_SIMD_AVX >= 2
4388  _mm_maskstore_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
4389 #else
4390  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
4391 #endif
4392 }
4393 
4394 
4396 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4397  _mm_mask_storeu_epi32 (values, mask.bitmask(), m_simd);
4398 #elif OIIO_SIMD_AVX >= 2
4399  _mm_maskstore_epi32 (values, _mm_castps_si128(mask), m_simd);
4400 #else
4401  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
4402 #endif
4403 }
4404 
4405 
// Gather: lane i is loaded from the byte address baseptr + vindex[i]*scale.
template <int scale>
OIIO_FORCEINLINE void
vint4::gather (const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    // Hardware gather (AVX2).
    m_simd = _mm_i32gather_epi32 (baseptr, vindex, scale);
#else
    // Scalar fallback with the same byte-offset addressing per lane.
    SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
#endif
}
4416 
4417 template<int scale>
4418 OIIO_FORCEINLINE void
4419 vint4::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
4420 {
4421 #if OIIO_SIMD_AVX >= 2
4422  m_simd = _mm_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm_cvtps_epi32(mask), scale);
4423 #else
4424  SIMD_DO (if (mask[i]) m_val[i] = *(const value_t *)((const char *)baseptr + vindex[i]*scale));
4425 #endif
4426 }
4427 
// Scatter: store the 4 lanes to memory at byte addresses
// baseptr + vindex[i]*scale. The AVX-512 path is deliberately compiled out
// (see comment) because it benchmarked slower than the scalar loop.
template<int scale>
OIIO_FORCEINLINE void
vint4::scatter (value_t *baseptr, const vint_t& vindex) const
{
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // FIXME: disable because it benchmarks slower than the dumb way
    _mm_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
#else
    // Scalar per-lane stores.
    SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}
4439 
4440 template<int scale>
4441 OIIO_FORCEINLINE void
4443  const vint_t& vindex) const
4444 {
4445 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4446  // FIXME: disable because it benchmarks slower than the dumb way
4447  _mm_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale);
4448 #else
4449  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
4450 #endif
4451 }
4452 
4453 
4455 #if OIIO_SIMD_SSE
4456  m_simd = _mm_setzero_si128();
4457 #else
4458  *this = 0;
4459 #endif
4460 }
4461 
4462 
4463 
4465 #if OIIO_SIMD_SSE
4466  return _mm_setzero_si128();
4467 #else
4468  return 0;
4469 #endif
4470 }
4471 
4472 
4473 OIIO_FORCEINLINE const vint4 vint4::One () { return vint4(1); }
4474 
4476 #if OIIO_SIMD_SSE
4477  // Fastest way to fill an __m128 with all 1 bits is to cmpeq_epi8
4478  // any value to itself.
4479 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
4480  __m128i anyval = _mm_undefined_si128();
4481 # else
4482  __m128i anyval = _mm_setzero_si128();
4483 # endif
4484  return _mm_cmpeq_epi8 (anyval, anyval);
4485 #else
4486  return vint4(-1);
4487 #endif
4488 }
4489 
4490 
4491 
4492 OIIO_FORCEINLINE const vint4 vint4::Iota (int start, int step) {
4493  return vint4 (start+0*step, start+1*step, start+2*step, start+3*step);
4494 }
4495 
4496 
4498  return vint4 (1<<0, 1<<1, 1<<2, 1<<3);
4499 }
4500 
4501 
4503 #if OIIO_SIMD_SSE
4504  return _mm_add_epi32 (a.simd(), b.simd());
4505 #elif OIIO_SIMD_NEON
4506  return vaddq_s32(a.m_simd, b.m_simd);
4507 #else
4508  SIMD_RETURN (vint4, a[i] + b[i]);
4509 #endif
4510 }
4511 
4513  return a = a + b;
4514 }
4515 
4516 
4518 #if OIIO_SIMD_SSE
4519  return _mm_sub_epi32 (_mm_setzero_si128(), a);
4520 #elif OIIO_SIMD_NEON
4521  return vnegq_s32(a.m_simd);
4522 #else
4523  SIMD_RETURN (vint4, -a[i]);
4524 #endif
4525 }
4526 
4527 
4529 #if OIIO_SIMD_SSE
4530  return _mm_sub_epi32 (a.simd(), b.simd());
4531 #elif OIIO_SIMD_NEON
4532  return vsubq_s32(a.m_simd, b.m_simd);
4533 #else
4534  SIMD_RETURN (vint4, a[i] - b[i]);
4535 #endif
4536 }
4537 
4538 
4540  return a = a - b;
4541 }
4542 
4543 
#if OIIO_SIMD_SSE
// Shamelessly lifted from Syrah which lifted from Manta which lifted it
// from intel.com
//
// 32-bit lane-wise multiply (low 32 bits of each product). On SSE4.1+ it is
// a single instruction; the SSE2 emulation multiplies the even lanes and the
// odd lanes (shuffled into even position) as 32x32->64, then interleaves the
// low halves back together. The low 32 bits of a product are identical for
// signed and unsigned operands, so _mm_mul_epu32 is safe here.
OIIO_FORCEINLINE __m128i mul_epi32 (__m128i a, __m128i b) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_mullo_epi32(a, b);
#else
    // Prior to SSE 4.1, there is no _mm_mullo_epi32 instruction, so we have
    // to fake it.
    __m128i t0;
    __m128i t1;
    t0 = _mm_mul_epu32 (a, b);                           // products of lanes 0,2
    t1 = _mm_mul_epu32 (_mm_shuffle_epi32 (a, 0xB1),
                        _mm_shuffle_epi32 (b, 0xB1));    // products of lanes 1,3
    t0 = _mm_shuffle_epi32 (t0, 0xD8);   // compact low 32 bits of each product
    t1 = _mm_shuffle_epi32 (t1, 0xD8);
    return _mm_unpacklo_epi32 (t0, t1);  // interleave back to lane order
#endif
}
#endif
4564 
4565 
4567 #if OIIO_SIMD_SSE
4568  return mul_epi32 (a.simd(), b.simd());
4569 #elif OIIO_SIMD_NEON
4570  return vmulq_s32(a.m_simd, b.m_simd);
4571 #else
4572  SIMD_RETURN (vint4, a[i] * b[i]);
4573 #endif
4574 }
4575 
4576 
4577 OIIO_FORCEINLINE const vint4& operator*= (vint4& a, const vint4& b) { return a = a * b; }
4578 OIIO_FORCEINLINE const vint4& operator*= (vint4& a, int b) { return a = a * b; }
4579 
4580 
4582  // NO INTEGER DIVISION IN SSE!
4583  SIMD_RETURN (vint4, a[i] / b[i]);
4584 }
4585 
4586 
4587 OIIO_FORCEINLINE const vint4& operator/= (vint4& a, const vint4& b) { return a = a / b; }
4588 
4590  // NO INTEGER MODULUS IN SSE!
4591  SIMD_RETURN (vint4, a[i] % b[i]);
4592 }
4593 
4594 
4595 
4596 OIIO_FORCEINLINE const vint4& operator%= (vint4& a, const vint4& b) { return a = a % b; }
4597 
4598 
4600  // NO INTEGER MODULUS in SSE!
4601  SIMD_RETURN (vint4, a[i] % w);
4602 }
4603 
4604 
4605 OIIO_FORCEINLINE const vint4& operator%= (vint4& a, int b) { return a = a % b; }
4606 
4607 
4609 #if OIIO_SIMD_SSE
4610  return _mm_and_si128 (a.simd(), b.simd());
4611 #elif OIIO_SIMD_NEON
4612  return vandq_s32(a.simd(), b.simd());
4613 #else
4614  SIMD_RETURN (vint4, a[i] & b[i]);
4615 #endif
4616 }
4617 
4618 
4619 OIIO_FORCEINLINE const vint4& operator&= (vint4& a, const vint4& b) { return a = a & b; }
4620 
4621 
4622 
4624 #if OIIO_SIMD_SSE
4625  return _mm_or_si128 (a.simd(), b.simd());
4626 #elif OIIO_SIMD_NEON
4627  return vorrq_s32(a.simd(), b.simd());
4628 #else
4629  SIMD_RETURN (vint4, a[i] | b[i]);
4630 #endif
4631 }
4632 
4633 OIIO_FORCEINLINE const vint4& operator|= (vint4& a, const vint4& b) { return a = a | b; }
4634 
4635 
4637 #if OIIO_SIMD_SSE
4638  return _mm_xor_si128 (a.simd(), b.simd());
4639 #elif OIIO_SIMD_NEON
4640  return veorq_s32(a.simd(), b.simd());
4641 #else
4642  SIMD_RETURN (vint4, a[i] ^ b[i]);
4643 #endif
4644 }
4645 
4646 
4647 OIIO_FORCEINLINE const vint4& operator^= (vint4& a, const vint4& b) { return a = a ^ b; }
4648 
4649 
4651 #if OIIO_SIMD_SSE
4652  return a ^ a.NegOne();
4653 #elif OIIO_SIMD_NEON
4654  return vmvnq_s32(a.m_simd);
4655 #else
4656  SIMD_RETURN (vint4, ~a[i]);
4657 #endif
4658 }
4659 
// Shift every lane left by the same count `bits`.
// NOTE(review): for bits >= 32 the SSE path yields 0 while the scalar
// fallback is undefined behavior -- callers should keep bits < 32.
OIIO_FORCEINLINE vint4 operator<< (const vint4& a, unsigned int bits) {
#if OIIO_SIMD_SSE
    return _mm_slli_epi32 (a, bits);
#elif OIIO_SIMD_NEON
    // NEON has no separate immediate-shift wrapper here; broadcast the count.
    return vshlq_s32(a.m_simd, vdupq_n_s32(bits));
#else
    SIMD_RETURN (vint4, a[i] << bits);
#endif
}
4669 
4670 OIIO_FORCEINLINE const vint4& operator<<= (vint4& a, const unsigned int bits) {
4671  return a = a << bits;
4672 }
4673 
4674 
// Arithmetic (sign-extending) right shift of every lane by `bits`.
// See srl() below for the logical (zero-fill) variant.
OIIO_FORCEINLINE vint4 operator>> (const vint4& a, const unsigned int bits) {
#if OIIO_SIMD_SSE
    return _mm_srai_epi32 (a, bits);
#elif OIIO_SIMD_NEON
    // NEON expresses right shift as a left shift by a negative count.
    return vshlq_s32(a.m_simd, vdupq_n_s32(-(int)bits));
#else
    // NOTE(review): `>>` on negative int was implementation-defined before
    // C++20; in practice compilers sign-extend, matching the SIMD paths.
    SIMD_RETURN (vint4, a[i] >> bits);
#endif
}
4684 
4685 OIIO_FORCEINLINE const vint4& operator>>= (vint4& a, const unsigned int bits) {
4686  return a = a >> bits;
4687 }
4688 
4689 
// Logical (zero-fill) right shift of every lane, in contrast to
// operator>> which is arithmetic (sign-extending).
OIIO_FORCEINLINE vint4 srl (const vint4& a, const unsigned int bits) {
#if OIIO_SIMD_SSE
    return _mm_srli_epi32 (a, bits);
#elif OIIO_SIMD_NEON
    // Reinterpret as unsigned so the (negative-count) shift zero-fills.
    uint32x4_t au = vreinterpretq_u32_s32(a);
    au = vshlq_u32(au, vdupq_n_s32(-(int)bits));
    return vreinterpretq_s32_u32(au);
#else
    SIMD_RETURN (vint4, int ((unsigned int)(a[i]) >> bits));
#endif
}
4701 
4702 
4704 #if OIIO_SIMD_SSE
4705  return _mm_castsi128_ps(_mm_cmpeq_epi32 (a, b));
4706 #elif OIIO_SIMD_NEON
4707  return vceqq_s32 (a, b);
4708 #else
4709  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
4710 #endif
4711 }
4712 
4714  return ! (a == b);
4715 }
4716 
4717 
4719 #if OIIO_SIMD_SSE
4720  return _mm_castsi128_ps(_mm_cmpgt_epi32 (a, b));
4721 #elif OIIO_SIMD_NEON
4722  return vcgtq_s32 (a, b);
4723 #else
4724  SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
4725 #endif
4726 }
4727 
4729 #if OIIO_SIMD_SSE
4730  return _mm_castsi128_ps(_mm_cmplt_epi32 (a, b));
4731 #elif OIIO_SIMD_NEON
4732  return vcltq_s32 (a, b);
4733 #else
4734  SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
4735 #endif
4736 }
4737 
4739  return (b < a) | (a == b);
4740 }
4741 
4743  return (b > a) | (a == b);
4744 }
4745 
4746 inline std::ostream& operator<< (std::ostream& cout, const vint4& val) {
4747  cout << val[0];
4748  for (int i = 1; i < val.elements; ++i)
4749  cout << ' ' << val[i];
4750  return cout;
4751 }
4752 
4753 
// Partial store: write only the first n lanes to `values`.
OIIO_FORCEINLINE void vint4::store (int *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm_mask_storeu_epi32 (values, __mmask8(~(0xf << n)), m_simd);
#elif OIIO_SIMD
    // For full SIMD, there is a speed advantage to storing all components.
    if (n == elements)
        store (values);
    else
        // Fewer than all lanes: fall back to scalar copies.
        for (int i = 0; i < n; ++i)
            values[i] = m_val[i];
#else
    for (int i = 0; i < n; ++i)
        values[i] = m_val[i];
#endif
}
4773 
4774 
4775 
// Narrowing store: write the low 16 bits of each lane to 4 unsigned shorts.
// NOTE(review): despite the local name "clamped", the & 0xffff masks
// (truncates) -- values outside [0,65535] are not clamped.
OIIO_FORCEINLINE void vint4::store (unsigned short *values) const {
#if OIIO_AVX512VL_ENABLED
    // Single truncating masked store.
    _mm_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xf), m_simd);
#elif OIIO_SIMD_SSE
    // Expressed as half-words and considering little endianness, we
    // currently have AxBxCxDx (the 'x' means don't care).
    vint4 clamped = m_simd & vint4(0xffff); // A0B0C0D0
    vint4 low = _mm_shufflelo_epi16 (clamped, (0<<0) | (2<<2) | (1<<4) | (1<<6));
                   // low = AB00xxxx
    vint4 high = _mm_shufflehi_epi16 (clamped, (1<<0) | (1<<2) | (0<<4) | (2<<6));
                   // high = xxxx00CD
    vint4 highswapped = shuffle_sse<2,3,0,1>(high);  // 00CDxxxx
    vint4 result = low | highswapped;                // ABCDxxxx
    // Store only the low 64 bits (the 4 packed shorts).
    _mm_storel_pd ((double *)values, _mm_castsi128_pd(result));
    // At this point, values[] should hold A,B,C,D
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
4795 
4796 
4797 
// Narrowing store: write the low 8 bits of each lane to 4 unsigned chars.
// The & 0xff mask truncates; after it, each lane is in [0,255], so the
// saturating pack steps below cannot alter the values.
OIIO_FORCEINLINE void vint4::store (unsigned char *values) const {
#if OIIO_AVX512VL_ENABLED
    _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd);
#elif OIIO_SIMD_SSE
    vint4 clamped = m_simd & vint4(0xff);  // A000 B000 C000 D000
    simd_t val16 = _mm_packs_epi32(clamped, _mm_setzero_si128());  // A0B0 C0D0 xxxx xxxx
    simd_t val8 = _mm_packus_epi16(val16, _mm_setzero_si128());    // ABCD xxxx xxxx xxxx
    // Store only the low 32 bits (the 4 packed bytes).
    _mm_store_ss((float*)values, _mm_castsi128_ps(val8));
#elif OIIO_SIMD_NEON
    vint4 clamped = m_simd & vint4(0xff);
    int16x8_t val16 = vcombine_s16(vqmovn_s32(clamped), vdup_n_s16(0));
    uint8x16_t val8 = vcombine_u8(vqmovun_s16(val16), vdup_n_u8(0));
    vst1q_lane_u32((uint32_t*)values, vreinterpretq_u32_u8(val8), 0);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
4815 
4816 
4817 
4818 
4819 template<int i0, int i1, int i2, int i3>
4821 #if OIIO_SIMD_SSE
4822  return shuffle_sse<i0,i1,i2,i3> (__m128i(a));
4823 #else
4824  return vint4(a[i0], a[i1], a[i2], a[i3]);
4825 #endif
4826 }
4827 
4828 template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); }
4829 
4830 
4831 template<int i>
4833 #if OIIO_SIMD_SSE >= 4
4834  return _mm_extract_epi32(v.simd(), i); // SSE4.1 only
4835 #elif OIIO_SIMD_NEON
4836  return vgetq_lane_s32(v.simd(), i);
4837 #else
4838  return v[i];
4839 #endif
4840 }
4841 
#if OIIO_SIMD_SSE
// Specialization: extracting lane 0 is a plain movd, needing no SSE4.1.
template<> OIIO_FORCEINLINE int extract<0> (const vint4& v) {
    return _mm_cvtsi128_si32(v.simd());
}
#endif
4847 
4848 template<int i>
4850 #if OIIO_SIMD_SSE >= 4
4851  return _mm_insert_epi32 (a.simd(), val, i);
4852 #elif OIIO_SIMD_NEON
4853  return vld1q_lane_s32(&val, a.simd(), i);
4854 #else
4855  vint4 tmp = a;
4856  tmp[i] = val;
4857  return tmp;
4858 #endif
4859 }
4860 
4861 
4862 
// Named accessors/mutators for the four lanes, expressed via the
// compile-time extract<>/insert<> helpers above.
OIIO_FORCEINLINE int vint4::x () const { return extract<0>(*this); }
OIIO_FORCEINLINE int vint4::y () const { return extract<1>(*this); }
OIIO_FORCEINLINE int vint4::z () const { return extract<2>(*this); }
OIIO_FORCEINLINE int vint4::w () const { return extract<3>(*this); }
OIIO_FORCEINLINE void vint4::set_x (int val) { *this = insert<0>(*this, val); }
OIIO_FORCEINLINE void vint4::set_y (int val) { *this = insert<1>(*this, val); }
OIIO_FORCEINLINE void vint4::set_z (int val) { *this = insert<2>(*this, val); }
OIIO_FORCEINLINE void vint4::set_w (int val) { *this = insert<3>(*this, val); }
4871 
4872 
4874 {
4875 #if OIIO_SIMD_SSE
4876  return _mm_castps_si128 (x.simd());
4877 #else
4878  return *(vint4 *)&x;
4879 #endif
4880 }
4881 
// Old names: (DEPRECATED 1.8)
// Backward-compatibility alias for bitcast_to_int(); do not use in new code.
OIIO_DEPRECATED("use bitcast_to_int() (1.8)")
inline vint4 bitcast_to_int4 (const vbool4& x) { return bitcast_to_int(x); }
4885 
4886 
4888 #if OIIO_SIMD_SSE >= 3
4889  // People seem to agree that SSE3 does add reduction best with 2
4890  // horizontal adds.
4891  // suppose v = (a, b, c, d)
4892  simd::vint4 ab_cd = _mm_hadd_epi32 (v.simd(), v.simd());
4893  // ab_cd = (a+b, c+d, a+b, c+d)
4894  simd::vint4 abcd = _mm_hadd_epi32 (ab_cd.simd(), ab_cd.simd());
4895  // all abcd elements are a+b+c+d, return an element as fast as possible
4896  return abcd;
4897 #elif OIIO_SIMD_SSE >= 2
4898  // I think this is the best we can do for SSE2, and I'm still not sure
4899  // it's faster than the default scalar operation. But anyway...
4900  // suppose v = (a, b, c, d)
4901  vint4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
4902  // ab_ab_cd_cd = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
4903  vint4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
4904  // cd_cd_ab_ab = (c+d,c+d,a+b,a+b)
4905  vint4 abcd = ab_ab_cd_cd + cd_cd_ab_ab; // a+b+c+d in all components
4906  return abcd;
4907 #elif OIIO_SIMD_NEON && defined(__aarch64__)
4908  return vint4(vaddvq_s32(v));
4909 #else
4910  return vint4(reduce_add(v));
4911 #endif
4912 }
4913 
4914 
4916 #if OIIO_SIMD_SSE
4917  return extract<0> (vreduce_add(v));
4918 #elif OIIO_SIMD_NEON && defined(__aarch64__)
4919  return vaddvq_s32(v);
4920 #else
4921  SIMD_RETURN_REDUCE (int, 0, r += v[i]);
4922 #endif
4923 }
4924 
4925 
4927 #if OIIO_SIMD_SSE
4928  vint4 ab = v & shuffle<1,1,3,3>(v); // ab bb cd dd
4929  vint4 abcd = ab & shuffle<2>(ab);
4930  return extract<0>(abcd);
4931 #else
4932  SIMD_RETURN_REDUCE (int, -1, r &= v[i]);
4933 #endif
4934 }
4935 
4936 
4938 #if OIIO_SIMD_SSE
4939  vint4 ab = v | shuffle<1,1,3,3>(v); // ab bb cd dd
4940  vint4 abcd = ab | shuffle<2>(ab);
4941  return extract<0>(abcd);
4942 #else
4943  SIMD_RETURN_REDUCE (int, 0, r |= v[i]);
4944 #endif
4945 }
4946 
4947 
4948 
// Per-lane select: return b[i] where mask[i] is true, else a[i].
OIIO_FORCEINLINE vint4 blend (const vint4& a, const vint4& b, const vbool4& mask) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    // Single blendv on the float view of the integers.
    return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps(a.simd()),
                                            _mm_castsi128_ps(b.simd()), mask));
#elif OIIO_SIMD_SSE
    // Pre-SSE4.1: classic (mask & b) | (~mask & a).
    return _mm_or_si128 (_mm_and_si128(_mm_castps_si128(mask.simd()), b.simd()),
                         _mm_andnot_si128(_mm_castps_si128(mask.simd()), a.simd()));
#elif OIIO_SIMD_NEON
    return vbslq_s32 (mask.simd(), b.simd(), a.simd());
#else
    SIMD_RETURN (vint4, mask[i] ? b[i] : a[i]);
#endif
}
4962 
4964 #if OIIO_SIMD_SSE
4965  return _mm_and_si128(_mm_castps_si128(mask), a.simd());
4966 #elif OIIO_SIMD_NEON
4967  return vbslq_s32 (mask.simd(), a.simd(), vint4(0));
4968 #else
4969  SIMD_RETURN (vint4, mask[i] ? a[i] : 0.0f);
4970 #endif
4971 }
4972 
4973 
4975 #if OIIO_SIMD_SSE
4976  return _mm_andnot_si128(_mm_castps_si128(mask), a.simd());
4977 #else
4978  SIMD_RETURN (vint4, mask[i] ? 0.0f : a[i]);
4979 #endif
4980 }
4981 
4982 
// select(mask, a, b): per-lane mask ? a : b. Note the argument swap --
// blend() returns its *second* operand where the mask is true.
OIIO_FORCEINLINE vint4 select (const vbool4& mask, const vint4& a, const vint4& b) {
    return blend (b, a, mask);
}
4986 
4987 
4988 
4990 #if OIIO_SIMD_SSE >= 3
4991  return _mm_abs_epi32(a.simd());
4992 #elif OIIO_SIMD_NEON
4993  return vabsq_s32(a.simd());
4994 #else
4995  SIMD_RETURN (vint4, std::abs(a[i]));
4996 #endif
4997 }
4998 
4999 
5000 
// Per-lane signed minimum.
OIIO_FORCEINLINE vint4 min (const vint4& a, const vint4& b) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_min_epi32 (a, b);
#elif OIIO_SIMD_NEON
    return vminq_s32(a, b);
#else
    SIMD_RETURN (vint4, std::min(a[i], b[i]));
#endif
}
5010 
5011 
// Per-lane signed maximum.
OIIO_FORCEINLINE vint4 max (const vint4& a, const vint4& b) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_max_epi32 (a, b);
#elif OIIO_SIMD_NEON
    return vmaxq_s32(a, b);
#else
    SIMD_RETURN (vint4, std::max(a[i], b[i]));
#endif
}
5021 
5022 
5024 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5025  // return _mm_rol_epi32 (x, s);
5026  // We want to do this ^^^ but this intrinsic only takes an *immediate*
5027  // argument for s, and there isn't a way to express in C++ that a
5028  // parameter must be an immediate/literal value from the caller.
5029  return (x<<s) | srl(x,32-s);
5030 #else
5031  return (x<<s) | srl(x,32-s);
5032 #endif
5033 }
5034 
// DEPRECATED (2.1)
// Old name for rotl(); kept for backward compatibility only.
OIIO_FORCEINLINE vint4 rotl32 (const vint4& x, const unsigned int k) {
    return rotl(x, k);
}
5039 
5040 
// Per-lane (~a & b), matching the SSE "andnot" operand convention
// (the FIRST argument is the one complemented).
OIIO_FORCEINLINE vint4 andnot (const vint4& a, const vint4& b) {
#if OIIO_SIMD_SSE
    return _mm_andnot_si128 (a.simd(), b.simd());
#else
    SIMD_RETURN (vint4, ~(a[i]) & b[i]);
#endif
}
5048 
5049 
5050 // Implementation had to be after the definition of vint4::Zero.
5052  m_simd = (ival != vint4::Zero());
5053 }
5054 
5055 
5056 
5058  // NO INTEGER MODULUS IN SSE!
5059  SIMD_RETURN (vint4, b[i] ? a[i] % b[i] : 0);
5060 }
5061 
5063  return b ? (a % b) : vint4::Zero();
5064 }
5065 
5066 
5067 
5068 
5069 //////////////////////////////////////////////////////////////////////
5070 // vint8 implementation
5071 
5073  m_simd = other.m_simd;
5074  return *this;
5075 }
5076 
5079  return m_val[i];
5080 }
5081 
5084  return m_val[i];
5085 }
5086 
5089  m_val[i] = val;
5090 }
5091 
5092 
5094 #if OIIO_SIMD_AVX
5095  m_simd = _mm256_set1_epi32 (a);
5096 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5097  m_4[0].load(a);
5098  m_4[1].load(a);
5099 #else
5100  SIMD_CONSTRUCT (a);
5101 #endif
5102 }
5103 
5104 
// Load 8 explicit lane values a..h (a is lane 0).
OIIO_FORCEINLINE void vint8::load (int a, int b, int c, int d,
                                   int e, int f, int g, int h) {
#if OIIO_SIMD_AVX
    // _mm256_set_epi32 takes arguments highest lane first, hence reversed.
    m_simd = _mm256_set_epi32 (h, g, f, e, d, c, b, a);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    // Build as two 4-lane halves.
    m_4[0].load(a, b, c, d);
    m_4[1].load(e, f, g, h);
#else
    m_val[0] = a;
    m_val[1] = b;
    m_val[2] = c;
    m_val[3] = d;
    m_val[4] = e;
    m_val[5] = f;
    m_val[6] = g;
    m_val[7] = h;
#endif
}
5123 
5124 
5126 #if OIIO_SIMD_AVX
5127  m_simd = _mm256_loadu_si256 ((const simd_t *)values);
5128 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5129  m_4[0].load(values);
5130  m_4[1].load(values+4);
5131 #else
5132  SIMD_CONSTRUCT (values[i]);
5133 #endif
5134 }
5135 
5136 
// Partial load: read the first n ints from `values`; remaining lanes are 0.
OIIO_FORCEINLINE void vint8::load (const int *values, int n)
{
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // Masked load zero-fills the disabled lanes in one instruction.
    m_simd = _mm256_maskz_loadu_epi32 ((~(0xff << n)), values);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    if (n > 4) {
        // Full low half, partial high half.
        vint4 lo, hi;
        lo.load (values);
        hi.load (values+4, n-4);
        m_4[0] = lo;
        m_4[1] = hi;
    } else {
        // Partial low half, zeroed high half.
        vint4 lo, hi;
        lo.load (values, n);
        hi.clear();
        m_4[0] = lo;
        m_4[1] = hi;
    }
#else
    for (int i = 0; i < n; ++i)
        m_val[i] = values[i];
    for (int i = n; i < elements; ++i)
        m_val[i] = 0;
#endif
}
5163 
5164 
// Load 8 signed shorts, sign-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint8::load (const short *values) {
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)values));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(values);
    m_4[1].load(values+4);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
5175 
// Load 8 unsigned shorts, zero-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint8::load (const unsigned short *values) {
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)values));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(values);
    m_4[1].load(values+4);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
5186 
5187 
5189 #if OIIO_SIMD_AVX >= 2
5190  __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
5191  m_simd = _mm256_cvtepi8_epi32 (bytes);
5192 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5193  m_4[0].load(values);
5194  m_4[1].load(values+4);
5195 #else
5196  SIMD_CONSTRUCT (values[i]);
5197 #endif
5198 }
5199 
// Load 8 unsigned bytes, zero-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint8::load (const unsigned char *values) {
#if OIIO_SIMD_AVX >= 2
    // Pull in just the 8 source bytes via a 64-bit scalar load.
    __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
    m_simd = _mm256_cvtepu8_epi32 (bytes);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(values);
    m_4[1].load(values+4);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
5211 
5212 
5213 
5215 
// Constructors and scalar assignment: all thin wrappers around the
// corresponding load() overloads defined above.
OIIO_FORCEINLINE vint8::vint8 (int a, int b, int c, int d,
                               int e, int f, int g, int h) {
    load(a,b,c,d,e,f,g,h);
}

OIIO_FORCEINLINE vint8::vint8 (const int *vals) { load (vals); }
OIIO_FORCEINLINE vint8::vint8 (const unsigned short *vals) { load(vals); }
OIIO_FORCEINLINE vint8::vint8 (const short *vals) { load(vals); }
OIIO_FORCEINLINE vint8::vint8 (const unsigned char *vals) { load(vals); }
OIIO_FORCEINLINE vint8::vint8 (const char *vals) { load(vals); }

// Broadcast-assign a single scalar to all lanes.
OIIO_FORCEINLINE const vint8 & vint8::operator= (int a) { load(a); return *this; }
5228 
5229 
5231 #if OIIO_SIMD_AVX
5232  // Use an unaligned store -- it's just as fast when the memory turns
5233  // out to be aligned, nearly as fast even when unaligned. Not worth
5234  // the headache of using stores that require alignment.
5235  _mm256_storeu_si256 ((simd_t *)values, m_simd);
5236 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5237  m_4[0].store(values);
5238  m_4[1].store(values+4);
5239 #else
5240  SIMD_DO (values[i] = m_val[i]);
5241 #endif
5242 }
5243 
5244 
5246 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5247  m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values);
5248 #elif OIIO_SIMD_AVX >= 2
5249  m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)));
5250 #else
5251  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0);
5252 #endif
5253 }
5254 
5255 
5257 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5258  m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values);
5259 #elif OIIO_SIMD_AVX >= 2
5260  m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(mask));
5261 #else
5262  SIMD_CONSTRUCT (mask[i] ? values[i] : 0);
5263 #endif
5264 }
5265 
5266 
5268 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5269  _mm256_mask_storeu_epi32 (values, __mmask8(mask), m_simd);
5270 #elif OIIO_SIMD_AVX >= 2
5271  _mm256_maskstore_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd);
5272 #else
5273  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
5274 #endif
5275 }
5276 
5277 
5279 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5280  _mm256_mask_storeu_epi32 (values, __mmask8(mask.bitmask()), m_simd);
5281 #elif OIIO_SIMD_AVX >= 2
5282  _mm256_maskstore_epi32 (values, _mm256_castps_si256(mask), m_simd);
5283 #else
5284  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
5285 #endif
5286 }
5287 
5288 
// Gather: load the 8 lanes from byte addresses baseptr + vindex[i]*scale.
// `scale` is a template parameter because the hardware gather needs an
// immediate.
template <int scale>
OIIO_FORCEINLINE void
vint8::gather (const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm256_i32gather_epi32 (baseptr, vindex, scale);
#else
    SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
#endif
}
5299 
// Masked gather: true lanes load from baseptr + vindex[i]*scale; false
// lanes keep their current value (m_simd is the passthrough operand).
template<int scale>
OIIO_FORCEINLINE void
vint8::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    // NOTE(review): mask goes through _mm256_cvtps_epi32 (value conversion);
    // the intrinsic only tests lane sign bits, which survive -- confirm
    // against vbool_t's bit representation (same pattern as vint4).
    m_simd = _mm256_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm256_cvtps_epi32(mask), scale);
#else
    SIMD_DO (if (mask[i]) m_val[i] = *(const value_t *)((const char *)baseptr + vindex[i]*scale));
#endif
}
5310 
// Scatter: store the 8 lanes to byte addresses baseptr + vindex[i]*scale.
// Unlike the vint4 version, the AVX-512 path is enabled here.
template<int scale>
OIIO_FORCEINLINE void
vint8::scatter (value_t *baseptr, const vint_t& vindex) const
{
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    _mm256_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
#else
    SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}
5321 
5322 template<int scale>
5323 OIIO_FORCEINLINE void
5325  const vint_t& vindex) const
5326 {
5327 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5328  _mm256_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale);
5329 #else
5330  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
5331 #endif
5332 }
5333 
5334 
5336 #if OIIO_SIMD_AVX
5337  m_simd = _mm256_setzero_si256();
5338 #else
5339  *this = 0;
5340 #endif
5341 }
5342 
5343 
5345 #if OIIO_SIMD_AVX
5346  return _mm256_setzero_si256();
5347 #else
5348  return 0;
5349 #endif
5350 }
5351 
// All lanes 1.
OIIO_FORCEINLINE const vint8 vint8::One () { return vint8(1); }

// All lanes -1, which is also all bits set.
OIIO_FORCEINLINE const vint8 vint8::NegOne () { return vint8(-1); }
5355 
5356 
5357 OIIO_FORCEINLINE const vint8 vint8::Iota (int start, int step) {
5358  return vint8 (start+0*step, start+1*step, start+2*step, start+3*step,
5359  start+4*step, start+5*step, start+6*step, start+7*step);
5360 }
5361 
5362 
5364  return vint8 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7);
5365 }
5366 
5367 
5369 #if OIIO_SIMD_AVX
5370  return _mm256_castsi256_si128 (simd());
5371 #else
5372  return m_4[0];
5373 #endif
5374 }
5375 
5377 #if OIIO_SIMD_AVX
5378  return _mm256_extractf128_si256 (simd(), 1);
5379 #else
5380  return m_4[1];
5381 #endif
5382 }
5383 
5384 
// Construct from two vint4 halves: `lo` supplies lanes 0-3, `hi` lanes 4-7.
OIIO_FORCEINLINE vint8::vint8 (const vint4& lo, const vint4 &hi) {
#if OIIO_SIMD_AVX
    // Widen lo into the low 128 bits, then insert hi into the high 128.
    __m256i r = _mm256_castsi128_si256 (lo);
    m_simd = _mm256_insertf128_si256 (r, hi, 1);
    // N.B. equivalent, if available: m_simd = _mm256_set_m128i (hi, lo);
    // FIXME: when would this not be available?
#else
    m_4[0] = lo;
    m_4[1] = hi;
#endif
}
5396 
5397 
5399 #if OIIO_SIMD_AVX >= 2
5400  return _mm256_add_epi32 (a.simd(), b.simd());
5401 #else
5402  SIMD_RETURN (vint8, a[i] + b[i]);
5403 #endif
5404 }
5405 
5406 
5408  return a = a + b;
5409 }
5410 
5411 
5413 #if OIIO_SIMD_AVX >= 2
5414  return _mm256_sub_epi32 (_mm256_setzero_si256(), a);
5415 #else
5416  SIMD_RETURN (vint8, -a[i]);
5417 #endif
5418 }
5419 
5420 
5422 #if OIIO_SIMD_AVX >= 2
5423  return _mm256_sub_epi32 (a.simd(), b.simd());
5424 #else
5425  SIMD_RETURN (vint8, a[i] - b[i]);
5426 #endif
5427 }
5428 
5429 
5431  return a = a - b;
5432 }
5433 
5434 
5436 #if OIIO_SIMD_AVX >= 2
5437  return _mm256_mullo_epi32 (a.simd(), b.simd());
5438 #else
5439  SIMD_RETURN (vint8, a[i] * b[i]);
5440 #endif
5441 }
5442 
5443 
5444 OIIO_FORCEINLINE const vint8& operator*= (vint8& a, const vint8& b) { return a = a * b; }
5445 OIIO_FORCEINLINE const vint8& operator*= (vint8& a, int b) { return a = a * b; }
5446 
5447 
5449  // NO INTEGER DIVISION IN SSE or AVX!
5450  SIMD_RETURN (vint8, a[i] / b[i]);
5451 }
5452 
5453 OIIO_FORCEINLINE const vint8& operator/= (vint8& a, const vint8& b) { return a = a / b; }
5454 
5455 
5457  // NO INTEGER MODULUS IN SSE or AVX!
5458  SIMD_RETURN (vint8, a[i] % b[i]);
5459 }
5460 
5461 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, const vint8& b) { return a = a % b; }
5462 
5464  // NO INTEGER MODULUS in SSE or AVX!
5465  SIMD_RETURN (vint8, a[i] % w);
5466 }
5467 
5468 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, int b) { return a = a % b; }
5469 
5470 
5472 #if OIIO_SIMD_AVX >= 2
5473  return _mm256_and_si256 (a.simd(), b.simd());
5474 #else
5475  SIMD_RETURN (vint8, a[i] & b[i]);
5476 #endif
5477 }
5478 
5479 OIIO_FORCEINLINE const vint8& operator&= (vint8& a, const vint8& b) { return a = a & b; }
5480 
5482 #if OIIO_SIMD_AVX >= 2
5483  return _mm256_or_si256 (a.simd(), b.simd());
5484 #else
5485  SIMD_RETURN (vint8, a[i] | b[i]);
5486 #endif
5487 }
5488 
5489 OIIO_FORCEINLINE const vint8& operator|= (vint8& a, const vint8& b) { return a = a | b; }
5490 
5492 #if OIIO_SIMD_AVX >= 2
5493  return _mm256_xor_si256 (a.simd(), b.simd());
5494 #else
5495  SIMD_RETURN (vint8, a[i] ^ b[i]);
5496 #endif
5497 }
5498 
5499 OIIO_FORCEINLINE const vint8& operator^= (vint8& a, const vint8& b) { return a = a ^ b; }
5500 
5501 
5503 #if OIIO_SIMD_AVX >= 2
5504  return a ^ a.NegOne();
5505 #else
5506  SIMD_RETURN (vint8, ~a[i]);
5507 #endif
5508 }
5509 
5510 
// Shift every lane left by the same count `bits`.
// NOTE(review): bits >= 32 is UB in the scalar fallback; keep bits < 32.
OIIO_FORCEINLINE vint8 operator<< (const vint8& a, unsigned int bits) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_slli_epi32 (a, bits);
#elif OIIO_SIMD_SSE
    // No 256-bit integer shift before AVX2: do the two 128-bit halves.
    return vint8 (a.lo() << bits, a.hi() << bits);
#else
    SIMD_RETURN (vint8, a[i] << bits);
#endif
}
5520 
5521 
5522 OIIO_FORCEINLINE const vint8& operator<<= (vint8& a, const unsigned int bits) {
5523  return a = a << bits;
5524 }
5525 
// Arithmetic (sign-extending) right shift of every lane; see srl() for
// the logical (zero-fill) variant.
OIIO_FORCEINLINE vint8 operator>> (const vint8& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_srai_epi32 (a, bits);
#elif OIIO_SIMD_SSE
    // Pre-AVX2: shift the two 128-bit halves separately.
    return vint8 (a.lo() >> bits, a.hi() >> bits);
#else
    SIMD_RETURN (vint8, a[i] >> bits);
#endif
}
5535 
5536 OIIO_FORCEINLINE const vint8& operator>>= (vint8& a, const unsigned int bits) {
5537  return a = a >> bits;
5538 }
5539 
5540 
// Logical (zero-fill) right shift of every lane, in contrast to
// operator>> which sign-extends.
OIIO_FORCEINLINE vint8 srl (const vint8& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_srli_epi32 (a, bits);
#else
    SIMD_RETURN (vint8, int ((unsigned int)(a[i]) >> bits));
#endif
}
5548 
5549 
5551  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5552 #if OIIO_SIMD_AVX >= 2
5553  return _mm256_castsi256_ps(_mm256_cmpeq_epi32 (a.m_simd, b.m_simd));
5554 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */
5555  return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
5556 #else
5557  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
5558 #endif
5559 }
5560 
5561 
5563  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5564  return ! (a == b);
5565 }
5566 
5567 
5569  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5570 #if OIIO_SIMD_AVX >= 2
5571  return _mm256_castsi256_ps(_mm256_cmpgt_epi32 (a, b));
5572 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */
5573  return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
5574 #else
5575  SIMD_RETURN (vbool8, a[i] > b[i] ? -1 : 0);
5576 #endif
5577 }
5578 
5579 
5581  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5582 #if OIIO_SIMD_AVX >= 2
5583  // No lt or lte!
5584  return (b > a);
5585 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */
5586  return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
5587 #else
5588  SIMD_RETURN (vbool8, a[i] < b[i] ? -1 : 0);
5589 #endif
5590 }
5591 
5592 
5594  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5595  return (a > b) | (a == b);
5596 }
5597 
5598 
5600  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5601  return (b > a) | (a == b);
5602 }
5603 
5604 
5605 inline std::ostream& operator<< (std::ostream& cout, const vint8& val) {
5606  cout << val[0];
5607  for (int i = 1; i < val.elements; ++i)
5608  cout << ' ' << val[i];
5609  return cout;
5610 }
5611 
5612 
// Partial store: write only the first n lanes to `values`.
OIIO_FORCEINLINE void vint8::store (int *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm256_mask_storeu_epi32 (values, __mmask8(~(0xff << n)), m_simd);
#elif OIIO_SIMD_SSE
    if (n <= 4) {
        // Only the low half participates.
        lo().store (values, n);
    } else if (n < 8) {
        // Full low half, partial high half.
        lo().store (values);
        hi().store (values+4, n-4);
    } else {
        // All 8 lanes: full-width store is fastest.
        store (values);
    }
#else
    for (int i = 0; i < n; ++i)
        values[i] = m_val[i];
#endif
}
5634 
5635 
5636 // FIXME(AVX): fast vint8 store to unsigned short, unsigned char
5637 
// Narrowing store: write the low 16 bits of each lane (truncating) to
// 8 unsigned shorts.
OIIO_FORCEINLINE void vint8::store (unsigned short *values) const {
#if OIIO_AVX512VL_ENABLED
    _mm256_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xff), m_simd);
#elif OIIO_SIMD_SSE
    // Delegate to the 4-wide narrowing stores, one half at a time.
    lo().store (values);
    hi().store (values+4);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
5648 
5649 
/// Store to an array of unsigned char, narrowing each 32-bit lane to
/// 8 bits (truncation; no saturation or range check).
OIIO_FORCEINLINE void vint8::store (unsigned char *values) const {
#if OIIO_AVX512VL_ENABLED
    // Mask 0xff enables all 8 lanes of the vint8.
    _mm256_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xff), m_simd);
#elif OIIO_SIMD_SSE
    lo().store (values);
    hi().store (values+4);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
5660 
5661 
5662 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
5664 #if OIIO_SIMD_AVX >= 2
5665  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
5666  return _mm256_castps_si256 (_mm256_permutevar8x32_ps (_mm256_castsi256_ps(a.simd()), index.simd()));
5667 #else
5668  return vint8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
5669 #endif
5670 }
5671 
5672 template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
5673  return shuffle<i,i,i,i,i,i,i,i>(a);
5674 }
5675 
5676 
5677 template<int i>
5679 #if OIIO_SIMD_AVX && !_WIN32
5680  return _mm256_extract_epi32(v.simd(), i);
5681 #else
5682  return v[i];
5683 #endif
5684 }
5685 
5686 
5687 template<int i>
5689 #if OIIO_SIMD_AVX && !_WIN32
5690  return _mm256_insert_epi32 (a.simd(), val, i);
5691 #else
5692  vint8 tmp = a;
5693  tmp[i] = val;
5694  return tmp;
5695 #endif
5696 }
5697 
5698 
5699 OIIO_FORCEINLINE int vint8::x () const { return extract<0>(*this); }
5700 OIIO_FORCEINLINE int vint8::y () const { return extract<1>(*this); }
5701 OIIO_FORCEINLINE int vint8::z () const { return extract<2>(*this); }
5702 OIIO_FORCEINLINE int vint8::w () const { return extract<3>(*this); }
5703 OIIO_FORCEINLINE void vint8::set_x (int val) { *this = insert<0>(*this, val); }
5704 OIIO_FORCEINLINE void vint8::set_y (int val) { *this = insert<1>(*this, val); }
5705 OIIO_FORCEINLINE void vint8::set_z (int val) { *this = insert<2>(*this, val); }
5706 OIIO_FORCEINLINE void vint8::set_w (int val) { *this = insert<3>(*this, val); }
5707 
5708 
5710 {
5711 #if OIIO_SIMD_AVX
5712  return _mm256_castps_si256 (x.simd());
5713 #else
5714  return vint8(bitcast_to_int(x.lo()), bitcast_to_int(x.hi()));
5715 #endif
5716 }
5717 
5718 
5720 #if OIIO_SIMD_AVX >= 2
5721  // From Syrah:
5722  vint8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_epi32(v.simd(), _mm256_setzero_si256());
5723  vint8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_epi32(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_si256());
5724  // get efgh in the 0-idx slot
5725  vint8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
5726  vint8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
5727  return shuffle<0>(final_sum);
5728 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5729  vint4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
5730  return vint8(hadd4, hadd4);
5731 #else
5732  return vint8(reduce_add(v));
5733 #endif
5734 }
5735 
5736 
5738 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
5739  return extract<0> (vreduce_add(v));
5740 #else
5741  return reduce_add(v.lo()) + reduce_add(v.hi());
5742 #endif
5743 }
5744 
5745 
5747 #if OIIO_SSE_AVX >= 2
5748  vint8 ab = v & shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh
5749  vint8 abcd = ab & shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x
5750  vint8 abcdefgh = abcd & shuffle<4>(abcdefgh); // abcdefgh x x x x x x x
5751  return extract<0> (abcdefgh);
5752 #else
5753  // AVX 1.0 or less -- use SSE
5754  return reduce_and(v.lo() & v.hi());
5755 #endif
5756 }
5757 
5758 
5760 #if OIIO_SSE_AVX >= 2
5761  vint8 ab = v | shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh
5762  vint8 abcd = ab | shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x
5763  vint8 abcdefgh = abcd | shuffle<4>(abcdefgh); // abcdefgh x x x x x x x
5764  return extract<0> (abcdefgh);
5765 #else
5766  // AVX 1.0 or less -- use SSE
5767  return reduce_or(v.lo() | v.hi());
5768 #endif
5769 }
5770 
5771 
/// Per-lane select: result[i] = mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vint8 blend (const vint8& a, const vint8& b, const vbool8& mask) {
#if OIIO_SIMD_AVX
    // AVX1 has no 256-bit integer blend, so blend the bit patterns as
    // floats -- blendv consults only the mask's sign bits, not values.
    return _mm256_castps_si256 (_mm256_blendv_ps (_mm256_castsi256_ps(a.simd()),
                                                  _mm256_castsi256_ps(b.simd()), mask));
#elif OIIO_SIMD_SSE
    return vint8 (blend(a.lo(), b.lo(), mask.lo()),
                  blend(a.hi(), b.hi(), mask.hi()));
#else
    SIMD_RETURN (vint8, mask[i] ? b[i] : a[i]);
#endif
}
5783 
5784 
5786 // FIXME: More efficient for AVX-512 to use
5787 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(maxk),a))?
5788 #if OIIO_SIMD_AVX
5789  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.simd()), mask));
5790 #elif OIIO_SIMD_SSE
5791  return vint8 (blend0(a.lo(), mask.lo()),
5792  blend0(a.hi(), mask.hi()));
5793 #else
5794  SIMD_RETURN (vint8, mask[i] ? a[i] : 0.0f);
5795 #endif
5796 }
5797 
5798 
5800 // FIXME: More efficient for AVX-512 to use
5801 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(!maxk),a))?
5802 #if OIIO_SIMD_AVX
5803  return _mm256_castps_si256 (_mm256_andnot_ps (mask.simd(), _mm256_castsi256_ps(a.simd())));
5804 #elif OIIO_SIMD_SSE
5805  return vint8 (blend0not(a.lo(), mask.lo()),
5806  blend0not(a.hi(), mask.hi()));
5807 #else
5808  SIMD_RETURN (vint8, mask[i] ? 0.0f : a[i]);
5809 #endif
5810 }
5811 
5812 OIIO_FORCEINLINE vint8 select (const vbool8& mask, const vint8& a, const vint8& b) {
5813  return blend (b, a, mask);
5814 }
5815 
5816 
5818 #if OIIO_SIMD_AVX >= 2
5819  return _mm256_abs_epi32(a.simd());
5820 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5821  return vint8(abs(a.lo()), abs(a.hi()));
5822 #else
5823  SIMD_RETURN (vint8, std::abs(a[i]));
5824 #endif
5825 }
5826 
5827 
/// Per-lane signed minimum of two vint8's.
OIIO_FORCEINLINE vint8 min (const vint8& a, const vint8& b) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_min_epi32 (a, b);
#else
    // No 256-bit integer min before AVX2; do it per 4-wide half.
    return vint8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
#endif
}
5835 
5836 
/// Per-lane signed maximum of two vint8's.
OIIO_FORCEINLINE vint8 max (const vint8& a, const vint8& b) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_max_epi32 (a, b);
#else
    // No 256-bit integer max before AVX2; do it per 4-wide half.
    return vint8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
#endif
}
5844 
5845 
5847 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5848  // return _mm256_rol_epi32 (x, s);
5849  // We want to do this ^^^ but this intrinsic only takes an *immediate*
5850  // argument for s, and there isn't a way to express in C++ that a
5851  // parameter must be an immediate/literal value from the caller.
5852  return (x<<s) | srl(x,32-s);
5853 #else
5854  return (x<<s) | srl(x,32-s);
5855 #endif
5856 }
5857 
5858 // DEPRECATED (2.1)
5859 OIIO_FORCEINLINE vint8 rotl32 (const vint8& x, const unsigned int k) {
5860  return rotl(x, k);
5861 }
5862 
5863 
/// Per-lane bitwise (~a) & b.
OIIO_FORCEINLINE vint8 andnot (const vint8& a, const vint8& b) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_andnot_si256 (a.simd(), b.simd());
#elif OIIO_SIMD_AVX >= 1
    // AVX1 lacks 256-bit integer andnot; operate on the same bits
    // reinterpreted as floats (purely bitwise, so values are preserved).
    return _mm256_castps_si256 (_mm256_andnot_ps (_mm256_castsi256_ps(a.simd()), _mm256_castsi256_ps(b.simd())));
#else
    SIMD_RETURN (vint8, ~(a[i]) & b[i]);
#endif
}
5873 
5874 
5875 // Implementation had to be after the definition of vint8::Zero.
5877  m_simd = (ival != vint8::Zero());
5878 }
5879 
5880 
5881 
5883  // NO INTEGER MODULUS IN SSE!
5884  SIMD_RETURN (vint8, b[i] ? a[i] % b[i] : 0);
5885 }
5886 
5888  return b ? (a % b) : vint8::Zero();
5889 }
5890 
5891 
5892 
5893 
5894 //////////////////////////////////////////////////////////////////////
5895 // vint16 implementation
5896 
5898  m_simd = other.m_simd;
5899  return *this;
5900 }
5901 
5904  return m_val[i];
5905 }
5906 
5909  return m_val[i];
5910 }
5911 
5914  m_val[i] = val;
5915 }
5916 
5917 
5919 #if OIIO_SIMD_AVX >= 512
5920  m_simd = _mm512_set1_epi32 (a);
5921 #else
5922  m_8[0].load (a);
5923  m_8[1].load (a);
5924 #endif
5925 }
5926 
5927 
5928 OIIO_FORCEINLINE void vint16::load (int v0, int v1, int v2, int v3,
5929  int v4, int v5, int v6, int v7,
5930  int v8, int v9, int v10, int v11,
5931  int v12, int v13, int v14, int v15) {
5932 #if OIIO_SIMD_AVX >= 512
5933  m_simd = _mm512_setr_epi32 (v0, v1, v2, v3, v4, v5, v6, v7,
5934  v8, v9, v10, v11, v12, v13, v14, v15);
5935 #else
5936  m_val[ 0] = v0;
5937  m_val[ 1] = v1;
5938  m_val[ 2] = v2;
5939  m_val[ 3] = v3;
5940  m_val[ 4] = v4;
5941  m_val[ 5] = v5;
5942  m_val[ 6] = v6;
5943  m_val[ 7] = v7;
5944  m_val[ 8] = v8;
5945  m_val[ 9] = v9;
5946  m_val[10] = v10;
5947  m_val[11] = v11;
5948  m_val[12] = v12;
5949  m_val[13] = v13;
5950  m_val[14] = v14;
5951  m_val[15] = v15;
5952 #endif
5953 }
5954 
5955 
5957 #if OIIO_SIMD_AVX >= 512
5958  m_simd = _mm512_loadu_si512 ((const simd_t *)values);
5959 #else
5960  m_8[0].load (values);
5961  m_8[1].load (values+8);
5962 #endif
5963 }
5964 
5965 
/// Partial load: read the first n (0 <= n <= 16) ints from values[],
/// zeroing the remaining lanes.
OIIO_FORCEINLINE void vint16::load (const int *values, int n)
{
#if OIIO_SIMD_AVX >= 512
    // ~(0xffff << n) has exactly the low n bits set; masked-zero load
    // touches only those lanes and zeroes the rest.
    m_simd = _mm512_maskz_loadu_epi32 (__mmask16(~(0xffff << n)), values);
#else
    if (n > 8) {
        m_8[0].load (values);
        m_8[1].load (values+8, n-8);
    } else {
        m_8[0].load (values, n);
        m_8[1].clear ();
    }
#endif
}
5980 
5981 
5982 OIIO_FORCEINLINE void vint16::load (const short *values) {
5983 #if OIIO_SIMD_AVX >= 512
5984  m_simd = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)values));
5985 #else
5986  m_8[0].load (values);
5987  m_8[1].load (values+8);
5988 #endif
5989 }
5990 
5991 OIIO_FORCEINLINE void vint16::load (const unsigned short *values) {
5992 #if OIIO_SIMD_AVX >= 512
5993  m_simd = _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)values));
5994 #else
5995  m_8[0].load (values);
5996  m_8[1].load (values+8);
5997 #endif
5998 }
5999 
6000 
6002 #if OIIO_SIMD_AVX >= 512
6003  m_simd = _mm512_cvtepi8_epi32(_mm_loadu_si128((__m128i*)values));
6004 #else
6005  m_8[0].load (values);
6006  m_8[1].load (values+8);
6007 #endif
6008 }
6009 
6010 OIIO_FORCEINLINE void vint16::load (const unsigned char *values) {
6011 #if OIIO_SIMD_AVX >= 512
6012  m_simd = _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)values));
6013 #else
6014  m_8[0].load (values);
6015  m_8[1].load (values+8);
6016 #endif
6017 }
6018 
6019 
6021 
6023  int v4, int v5, int v6, int v7,
6024  int v8, int v9, int v10, int v11,
6025  int v12, int v13, int v14, int v15) {
6026  load (v0, v1, v2, v3, v4, v5, v6, v7,
6027  v8, v9, v10, v11, v12, v13, v14, v15);
6028 }
6029 
6030 OIIO_FORCEINLINE vint16::vint16 (const int *vals) { load (vals); }
6031 OIIO_FORCEINLINE vint16::vint16 (const unsigned short *vals) { load(vals); }
6032 OIIO_FORCEINLINE vint16::vint16 (const short *vals) { load(vals); }
6033 OIIO_FORCEINLINE vint16::vint16 (const unsigned char *vals) { load(vals); }
6034 OIIO_FORCEINLINE vint16::vint16 (const char *vals) { load(vals); }
6035 
6036 OIIO_FORCEINLINE const vint16 & vint16::operator= (int a) { load(a); return *this; }
6037 
6038 
6040 #if OIIO_SIMD_AVX >= 512
6041  m_simd = _mm512_maskz_loadu_epi32 (mask, (const simd_t *)values);
6042 #else
6043  m_8[0].load_mask (mask.lo(), values);
6044  m_8[1].load_mask (mask.hi(), values+8);
6045 #endif
6046 }
6047 
6048 
6050 #if OIIO_SIMD_AVX >= 512
6051  _mm512_mask_storeu_epi32 (values, mask.bitmask(), m_simd);
6052 #else
6053  lo().store_mask (mask.lo(), values);
6054  hi().store_mask (mask.hi(), values+8);
6055 #endif
6056 }
6057 
6058 
/// Gather 16 ints: lane i is read from the address baseptr offset by
/// vindex[i] (scaled by the 'scale' template parameter, per the Intel
/// gather-intrinsic convention -- presumably a byte scale; confirm
/// against the vint4/vint8 gather callers).
template <int scale>
OIIO_FORCEINLINE void
vint16::gather (const value_t *baseptr, const vint_t& vindex) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_i32gather_epi32 (vindex, baseptr, scale);
#else
    // Decompose into two 8-wide gathers on the halves.
    m_8[0].gather<scale> (baseptr, vindex.lo());
    m_8[1].gather<scale> (baseptr, vindex.hi());
#endif
}
6069 
6070 template<int scale>
6071 OIIO_FORCEINLINE void
6072 vint16::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex) {
6073 #if OIIO_SIMD_AVX >= 512
6074  m_simd = _mm512_mask_i32gather_epi32 (m_simd, mask, vindex, baseptr, scale);
6075 #else
6076  m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
6077  m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
6078 #endif
6079 }
6080 
6081 template<int scale>
6082 OIIO_FORCEINLINE void
6083 vint16::scatter (value_t *baseptr, const vint_t& vindex) const {
6084 #if OIIO_SIMD_AVX >= 512
6085  _mm512_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
6086 #else
6087  lo().scatter<scale> (baseptr, vindex.lo());
6088  hi().scatter<scale> (baseptr, vindex.hi());
6089 #endif
6090 }
6091 
6092 template<int scale>
6093 OIIO_FORCEINLINE void
6095  const vint_t& vindex) const {
6096 #if OIIO_SIMD_AVX >= 512
6097  _mm512_mask_i32scatter_epi32 (baseptr, mask, vindex, m_simd, scale);
6098 #else
6099  lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
6100  hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
6101 #endif
6102 }
6103 
6104 
6106 #if OIIO_SIMD_AVX >= 512
6107  // Use an unaligned store -- it's just as fast when the memory turns
6108  // out to be aligned, nearly as fast even when unaligned. Not worth
6109  // the headache of using stores that require alignment.
6110  _mm512_storeu_si512 ((simd_t *)values, m_simd);
6111 #else
6112  lo().store (values);
6113  hi().store (values+8);
6114 #endif
6115 }
6116 
6117 
6119 #if OIIO_SIMD_AVX >= 512
6120  m_simd = _mm512_setzero_si512();
6121 #else
6122  *this = 0;
6123 #endif
6124 }
6125 
6126 
6128 #if OIIO_SIMD_AVX >= 512
6129  return _mm512_setzero_epi32();
6130 #else
6131  return 0;
6132 #endif
6133 }
6134 
6136 
6138 
6139 
6141  return vint16 (start+0*step, start+1*step, start+2*step, start+3*step,
6142  start+4*step, start+5*step, start+6*step, start+7*step,
6143  start+8*step, start+9*step, start+10*step, start+11*step,
6144  start+12*step, start+13*step, start+14*step, start+15*step);
6145 }
6146 
6147 
6149  return vint16 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
6150  1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15);
6151 }
6152 
6153 
6155 #if OIIO_SIMD_AVX >= 512
6156  return _mm512_castsi512_si256 (simd());
6157 #else
6158  return m_8[0];
6159 #endif
6160 }
6161 
6163 #if OIIO_SIMD_AVX >= 512
6164  return _mm512_extracti64x4_epi64 (simd(), 1);
6165 #else
6166  return m_8[1];
6167 #endif
6168 }
6169 
6170 
/// Construct a vint16 from two vint8 halves (lo = lanes 0-7, hi = 8-15).
OIIO_FORCEINLINE vint16::vint16 (const vint8& lo, const vint8 &hi) {
#if OIIO_SIMD_AVX >= 512
    // Widen lo to 512 bits, then insert hi as the upper 8 lanes.
    __m512i r = _mm512_castsi256_si512 (lo);
    m_simd = _mm512_inserti32x8 (r, hi, 1);
#else
    m_8[0] = lo;
    m_8[1] = hi;
#endif
}
6180 
6181 
/// Construct a vint16 from four vint4 quarters, in lane order a,b,c,d.
OIIO_FORCEINLINE vint16::vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d) {
#if OIIO_SIMD_AVX >= 512
    // Broadcast 'a' to fill the register, then overwrite quads 1-3.
    m_simd = _mm512_broadcast_i32x4(a);
    m_simd = _mm512_inserti32x4 (m_simd, b, 1);
    m_simd = _mm512_inserti32x4 (m_simd, c, 2);
    m_simd = _mm512_inserti32x4 (m_simd, d, 3);
#else
    m_8[0] = vint8(a,b);
    m_8[1] = vint8(c,d);
#endif
}
6193 
6194 
6196 #if OIIO_SIMD_AVX >= 512
6197  return _mm512_add_epi32 (a.simd(), b.simd());
6198 #else
6199  return vint16 (a.lo()+b.lo(), a.hi()+b.hi());
6200 #endif
6201 }
6202 
6203 
6205  return a = a + b;
6206 }
6207 
6208 
6210 #if OIIO_SIMD_AVX >= 512
6211  return _mm512_sub_epi32 (_mm512_setzero_si512(), a);
6212 #else
6213  return vint16 (-a.lo(), -a.hi());
6214 #endif
6215 }
6216 
6217 
6219 #if OIIO_SIMD_AVX >= 512
6220  return _mm512_sub_epi32 (a.simd(), b.simd());
6221 #else
6222  return vint16 (a.lo()-b.lo(), a.hi()-b.hi());
6223 #endif
6224 }
6225 
6226 
6228  return a = a - b;
6229 }
6230 
6231 
6233 #if OIIO_SIMD_AVX >= 512
6234  return _mm512_mullo_epi32 (a.simd(), b.simd());
6235 #else
6236  return vint16 (a.lo()*b.lo(), a.hi()*b.hi());
6237 #endif
6238 }
6239 
6240 
6241 OIIO_FORCEINLINE const vint16& operator*= (vint16& a, const vint16& b) { return a = a * b; }
6242 OIIO_FORCEINLINE const vint16& operator*= (vint16& a, int b) { return a = a * b; }
6243 
6244 
6246  // NO INTEGER DIVISION IN AVX512!
6247  SIMD_RETURN (vint16, a[i] / b[i]);
6248 }
6249 
6250 OIIO_FORCEINLINE const vint16& operator/= (vint16& a, const vint16& b) { return a = a / b; }
6251 
6252 
6254  // NO INTEGER MODULUS IN AVX512!
6255  SIMD_RETURN (vint16, a[i] % b[i]);
6256 }
6257 
6258 OIIO_FORCEINLINE const vint16& operator%= (vint16& a, const vint16& b) { return a = a % b; }
6259 
6261  // NO INTEGER MODULUS in AVX512!
6262  SIMD_RETURN (vint16, a[i] % w);
6263 }
6264 
6265 OIIO_FORCEINLINE const vint16& operator%= (vint16& a, int b) { return a = a % b; }
6266 
6267 
6269 #if OIIO_SIMD_AVX >= 512
6270  return _mm512_and_si512 (a.simd(), b.simd());
6271 #else
6272  return vint16 (a.lo() & b.lo(), a.hi() & b.hi());
6273 #endif
6274 }
6275 
6276 OIIO_FORCEINLINE const vint16& operator&= (vint16& a, const vint16& b) { return a = a & b; }
6277 
6279 #if OIIO_SIMD_AVX >= 512
6280  return _mm512_or_si512 (a.simd(), b.simd());
6281 #else
6282  return vint16 (a.lo() | b.lo(), a.hi() | b.hi());
6283 #endif
6284 }
6285 
6286 OIIO_FORCEINLINE const vint16& operator|= (vint16& a, const vint16& b) { return a = a | b; }
6287 
6289 #if OIIO_SIMD_AVX >= 512
6290  return _mm512_xor_si512 (a.simd(), b.simd());
6291 #else
6292  return vint16 (a.lo() ^ b.lo(), a.hi() ^ b.hi());
6293 #endif
6294 }
6295 
6296 OIIO_FORCEINLINE const vint16& operator^= (vint16& a, const vint16& b) { return a = a ^ b; }
6297 
6298 
6300 #if OIIO_SIMD_AVX >= 512
6301  return a ^ a.NegOne();
6302 #else
6303  return vint16 (~a.lo(), ~a.hi());
6304 #endif
6305 }
6306 
6307 
6308 OIIO_FORCEINLINE vint16 operator<< (const vint16& a, const unsigned int bits) {
6309 #if OIIO_SIMD_AVX >= 512
6310  return _mm512_sllv_epi32 (a, vint16(int(bits)));
6311  // return _mm512_slli_epi32 (a, bits);
6312  // FIXME: can this be slli?
6313 #else
6314  return vint16 (a.lo() << bits, a.hi() << bits);
6315 #endif
6316 }
6317 
6318 
6319 OIIO_FORCEINLINE const vint16& operator<<= (vint16& a, const unsigned int bits) {
6320  return a = a << bits;
6321 }
6322 
6323 OIIO_FORCEINLINE vint16 operator>> (const vint16& a, const unsigned int bits) {
6324 #if OIIO_SIMD_AVX >= 512
6325  return _mm512_srav_epi32 (a, vint16(int(bits)));
6326  // FIXME: can this be srai?
6327 #else
6328  return vint16 (a.lo() >> bits, a.hi() >> bits);
6329 #endif
6330 }
6331 
6332 OIIO_FORCEINLINE const vint16& operator>>= (vint16& a, const unsigned int bits) {
6333  return a = a >> bits;
6334 }
6335 
6336 
6337 OIIO_FORCEINLINE vint16 srl (const vint16& a, const unsigned int bits) {
6338 #if OIIO_SIMD_AVX >= 512
6339  return _mm512_srlv_epi32 (a, vint16(int(bits)));
6340  // FIXME: can this be srli?
6341 #else
6342  return vint16 (srl(a.lo(), bits), srl (a.hi(), bits));
6343 #endif
6344 }
6345 
6346 
6348 #if OIIO_SIMD_AVX >= 512
6349  return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 0 /*_MM_CMPINT_EQ*/);
6350 #else /* Fall back to 8-wide */
6351  return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
6352 #endif
6353 }
6354 
6355 
6357 #if OIIO_SIMD_AVX >= 512
6358  return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 4 /*_MM_CMPINT_NEQ*/);
6359 #else /* Fall back to 8-wide */
6360  return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
6361 #endif
6362 }
6363 
6364 
6366 #if OIIO_SIMD_AVX >= 512
6367  return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 6 /*_MM_CMPINT_NLE*/);
6368 #else /* Fall back to 8-wide */
6369  return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
6370 #endif
6371 }
6372 
6373 
6375 #if OIIO_SIMD_AVX >= 512
6376  return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 1 /*_MM_CMPINT_LT*/);
6377 #else /* Fall back to 8-wide */
6378  return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
6379 #endif
6380 }
6381 
6382 
6384 #if OIIO_SIMD_AVX >= 512
6385  return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 5 /*_MM_CMPINT_NLT*/);
6386 #else /* Fall back to 8-wide */
6387  return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
6388 #endif
6389 }
6390 
6391 
6393 #if OIIO_SIMD_AVX >= 512
6394  return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 2 /*_MM_CMPINT_LE*/);
6395 #else /* Fall back to 8-wide */
6396  return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
6397 #endif
6398 }
6399 
6400 
6401 inline std::ostream& operator<< (std::ostream& cout, const vint16& val) {
6402  cout << val[0];
6403  for (int i = 1; i < val.elements; ++i)
6404  cout << ' ' << val[i];
6405  return cout;
6406 }
6407 
6408 
6409 
/// Partial store: write only the first n (0 <= n <= 16) int lanes.
OIIO_FORCEINLINE void vint16::store (int *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if 0 && OIIO_SIMD_AVX >= 512
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm512_mask_storeu_epi32 (values, __mmask16(~(0xffff << n)), m_simd);
#else
    // Decompose into partial stores of the two 8-wide halves.
    if (n > 8) {
        m_8[0].store (values);
        m_8[1].store (values+8, n-8);
    } else {
        m_8[0].store (values, n);   // high half intentionally not stored
    }
#endif
}
6426 
6427 
6428 OIIO_FORCEINLINE void vint16::store (unsigned short *values) const {
6429 #if OIIO_SIMD_AVX512
6430  _mm512_mask_cvtepi32_storeu_epi16 (values, __mmask16(0xff), m_simd);
6431 #elif OIIO_SIMD_AVX >= 2
6432  lo().store (values);
6433  hi().store (values+8);
6434 #else
6435  SIMD_DO (values[i] = m_val[i]);
6436 #endif
6437 }
6438 
6439 
6440 OIIO_FORCEINLINE void vint16::store (unsigned char *values) const {
6441 #if OIIO_SIMD_AVX512
6442  _mm512_mask_cvtepi32_storeu_epi8 (values, __mmask16(0xff), m_simd);
6443 #elif OIIO_SIMD_AVX >= 2
6444  lo().store (values);
6445  hi().store (values+8);
6446 #else
6447  SIMD_DO (values[i] = m_val[i]);
6448 #endif
6449 }
6450 
6451 
6452 
// Shuffle groups of 4: treat the vint16 as four vint4 "quads" and build
// the result as { quad i0, quad i1, quad i2, quad i3 }.
template<int i0, int i1, int i2, int i3>
vint16 shuffle4 (const vint16& a) {
#if OIIO_SIMD_AVX >= 512
    // No direct epi32 quad-shuffle; reinterpret as floats (bitwise no-op)
    // and use the 128-bit-lane float shuffle.
    __m512 x = _mm512_castsi512_ps(a);
    return _mm512_castps_si512(_mm512_shuffle_f32x4(x,x,_MM_SHUFFLE(i3,i2,i1,i0)));
#else
    // Scalar fallback: spill to memory and reassemble from vint4 pieces.
    vint4 x[4];
    a.store ((int *)x);
    return vint16 (x[i0], x[i1], x[i2], x[i3]);
#endif
}
6465 
6466 template<int i> vint16 shuffle4 (const vint16& a) {
6467  return shuffle4<i,i,i,i> (a);
6468 }
6469 
6470 template<int i0, int i1, int i2, int i3>
6471 vint16 shuffle (const vint16& a) {
6472 #if OIIO_SIMD_AVX >= 512
6473  __m512 x = _mm512_castsi512_ps(a);
6474  return _mm512_castps_si512(_mm512_permute_ps(x,_MM_SHUFFLE(i3,i2,i1,i0)));
6475 #else
6476  vint4 x[4];
6477  a.store ((int *)x);
6478  return vint16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
6479  shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
6480 #endif
6481 }
6482 
6483 template<int i> vint16 shuffle (const vint16& a) {
6484  return shuffle<i,i,i,i> (a);
6485 }
6486 
6487 
6488 template<int i>
6490  return a[i];
6491 }
6492 
6493 
6494 template<int i>
6496  vint16 tmp = a;
6497  tmp[i] = val;
6498  return tmp;
6499 }
6500 
6501 
6503 #if OIIO_SIMD_AVX >= 512
6504  return _mm_cvtsi128_si32(_mm512_castsi512_si128(m_simd));
6505 #else
6506  return m_val[0];
6507 #endif
6508 }
6509 
6510 OIIO_FORCEINLINE int vint16::y () const { return m_val[1]; }
6511 OIIO_FORCEINLINE int vint16::z () const { return m_val[2]; }
6512 OIIO_FORCEINLINE int vint16::w () const { return m_val[3]; }
6517 
6518 
6520 {
6521 #if OIIO_SIMD_AVX >= 512
6522  return _mm512_maskz_set1_epi32 (x, -1);
6523 #else
6524  return vint16 (bitcast_to_int(x.lo()), bitcast_to_int(x.hi()));
6525 #endif
6526 }
6527 
6528 
6530 #if OIIO_SIMD_AVX >= 512
6531  // Nomenclature: ABCD are the vint4's comprising v
6532  // First, add the vint4's and make them all the same
6533  vint16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed
6534  vint16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD); // ABCD in all quads
6535  // Now, add within each vint4
6536  vint16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w); // each adjacent int is summed
6537  return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
6538 #else
6539  vint8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
6540  return vint16 (sum, sum);
6541 #endif
6542 }
6543 
6544 
6546 #if OIIO_SIMD_AVX >= 512
6547  return vreduce_add(v).x();
6548 #else
6549  return reduce_add(v.lo()) + reduce_add(v.hi());
6550 #endif
6551 }
6552 
6553 
6555 #if OIIO_SIMD_AVX >= 512
6556  // Nomenclature: ABCD are the vint4's comprising v
6557  // First, and the vint4's and make them all the same
6558  vint16 AB_AB_CD_CD = v & shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed
6559  vint16 w = AB_AB_CD_CD & shuffle4<2,3,0,1>(AB_AB_CD_CD);
6560  // Now, and within each vint4
6561  vint16 ab_ab_cd_cd = w & shuffle<1,0,3,2>(w); // each adjacent int is summed
6562  vint16 r = ab_ab_cd_cd & shuffle<2,3,0,1>(ab_ab_cd_cd);
6563  return r.x();
6564 #else
6565  return reduce_and(v.lo()) & reduce_and(v.hi());
6566 #endif
6567 }
6568 
6569 
6571 #if OIIO_SIMD_AVX >= 512
6572  // Nomenclature: ABCD are the vint4's comprising v
6573  // First, or the vint4's or make them all the same
6574  vint16 AB_AB_CD_CD = v | shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed
6575  vint16 w = AB_AB_CD_CD | shuffle4<2,3,0,1>(AB_AB_CD_CD);
6576  // Now, or within each vint4
6577  vint16 ab_ab_cd_cd = w | shuffle<1,0,3,2>(w); // each adjacent int is summed
6578  vint16 r = ab_ab_cd_cd | shuffle<2,3,0,1>(ab_ab_cd_cd);
6579  return r.x();
6580 #else
6581  return reduce_or(v.lo()) | reduce_or(v.hi());
6582 #endif
6583 }
6584 
6585 
6586 
/// Per-lane select: result[i] = mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vint16 blend (const vint16& a, const vint16& b, const vbool16& mask) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_mask_blend_epi32 (mask, a, b);
#else
    return vint16 (blend (a.lo(), b.lo(), mask.lo()),
                   blend (a.hi(), b.hi(), mask.hi()));
#endif
}
6595 
6596 
6598 #if OIIO_SIMD_AVX >= 512
6599  return _mm512_maskz_mov_epi32 (mask, a);
6600 #else
6601  return vint16 (blend0 (a.lo(), mask.lo()),
6602  blend0 (a.hi(), mask.hi()));
6603 #endif
6604 }
6605 
6606 
6608 #if OIIO_SIMD_AVX >= 512
6609  return _mm512_maskz_mov_epi32 (!mask, a);
6610 #else
6611  return vint16 (blend0not (a.lo(), mask.lo()),
6612  blend0not (a.hi(), mask.hi()));
6613 #endif
6614 }
6615 
6616 OIIO_FORCEINLINE vint16 select (const vbool16& mask, const vint16& a, const vint16& b) {
6617  return blend (b, a, mask);
6618 }
6619 
6620 
6622 #if OIIO_SIMD_AVX >= 512
6623  return _mm512_abs_epi32(a.simd());
6624 #else
6625  return vint16 (abs(a.lo()), abs(a.hi()));
6626 #endif
6627 }
6628 
6629 
6630 OIIO_FORCEINLINE vint16 min (const vint16& a, const vint16& b) {
6631 #if OIIO_SIMD_AVX >= 512
6632  return _mm512_min_epi32 (a, b);
6633 #else
6634  return vint16 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
6635 #endif
6636 }
6637 
6638 
6639 OIIO_FORCEINLINE vint16 max (const vint16& a, const vint16& b) {
6640 #if OIIO_SIMD_AVX >= 512
6641  return _mm512_max_epi32 (a, b);
6642 #else
6643  return vint16 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
6644 #endif
6645 }
6646 
6647 
6649 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6650  // return _mm512_rol_epi32 (x, s);
6651  // We want to do this ^^^ but this intrinsic only takes an *immediate*
6652  // argument for s, and there isn't a way to express in C++ that a
6653  // parameter must be an immediate/literal value from the caller.
6654  return (x<<s) | srl(x,32-s);
6655 #else
6656  return (x<<s) | srl(x,32-s);
6657 #endif
6658 }
6659 
6660 // DEPRECATED (2.1)
6661 OIIO_FORCEINLINE vint16 rotl32 (const vint16& x, const unsigned int k) {
6662  return rotl(x, k);
6663 }
6664 
6665 
6667 #if OIIO_SIMD_AVX >= 512
6668  return _mm512_andnot_epi32 (a.simd(), b.simd());
6669 #else
6670  return vint16 (andnot(a.lo(), b.lo()), andnot(a.hi(), b.hi()));
6671 #endif
6672 }
6673 
6674 
6675 
6677  // NO INTEGER MODULUS IN SSE!
6678  SIMD_RETURN (vint16, b[i] ? a[i] % b[i] : 0);
6679 }
6680 
6682  return b ? (a % b) : vint16::Zero();
6683 }
6684 
6685 
6686 
6687 
6688 
6689 //////////////////////////////////////////////////////////////////////
6690 // vfloat4 implementation
6691 
6692 
6694 #if OIIO_SIMD_SSE
6695  m_simd = _mm_cvtepi32_ps (ival.simd());
6696 #elif OIIO_SIMD_NEON
6697  m_simd = vcvtq_f32_s32(ival.simd());
6698 #else
6699  SIMD_CONSTRUCT (float(ival[i]));
6700 #endif
6701 }
6702 
6703 
6705 #if OIIO_SIMD_SSE
6706  return _mm_setzero_ps();
6707 #else
6708  return vfloat4(0.0f);
6709 #endif
6710 }
6711 
6713  return vfloat4(1.0f);
6714 }
6715 
6716 OIIO_FORCEINLINE const vfloat4 vfloat4::Iota (float start, float step) {
6717  return vfloat4 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step);
6718 }
6719 
6720 /// Set all components to 0.0
6722 #if OIIO_SIMD_SSE
6723  m_simd = _mm_setzero_ps();
6724 #else
6725  load (0.0f);
6726 #endif
6727 }
6728 
6729 
6732  return m_val[i];
6733 }
6734 
6737  return m_val[i];
6738 }
6739 
6740 
6742 #if OIIO_SIMD_SSE
6743  m_simd = _mm_set1_ps (val);
6744 #elif OIIO_SIMD_NEON
6745  m_simd = vdupq_n_f32 (val);
6746 #else
6747  SIMD_CONSTRUCT (val);
6748 #endif
6749 }
6750 
6751 OIIO_FORCEINLINE void vfloat4::load (float a, float b, float c, float d) {
6752 #if OIIO_SIMD_SSE
6753  m_simd = _mm_set_ps (d, c, b, a);
6754 #elif OIIO_SIMD_NEON
6755  float values[4] = { a, b, c, d };
6756  m_simd = vld1q_f32 (values);
6757 #else
6758  m_val[0] = a;
6759  m_val[1] = b;
6760  m_val[2] = c;
6761  m_val[3] = d;
6762 #endif
6763 }
6764 
6765  /// Load from an array of 4 values
6767 #if OIIO_SIMD_SSE
6768  m_simd = _mm_loadu_ps (values);
6769 #elif OIIO_SIMD_NEON
6770  m_simd = vld1q_f32 (values);
6771 #else
6772  SIMD_CONSTRUCT (values[i]);
6773 #endif
6774 }
6775 
6776 
6777 OIIO_FORCEINLINE void vfloat4::load (const float *values, int n) {
6778  OIIO_DASSERT (n >= 0 && n <= elements);
6779 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6780  m_simd = _mm_maskz_loadu_ps (__mmask8(~(0xf << n)), values);
6781 #elif OIIO_SIMD_SSE
6782  switch (n) {
6783  case 1:
6784  m_simd = _mm_load_ss (values);
6785  break;
6786  case 2:
6787  // Trickery: load one double worth of bits!
6788  m_simd = _mm_castpd_ps (_mm_load_sd ((const double*)values));
6789  break;
6790  case 3:
6791  m_simd = _mm_setr_ps (values[0], values[1], values[2], 0.0f);
6792  // This looks wasteful, but benchmarks show that it's the
6793  // fastest way to set 3 values with the 4th getting zero.
6794  // Actually, gcc and clang both turn it into something more
6795  // efficient than _mm_setr_ps. The version below looks smart,
6796  // but was much more expensive as the _mm_setr_ps!
6797  // __m128 xy = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)values));
6798  // m_simd = _mm_movelh_ps(xy, _mm_load_ss (values + 2));
6799  break;
6800  case 4:
6801  m_simd = _mm_loadu_ps (values);
6802  break;
6803  default:
6804  clear();
6805  break;
6806  }
6807 #elif OIIO_SIMD_NEON
6808  //switch (n) {
6809  //case 1: m_simd = vdupq_n_f32(0); m_simd[0] = values[0]; break;
6810  //case 2: load (values[0], values[1], 0.0f, 0.0f); break;
6811  //case 3: load (values[0], values[1], values[2], 0.0f); break;
6812  //case 4: m_simd = vld1q_f32 (values); break;
6813  //default: break;
6814  m_simd = vld1q_f32(values);
6815  switch (n) {
6816  case 1: m_simd = vsetq_lane_f32(0.0f, m_simd, 1);
6817  case 2: m_simd = vsetq_lane_f32(0.0f, m_simd, 2);
6818  case 3: m_simd = vsetq_lane_f32(0.0f, m_simd, 3);
6819  default: break;
6820  }
6821 #else
6822  for (int i = 0; i < n; ++i)
6823  m_val[i] = values[i];
6824  for (int i = n; i < paddedelements; ++i)
6825  m_val[i] = 0;
6826 #endif
6827 }
6828 
6829 
6830 OIIO_FORCEINLINE void vfloat4::load (const unsigned short *values) {
6831 #if OIIO_SIMD_SSE >= 2
6832  m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6833  // You might guess that the following is faster, but it's NOT:
6834  // NO! m_simd = _mm_cvtpu16_ps (*(__m64*)values);
6835 #else
6836  SIMD_CONSTRUCT (values[i]);
6837 #endif
6838 }
6839 
6840 
6842 #if OIIO_SIMD_SSE >= 2
6843  m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6844 #else
6845  SIMD_CONSTRUCT (values[i]);
6846 #endif
6847 }
6848 
6849 
6850 OIIO_FORCEINLINE void vfloat4::load (const unsigned char *values) {
6851 #if OIIO_SIMD_SSE >= 2
6852  m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6853 #else
6854  SIMD_CONSTRUCT (values[i]);
6855 #endif
6856 }
6857 
6858 // Load from an array of 4 char values, convert to float
6860 #if OIIO_SIMD_SSE >= 2
6861  m_simd = _mm_cvtepi32_ps (vint4(values).simd());
6862 #else
6863  SIMD_CONSTRUCT (values[i]);
6864 #endif
6865 }
6866 
6867 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6868 OIIO_FORCEINLINE void vfloat4::load (const half *values) {
6869 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6870  /* Enabled 16 bit float instructions! */
6871  __m128i a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
6872  m_simd = _mm_cvtph_ps (a);
6873 #elif OIIO_SIMD_SSE >= 2
6874  // SSE half-to-float by Fabian "ryg" Giesen. Public domain.
6875  // https://gist.github.com/rygorous/2144712
6876  vint4 h ((const unsigned short *)values);
6877 # define CONSTI(name) *(const __m128i *)&name
6878 # define CONSTF(name) *(const __m128 *)&name
6879  OIIO_SIMD_UINT4_CONST(mask_nosign, 0x7fff);
6880  OIIO_SIMD_UINT4_CONST(magic, (254 - 15) << 23);
6881  OIIO_SIMD_UINT4_CONST(was_infnan, 0x7bff);
6882  OIIO_SIMD_UINT4_CONST(exp_infnan, 255 << 23);
6883  __m128i mnosign = CONSTI(mask_nosign);
6884  __m128i expmant = _mm_and_si128(mnosign, h);
6885  __m128i justsign = _mm_xor_si128(h, expmant);
6886  __m128i expmant2 = expmant; // copy (just here for counting purposes)
6887  __m128i shifted = _mm_slli_epi32(expmant, 13);
6888  __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
6889  __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONSTI(was_infnan));
6890  __m128i sign = _mm_slli_epi32(justsign, 16);
6891  __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
6892  __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
6893  __m128 final = _mm_or_ps(scaled, sign_inf);
6894  // ~11 SSE2 ops.
6895  m_simd = final;
6896 # undef CONSTI
6897 # undef CONSTF
6898 #elif OIIO_SIMD_NEON
6899  vint4 h ((const unsigned short *)values);
6900  uint32x4_t u32 = vreinterpretq_u32_s32(h);
6901  uint16x4_t u16 = vmovn_u32(u32);
6902  float16x4_t f16 = vreinterpret_f16_u16(u16);
6903  m_simd = vcvt_f32_f16(f16);
6904 #else /* No SIMD defined: */
6905  SIMD_CONSTRUCT (values[i]);
6906 #endif
6907 }
6908 #endif /* _HALF_H_ or _IMATH_H_ */
6909 
6910 OIIO_FORCEINLINE void
6911 vfloat4::load_pairs(const float* lo, const float* hi)
6912 {
6913 #if OIIO_SIMD_SSE
6914  m_simd = _mm_loadh_pi(_mm_loadl_pi(Zero(), (__m64*)lo), (__m64*)hi);
6915 #else
6916  m_val[0] = lo[0];
6917  m_val[1] = lo[1];
6918  m_val[2] = hi[0];
6919  m_val[3] = hi[1];
6920 #endif
6921 }
6922 
6923 OIIO_FORCEINLINE void vfloat4::store (float *values) const {
6924 #if OIIO_SIMD_SSE
6925  // Use an unaligned store -- it's just as fast when the memory turns
6926  // out to be aligned, nearly as fast even when unaligned. Not worth
6927  // the headache of using stores that require alignment.
6928  _mm_storeu_ps (values, m_simd);
6929 #elif OIIO_SIMD_NEON
6930  vst1q_f32 (values, m_simd);
6931 #else
6932  SIMD_DO (values[i] = m_val[i]);
6933 #endif
6934 }
6935 
6936 OIIO_FORCEINLINE void vfloat4::store (float *values, int n) const {
6937  OIIO_DASSERT (n >= 0 && n <= 4);
6938 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6939  // This SHOULD be fast, but in my benchmarks, it is slower!
6940  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
6941  // Re-test this periodically with new Intel hardware.
6942  _mm_mask_storeu_ps (values, __mmask8(~(0xf << n)), m_simd);
6943 #elif OIIO_SIMD_SSE
6944  switch (n) {
6945  case 1:
6946  _mm_store_ss (values, m_simd);
6947  break;
6948  case 2:
6949  // Trickery: store two floats as a double worth of bits
6950  _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
6951  break;
6952  case 3:
6953  values[0] = m_val[0];
6954  values[1] = m_val[1];
6955  values[2] = m_val[2];
6956  // This looks wasteful, but benchmarks show that it's the
6957  // fastest way to store 3 values, in benchmarks was faster than
6958  // this, below:
6959  // _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
6960  // _mm_store_ss (values + 2, _mm_movehl_ps(m_simd,m_simd));
6961  break;
6962  case 4:
6963  store (values);
6964  break;
6965  default:
6966  break;
6967  }
6968 #elif OIIO_SIMD_NEON
6969  switch (n) {
6970  case 1:
6971  vst1q_lane_f32 (values, m_simd, 0);
6972  break;
6973  case 2:
6974  vst1q_lane_f32 (values++, m_simd, 0);
6975  vst1q_lane_f32 (values, m_simd, 1);
6976  break;
6977  case 3:
6978  vst1q_lane_f32 (values++, m_simd, 0);
6979  vst1q_lane_f32 (values++, m_simd, 1);
6980  vst1q_lane_f32 (values, m_simd, 2);
6981  break;
6982  case 4:
6983  vst1q_f32 (values, m_simd); break;
6984  default:
6985  break;
6986  }
6987 #else
6988  for (int i = 0; i < n; ++i)
6989  values[i] = m_val[i];
6990 #endif
6991 }
6992 
6993 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6994 OIIO_FORCEINLINE void vfloat4::store (half *values) const {
6995 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6996  __m128i h = _mm_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
6997  _mm_store_sd ((double *)values, _mm_castsi128_pd(h));
6998 #elif OIIO_SIMD_NEON
6999  float16x4_t f16 = vcvt_f16_f32(m_simd);
7000  uint16x4_t u16 = vreinterpret_u16_f16(f16);
7001  vst1_u16((unsigned short*)values, u16);
7002 #else
7003  SIMD_DO (values[i] = m_val[i]);
7004 #endif
7005 }
7006 #endif
7007 
7008 
7009 OIIO_FORCEINLINE void vfloat4::load_mask (int mask, const float *values) {
7010 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7011  m_simd = _mm_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
7012 #elif OIIO_SIMD_AVX
7013  m_simd = _mm_maskload_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
7014 #else
7015  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
7016 #endif
7017 }
7018 
7019 
7020 OIIO_FORCEINLINE void vfloat4::load_mask (const vbool_t& mask, const float *values) {
7021 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7022  m_simd = _mm_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
7023 #elif OIIO_SIMD_AVX
7024  m_simd = _mm_maskload_ps (values, _mm_castps_si128(mask));
7025 #else
7026  SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
7027 #endif
7028 }
7029 
7030 
7031 OIIO_FORCEINLINE void vfloat4::store_mask (int mask, float *values) const {
7032 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7033  _mm_mask_storeu_ps (values, __mmask8(mask), m_simd);
7034 #elif OIIO_SIMD_AVX
7035  _mm_maskstore_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
7036 #else
7037  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
7038 #endif
7039 }
7040 
7041 
7042 OIIO_FORCEINLINE void vfloat4::store_mask (const vbool_t& mask, float *values) const {
7043 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7044  _mm_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
7045 #elif OIIO_SIMD_AVX
7046  _mm_maskstore_ps (values, _mm_castps_si128(mask.simd()), m_simd);
7047 #else
7048  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
7049 #endif
7050 }
7051 
7052 
7053 template <int scale>
7054 OIIO_FORCEINLINE void
7055 vfloat4::gather (const value_t *baseptr, const vint_t& vindex)
7056 {
7057 #if OIIO_SIMD_AVX >= 2
7058  m_simd = _mm_i32gather_ps (baseptr, vindex, scale);
7059 #else
7060  SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
7061 #endif
7062 }
7063 
7064 template<int scale>
7065 OIIO_FORCEINLINE void
7066 vfloat4::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
7067 {
7068 #if OIIO_SIMD_AVX >= 2
7069  m_simd = _mm_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
7070 #else
7071  SIMD_DO (if (mask[i]) m_val[i] = *(const value_t *)((const char *)baseptr + vindex[i]*scale));
7072 #endif
7073 }
7074 
7075 template<int scale>
7076 OIIO_FORCEINLINE void
7077 vfloat4::scatter (value_t *baseptr, const vint_t& vindex) const
7078 {
7079 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7080  // FIXME: disable because it benchmarks slower than the dumb way
7081  _mm_i32scatter_ps (baseptr, vindex, m_simd, scale);
7082 #else
7083  SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
7084 #endif
7085 }
7086 
7087 template<int scale>
7088 OIIO_FORCEINLINE void
7090  const vint_t& vindex) const
7091 {
7092 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7093  // FIXME: disable because it benchmarks slower than the dumb way
7094  _mm_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
7095 #else
7096  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
7097 #endif
7098 }
7099 
7100 
7102 #if OIIO_SIMD_SSE
7103  return _mm_add_ps (a.m_simd, b.m_simd);
7104 #elif OIIO_SIMD_NEON
7105  return vaddq_f32 (a.m_simd, b.m_simd);
7106 #else
7107  SIMD_RETURN (vfloat4, a[i] + b[i]);
7108 #endif
7109 }
7110 
7112 #if OIIO_SIMD_SSE
7113  m_simd = _mm_add_ps (m_simd, a.m_simd);
7114 #elif OIIO_SIMD_NEON
7115  m_simd = vaddq_f32 (m_simd, a.m_simd);
7116 #else
7117  SIMD_DO (m_val[i] += a[i]);
7118 #endif
7119  return *this;
7120  }
7121 
7123 #if OIIO_SIMD_SSE
7124  return _mm_sub_ps (_mm_setzero_ps(), m_simd);
7125 #elif OIIO_SIMD_NEON
7126  return vsubq_f32 (Zero(), m_simd);
7127 #else
7128  SIMD_RETURN (vfloat4, -m_val[i]);
7129 #endif
7130 }
7131 
7133 #if OIIO_SIMD_SSE
7134  return _mm_sub_ps (a.m_simd, b.m_simd);
7135 #elif OIIO_SIMD_NEON
7136  return vsubq_f32 (a.m_simd, b.m_simd);
7137 #else
7138  SIMD_RETURN (vfloat4, a[i] - b[i]);
7139 #endif
7140 }
7141 
7143 #if OIIO_SIMD_SSE
7144  m_simd = _mm_sub_ps (m_simd, a.m_simd);
7145 #elif OIIO_SIMD_NEON
7146  m_simd = vsubq_f32 (m_simd, a.m_simd);
7147 #else
7148  SIMD_DO (m_val[i] -= a[i]);
7149 #endif
7150  return *this;
7151 }
7152 
7154 #if OIIO_SIMD_SSE
7155  return _mm_mul_ps (a.m_simd, _mm_set1_ps(b));
7156 #elif OIIO_SIMD_NEON
7157  return vmulq_n_f32 (a.m_simd, b);
7158 #else
7159  SIMD_RETURN (vfloat4, a[i] * b);
7160 #endif
7161 }
7162 
7164  return b * a;
7165 }
7166 
7168 #if OIIO_SIMD_SSE
7169  return _mm_mul_ps (a.m_simd, b.m_simd);
7170 #elif OIIO_SIMD_NEON
7171  return vmulq_f32 (a.m_simd, b.m_simd);
7172 #else
7173  SIMD_RETURN (vfloat4, a[i] * b[i]);
7174 #endif
7175 }
7176 
7178 #if OIIO_SIMD_SSE
7179  m_simd = _mm_mul_ps (m_simd, a.m_simd);
7180 #elif OIIO_SIMD_NEON
7181  m_simd = vmulq_f32 (m_simd, a.m_simd);
7182 #else
7183  SIMD_DO (m_val[i] *= a[i]);
7184 #endif
7185  return *this;
7186 }
7187 
7189 #if OIIO_SIMD_SSE
7190  m_simd = _mm_mul_ps (m_simd, _mm_set1_ps(val));
7191 #elif OIIO_SIMD_NEON
7192  m_simd = vmulq_n_f32 (m_simd, val);
7193 #else
7194  SIMD_DO (m_val[i] *= val);
7195 #endif
7196  return *this;
7197 }
7198 
7200 #if OIIO_SIMD_SSE
7201  return _mm_div_ps (a.m_simd, b.m_simd);
7202 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7203  return vdivq_f32 (a.m_simd, b.m_simd);
7204 #else
7205  SIMD_RETURN (vfloat4, a[i] / b[i]);
7206 #endif
7207 }
7208 
7210 #if OIIO_SIMD_SSE
7211  m_simd = _mm_div_ps (m_simd, a.m_simd);
7212 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7213  m_simd = vdivq_f32 (m_simd, a.m_simd);
7214 #else
7215  SIMD_DO (m_val[i] /= a[i]);
7216 #endif
7217  return *this;
7218 }
7219 
7221 #if OIIO_SIMD_SSE
7222  m_simd = _mm_div_ps (m_simd, _mm_set1_ps(val));
7223 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7224  m_simd = vdivq_f32 (m_simd, vfloat4(val));
7225 #else
7226  SIMD_DO (m_val[i] /= val);
7227 #endif
7228  return *this;
7229 }
7230 
7232 #if OIIO_SIMD_SSE
7233  return _mm_cmpeq_ps (a.m_simd, b.m_simd);
7234 #elif OIIO_SIMD_NEON
7235  return vceqq_f32 (a.m_simd, b.m_simd);
7236 #else
7237  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
7238 #endif
7239 }
7240 
7242 #if OIIO_SIMD_SSE
7243  return _mm_cmpneq_ps (a.m_simd, b.m_simd);
7244 #elif OIIO_SIMD_NEON
7245  // implemented as NOT(a == b)
7246  return vmvnq_u32(vceqq_f32 (a.m_simd, b.m_simd));
7247 #else
7248  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
7249 #endif
7250 }
7251 
7253 #if OIIO_SIMD_SSE
7254  return _mm_cmplt_ps (a.m_simd, b.m_simd);
7255 #elif OIIO_SIMD_NEON
7256  return vcltq_f32 (a.m_simd, b.m_simd);
7257 #else
7258  SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
7259 #endif
7260 }
7261 
7263 #if OIIO_SIMD_SSE
7264  return _mm_cmpgt_ps (a.m_simd, b.m_simd);
7265 #elif OIIO_SIMD_NEON
7266  return vcgtq_f32 (a.m_simd, b.m_simd);
7267 #else
7268  SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
7269 #endif
7270 }
7271 
7273 #if OIIO_SIMD_SSE
7274  return _mm_cmpge_ps (a.m_simd, b.m_simd);
7275 #elif OIIO_SIMD_NEON
7276  return vcgeq_f32 (a.m_simd, b.m_simd);
7277 #else
7278  SIMD_RETURN (vbool4, a[i] >= b[i] ? -1 : 0);
7279 #endif
7280 }
7281 
7283 #if OIIO_SIMD_SSE
7284  return _mm_cmple_ps (a.m_simd, b.m_simd);
7285 #elif OIIO_SIMD_NEON
7286  return vcleq_f32 (a.m_simd, b.m_simd);
7287 #else
7288  SIMD_RETURN (vbool4, a[i] <= b[i] ? -1 : 0);
7289 #endif
7290 }
7291 
7293 #if OIIO_SIMD_SSE
7294  return _mm_movelh_ps (a.m_simd, b.m_simd);
7295 #else
7296  return vfloat4 (a[0], a[1], b[0], b[1]);
7297 #endif
7298 }
7299 
7301 #if OIIO_SIMD_SSE
7302  return _mm_unpacklo_ps (a.m_simd, b.m_simd);
7303 #else
7304  return vfloat4 (a[0], b[0], a[1], b[1]);
7305 #endif
7306 }
7307 
7309  return insert<3>(*this, 0.0f);
7310 }
7311 
7313  return insert<3>(*this, 1.0f);
7314 }
7315 
7316 inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val) {
7317  cout << val[0];
7318  for (int i = 1; i < val.elements; ++i)
7319  cout << ' ' << val[i];
7320  return cout;
7321 }
7322 
7323 
7324 // Implementation had to be after the definition of vfloat4.
7326 {
7327 #if OIIO_SIMD_SSE
7328  m_simd = _mm_cvttps_epi32(f.simd());
7329 #elif OIIO_SIMD_NEON
7330  m_simd = vcvtq_s32_f32(f.simd());
7331 #else
7332  SIMD_CONSTRUCT ((int) f[i]);
7333 #endif
7334 }
7335 
7336 
7337 template<int i0, int i1, int i2, int i3>
7339 #if OIIO_SIMD_SSE
7340  return shuffle_sse<i0,i1,i2,i3> (__m128(a));
7341 #else
7342  return vfloat4(a[i0], a[i1], a[i2], a[i3]);
7343 #endif
7344 }
7345 
7346 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); }
7347 
#if OIIO_SIMD_NEON
// NEON specializations of the single-index broadcast: dup one lane of
// the appropriate 64-bit half across the whole register.
template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) {
    return vdupq_lane_f32(vget_low_f32(a.simd()), 0);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) {
    return vdupq_lane_f32(vget_low_f32(a.simd()), 1);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) {
    return vdupq_lane_f32(vget_high_f32(a.simd()), 0);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
    return vdupq_lane_f32(vget_high_f32(a.simd()), 1);
}
#endif
7362 
7363 
7364 template<int i>
7365 OIIO_FORCEINLINE vfloat4
7366 shuffle(const vfloat4& a, const vfloat4& b)
7367 {
7368 #if OIIO_SIMD_SSE
7369  return vfloat4(_mm_shuffle_ps(a, b, i));
7370 #else
7371  return vfloat4(a[i & 0x03], a[(i >> 2) & (0x03)],
7372  b[(i >> 4) & 0x03], b[(i >> 6) & (0x03)]);
7373 #endif
7374 }
7375 
7376 
7377 /// Helper: as rapid as possible extraction of one component, when the
7378 /// index is fixed.
7379 template<int i>
7381 #if OIIO_SIMD_SSE
7382  return _mm_cvtss_f32(shuffle_sse<i,i,i,i>(a.simd()));
7383 #elif OIIO_SIMD_NEON
7384  return vgetq_lane_f32(a.simd(), i);
7385 #else
7386  return a[i];
7387 #endif
7388 }
7389 
#if OIIO_SIMD_SSE
// Fast special case: component 0 needs no shuffle at all.
template<> OIIO_FORCEINLINE float extract<0> (const vfloat4& a)
{
    return _mm_cvtss_f32 (a.simd());
}
#endif
7395 
7396 
7397 /// Helper: substitute val for a[i]
7398 template<int i>
7400 #if OIIO_SIMD_SSE >= 4
7401  return _mm_insert_ps (a, _mm_set_ss(val), i<<4);
7402 #elif OIIO_SIMD_NEON
7403  return vld1q_lane_f32(&val, a.simd(), i);
7404 #else
7405  vfloat4 tmp = a;
7406  tmp[i] = val;
7407  return tmp;
7408 #endif
7409 }
7410 
#if OIIO_SIMD_SSE
// Slightly faster special case for SSE: lane 0 via movss, no insertps.
template<> OIIO_FORCEINLINE vfloat4 insert<0> (const vfloat4& a, float val)
{
    return _mm_move_ss (a.simd(), _mm_set_ss(val));
}
#endif
7417 
7418 
7419 OIIO_FORCEINLINE float vfloat4::x () const { return extract<0>(*this); }
7420 OIIO_FORCEINLINE float vfloat4::y () const { return extract<1>(*this); }
7421 OIIO_FORCEINLINE float vfloat4::z () const { return extract<2>(*this); }
7422 OIIO_FORCEINLINE float vfloat4::w () const { return extract<3>(*this); }
7423 OIIO_FORCEINLINE void vfloat4::set_x (float val) { *this = insert<0>(*this, val); }
7424 OIIO_FORCEINLINE void vfloat4::set_y (float val) { *this = insert<1>(*this, val); }
7425 OIIO_FORCEINLINE void vfloat4::set_z (float val) { *this = insert<2>(*this, val); }
7426 OIIO_FORCEINLINE void vfloat4::set_w (float val) { *this = insert<3>(*this, val); }
7427 
7428 
7430 {
7431 #if OIIO_SIMD_SSE
7432  return _mm_castps_si128 (x.simd());
7433 #elif OIIO_SIMD_NEON
7434  return vreinterpretq_s32_f32 (x.simd());
7435 #else
7436  return *(vint4 *)&x;
7437 #endif
7438 }
7439 
7441 {
7442 #if OIIO_SIMD_SSE
7443  return _mm_castsi128_ps (x.simd());
7444 #elif OIIO_SIMD_NEON
7445  return vreinterpretq_f32_s32 (x.simd());
7446 #else
7447  return *(vfloat4 *)&x;
7448 #endif
7449 }
7450 
7451 
7452 // Old names:
7453 inline vint4 bitcast_to_int4 (const vfloat4& x) { return bitcast_to_int(x); }
7454 inline vfloat4 bitcast_to_float4 (const vint4& x) { return bitcast_to_float(x); }
7455 
7456 
7457 
7459 #if OIIO_SIMD_SSE >= 3
7460  // People seem to agree that SSE3 does add reduction best with 2
7461  // horizontal adds.
7462  // suppose v = (a, b, c, d)
7463  simd::vfloat4 ab_cd = _mm_hadd_ps (v.simd(), v.simd());
7464  // ab_cd = (a+b, c+d, a+b, c+d)
7465  simd::vfloat4 abcd = _mm_hadd_ps (ab_cd.simd(), ab_cd.simd());
7466  // all abcd elements are a+b+c+d
7467  return abcd;
7468 #elif OIIO_SIMD_SSE
7469  // I think this is the best we can do for SSE2, and I'm still not sure
7470  // it's faster than the default scalar operation. But anyway...
7471  // suppose v = (a, b, c, d)
7472  vfloat4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
7473  // now x = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
7474  vfloat4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
7475  // now y = (c+d,c+d,a+b,a+b)
7476  vfloat4 abcd = ab_ab_cd_cd + cd_cd_ab_ab; // a+b+c+d in all components
7477  return abcd;
7478 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7479  return vfloat4(vaddvq_f32(v));
7480 #else
7481  return vfloat4 (v[0] + v[1] + v[2] + v[3]);
7482 #endif
7483 }
7484 
7485 
7487 #if OIIO_SIMD_SSE
7488  return _mm_cvtss_f32(vreduce_add (v));
7489 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7490  return vaddvq_f32(v);
7491 #else
7492  return v[0] + v[1] + v[2] + v[3];
7493 #endif
7494 }
7495 
7497 #if OIIO_SIMD_SSE >= 4
7498  return _mm_dp_ps (a.simd(), b.simd(), 0xff);
7499 #elif OIIO_SIMD_NEON
7500  float32x4_t ab = vmulq_f32(a, b);
7501  float32x4_t sum1 = vaddq_f32(ab, vrev64q_f32(ab));
7502  return vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
7503 #else
7504  return vreduce_add (a*b);
7505 #endif
7506 }
7507 
7508 OIIO_FORCEINLINE float dot (const vfloat4 &a, const vfloat4 &b) {
7509 #if OIIO_SIMD_SSE >= 4
7510  return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0xff));
7511 #else
7512  return reduce_add (a*b);
7513 #endif
7514 }
7515 
7517 #if OIIO_SIMD_SSE >= 4
7518  return _mm_dp_ps (a.simd(), b.simd(), 0x7f);
7519 #else
7520  return vreduce_add((a*b).xyz0());
7521 #endif
7522 }
7523 
7524 OIIO_FORCEINLINE float dot3 (const vfloat4 &a, const vfloat4 &b) {
7525 #if OIIO_SIMD_SSE >= 4
7526  return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
7527 #else
7528  return reduce_add ((a*b).xyz0());
7529 #endif
7530 }
7531 
7532 
7534 {
7535 #if OIIO_SIMD_SSE >= 4
7536  // SSE >= 4.1 only
7537  return _mm_blendv_ps (a.simd(), b.simd(), mask.simd());
7538 #elif OIIO_SIMD_SSE
7539  // Trick for SSE < 4.1
7540  return _mm_or_ps (_mm_and_ps(mask.simd(), b.simd()),
7541  _mm_andnot_ps(mask.simd(), a.simd()));
7542 #elif OIIO_SIMD_NEON
7543  return vbslq_f32 (mask.simd(), b.simd(), a.simd());
7544 #else
7545  return vfloat4 (mask[0] ? b[0] : a[0],
7546  mask[1] ? b[1] : a[1],
7547  mask[2] ? b[2] : a[2],
7548  mask[3] ? b[3] : a[3]);
7549 #endif
7550 }
7551 
7552 
7554 {
7555 #if OIIO_SIMD_SSE
7556  return _mm_and_ps(mask.simd(), a.simd());
7557 #else
7558  return vfloat4 (mask[0] ? a[0] : 0.0f,
7559  mask[1] ? a[1] : 0.0f,
7560  mask[2] ? a[2] : 0.0f,
7561  mask[3] ? a[3] : 0.0f);
7562 #endif
7563 }
7564 
7565 
7567 {
7568 #if OIIO_SIMD_SSE
7569  return _mm_andnot_ps(mask.simd(), a.simd());
7570 #else
7571  return vfloat4 (mask[0] ? 0.0f : a[0],
7572  mask[1] ? 0.0f : a[1],
7573  mask[2] ? 0.0f : a[2],
7574  mask[3] ? 0.0f : a[3]);
7575 #endif
7576 }
7577 
7578 
7580 #if OIIO_SIMD_SSE
7581  return blend0not (a/b, b == vfloat4::Zero());
7582 #else
7583  return vfloat4 (b[0] == 0.0f ? 0.0f : a[0] / b[0],
7584  b[1] == 0.0f ? 0.0f : a[1] / b[1],
7585  b[2] == 0.0f ? 0.0f : a[2] / b[2],
7586  b[3] == 0.0f ? 0.0f : a[3] / b[3]);
7587 #endif
7588 }
7589 
7590 
7592 {
7593 #if OIIO_SIMD_SSE
7594  return vfloat3(safe_div(a, shuffle<3>(a)).xyz0());
7595 #else
7596  float d = a[3];
7597  return d == 0.0f ? vfloat3 (0.0f) : vfloat3 (a[0]/d, a[1]/d, a[2]/d);
7598 #endif
7599 }
7600 
7601 
7602 
7604 {
7605  return blend (b, a, mask);
7606 }
7607 
7608 
7610 {
7611 #if OIIO_SIMD_SSE
7612  // Just clear the sign bit for cheap fabsf
7613  return _mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
7614 #elif OIIO_SIMD_NEON
7615  return vabsq_f32(a.simd());
7616 #else
7617  SIMD_RETURN (vfloat4, fabsf(a[i]));
7618 #endif
7619 }
7620 
7621 
7623 {
7624  vfloat4 one(1.0f);
7625  return blend (one, -one, a < vfloat4::Zero());
7626 }
7627 
7628 
7630 {
7631 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7632  return _mm_ceil_ps (a);
7633 #else
7634  SIMD_RETURN (vfloat4, ceilf(a[i]));
7635 #endif
7636 }
7637 
7639 {
7640 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7641  return _mm_floor_ps (a);
7642 #else
7643  SIMD_RETURN (vfloat4, floorf(a[i]));
7644 #endif
7645 }
7646 
7648 {
7649 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7650  return _mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
7651 #elif OIIO_SIMD_NEON
7652  return vrndnq_f32(a);
7653 #else
7654  SIMD_RETURN (vfloat4, roundf(a[i]));
7655 #endif
7656 }
7657 
7659 {
7660  // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
7661 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7662  return vint4(floor(a));
7663 #else
7664  SIMD_RETURN (vint4, (int)floorf(a[i]));
7665 #endif
7666 }
7667 
7668 
7670 {
7671  return vint4 (round(a));
7672 }
7673 
7674 
7676 {
7677 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
7678  // avx512vl directly has rcp14 on float4
7679  vfloat4 r = _mm_rcp14_ps(a);
7680  return r * nmadd(r,a,vfloat4(2.0f));
7681 #elif OIIO_SIMD_AVX512
7682  // Trickery: in and out of the 512 bit registers to use fast approx rcp
7683  vfloat16 r = _mm512_rcp14_ps(_mm512_castps128_ps512(a));
7684  return _mm512_castps512_ps128(r);
7685 #elif OIIO_SIMD_SSE
7686  vfloat4 r = _mm_rcp_ps(a);
7687  return r * nmadd(r,a,vfloat4(2.0f));
7688 #else
7689  SIMD_RETURN (vfloat4, 1.0f/a[i]);
7690 #endif
7691 }
7692 
7693 
7695 {
7696 #if OIIO_SIMD_SSE
7697  return _mm_sqrt_ps (a.simd());
7698 #elif OIIO_SIMD_NEON
7699  return vsqrtq_f32 (a.simd());
7700 #else
7701  SIMD_RETURN (vfloat4, sqrtf(a[i]));
7702 #endif
7703 }
7704 
7705 
7707 {
7708 #if OIIO_SIMD_SSE
7709  return _mm_div_ps (_mm_set1_ps(1.0f), _mm_sqrt_ps (a.simd()));
7710 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7711  return vdivq_f32(vdupq_n_f32(1.0f), vsqrtq_f32(a));
7712 #else
7713  SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
7714 #endif
7715 }
7716 
7717 
7719 {
7720 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
7721  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
7722  return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
7723 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7724  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
7725  return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
7726 #elif OIIO_SIMD_SSE
7727  return _mm_rsqrt_ps (a.simd());
7728 #else
7729  SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
7730 #endif
7731 }
7732 
7733 
7735 {
7736 #if OIIO_SIMD_SSE
7737  return _mm_min_ps (a, b);
7738 #elif OIIO_SIMD_NEON
7739  return vminq_f32(a, b);
7740 #else
7741  SIMD_RETURN (vfloat4, std::min (a[i], b[i]));
7742 #endif
7743 }
7744 
7746 {
7747 #if OIIO_SIMD_SSE
7748  return _mm_max_ps (a, b);
7749 #elif OIIO_SIMD_NEON
7750  return vmaxq_f32(a, b);
7751 #else
7752  SIMD_RETURN (vfloat4, std::max (a[i], b[i]));
7753 #endif
7754 }
7755 
7756 
7758 #if OIIO_SIMD_SSE
7759  return _mm_andnot_ps (a.simd(), b.simd());
7760 #else
7761  vint4 ai = bitcast_to_int(a);
7762  vint4 bi = bitcast_to_int(b);
7763  return bitcast_to_float(vint4(~(ai[0]) & bi[0],
7764  ~(ai[1]) & bi[1],
7765  ~(ai[2]) & bi[2],
7766  ~(ai[3]) & bi[3]));
7767 #endif
7768 }
7769 
7770 
7772  const simd::vfloat4& c)
7773 {
7774 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7775  // If we are sure _mm_fmadd_ps intrinsic is available, use it.
7776  return _mm_fmadd_ps (a, b, c);
7777 #elif OIIO_SIMD_NEON
7778  return vmlaq_f32(c.simd(), a.simd(), b.simd());
7779 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7780  // If we directly access the underlying __m128, on some platforms and
7781  // compiler flags, it will turn into fma anyway, even if we don't use
7782  // the intrinsic.
7783  return a.simd() * b.simd() + c.simd();
7784 #else
7785  // Fallback: just use regular math and hope for the best.
7786  return a * b + c;
7787 #endif
7788 }
7789 
7790 
7792  const simd::vfloat4& c)
7793 {
7794 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7795  // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
7796  return _mm_fmsub_ps (a, b, c);
7797 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7798  // If we directly access the underlying __m128, on some platforms and
7799  // compiler flags, it will turn into fma anyway, even if we don't use
7800  // the intrinsic.
7801  return a.simd() * b.simd() - c.simd();
7802 #else
7803  // Fallback: just use regular math and hope for the best.
7804  return a * b - c;
7805 #endif
7806 }
7807 
7808 
7809 
7811  const simd::vfloat4& c)
7812 {
7813 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7814  // If we are sure _mm_fnmadd_ps intrinsic is available, use it.
7815  return _mm_fnmadd_ps (a, b, c);
7816 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7817  // If we directly access the underlying __m128, on some platforms and
7818  // compiler flags, it will turn into fma anyway, even if we don't use
7819  // the intrinsic.
7820  return c.simd() - a.simd() * b.simd();
7821 #else
7822  // Fallback: just use regular math and hope for the best.
7823  return c - a * b;
7824 #endif
7825 }
7826 
7827 
7828 
7830  const simd::vfloat4& c)
7831 {
7832 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7833  // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
7834  return _mm_fnmsub_ps (a, b, c);
7835 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7836  // If we directly access the underlying __m128, on some platforms and
7837  // compiler flags, it will turn into fma anyway, even if we don't use
7838  // the intrinsic.
7839  return -(a.simd() * b.simd()) - c.simd();
7840 #else
7841  // Fallback: just use regular math and hope for the best.
7842  return -(a * b) - c;
7843 #endif
7844 }
7845 
7846 
7847 
7848 // Full precision exp() of all components of a SIMD vector.
7849 template<typename T>
7850 OIIO_FORCEINLINE T exp (const T& v)
7851 {
7852 #if OIIO_SIMD_SSE
7853  // Implementation inspired by:
7854  // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
7855  // Which is listed as Copyright (C) 2007 Julien Pommier and distributed
7856  // under the zlib license.
7857  typedef typename T::vint_t int_t;
7858  T x = v;
7859  const float exp_hi (88.3762626647949f);
7860  const float exp_lo (-88.3762626647949f);
7861  const float cephes_LOG2EF (1.44269504088896341f);
7862  const float cephes_exp_C1 (0.693359375f);
7863  const float cephes_exp_C2 (-2.12194440e-4f);
7864  const float cephes_exp_p0 (1.9875691500E-4f);
7865  const float cephes_exp_p1 (1.3981999507E-3f);
7866  const float cephes_exp_p2 (8.3334519073E-3f);
7867  const float cephes_exp_p3 (4.1665795894E-2f);
7868  const float cephes_exp_p4 (1.6666665459E-1f);
7869  const float cephes_exp_p5 (5.0000001201E-1f);
7870  T tmp (0.0f);
7871  T one (1.0f);
7872  x = min (x, T(exp_hi));
7873  x = max (x, T(exp_lo));
7874  T fx = madd (x, T(cephes_LOG2EF), T(0.5f));
7875  int_t emm0 = int_t(fx);
7876  tmp = T(emm0);
7877  T mask = bitcast_to_float (bitcast_to_int(tmp > fx) & bitcast_to_int(one));
7878  fx = tmp - mask;
7879  tmp = fx * cephes_exp_C1;
7880  T z = fx * cephes_exp_C2;
7881  x = x - tmp;
7882  x = x - z;
7883  z = x * x;
7884  T y = cephes_exp_p0;
7885  y = madd (y, x, cephes_exp_p1);
7886  y = madd (y, x, cephes_exp_p2);
7887  y = madd (y, x, cephes_exp_p3);
7888  y = madd (y, x, cephes_exp_p4);
7889  y = madd (y, x, cephes_exp_p5);
7890  y = madd (y, z, x);
7891  y = y + one;
7892  emm0 = (int_t(fx) + int_t(0x7f)) << 23;
7893  T pow2n = bitcast_to_float(emm0);
7894  y = y * pow2n;
7895  return y;
7896 #else
7897  SIMD_RETURN (T, expf(v[i]));
7898 #endif
7899 }
7900 
7901 
7902 
7903 // Full precision log() of all components of a SIMD vector.
7904 template<typename T>
7905 OIIO_FORCEINLINE T log (const T& v)
7906 {
7907 #if OIIO_SIMD_SSE
7908  // Implementation inspired by:
7909  // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
7910  // Which is listed as Copyright (C) 2007 Julien Pommier and distributed
7911  // under the zlib license.
7912  typedef typename T::vint_t int_t;
7913  typedef typename T::vbool_t bool_t;
7914  T x = v;
7915  int_t emm0;
7916  T zero (T::Zero());
7917  T one (1.0f);
7918  bool_t invalid_mask = (x <= zero);
7919  const int min_norm_pos ((int)0x00800000);
7920  const int inv_mant_mask ((int)~0x7f800000);
7921  x = max(x, bitcast_to_float(int_t(min_norm_pos))); /* cut off denormalized stuff */
7922  emm0 = srl (bitcast_to_int(x), 23);
7923  /* keep only the fractional part */
7924  x = bitcast_to_float (bitcast_to_int(x) & int_t(inv_mant_mask));
7926  emm0 = emm0 - int_t(0x7f);
7927  T e (emm0);
7928  e = e + one;
7929  // OIIO_SIMD_vFLOAT4_CONST (cephes_SQRTHF, 0.707106781186547524f);
7930  const float cephes_SQRTHF (0.707106781186547524f);
7931  bool_t mask = (x < T(cephes_SQRTHF));
7932  T tmp = bitcast_to_float (bitcast_to_int(x) & bitcast_to_int(mask));
7933  x = x - one;
7934  e = e - bitcast_to_float (bitcast_to_int(one) & bitcast_to_int(mask));
7935  x = x + tmp;
7936  T z = x * x;
7937  const float cephes_log_p0 (7.0376836292E-2f);
7938  const float cephes_log_p1 (- 1.1514610310E-1f);
7939  const float cephes_log_p2 (1.1676998740E-1f);
7940  const float cephes_log_p3 (- 1.2420140846E-1f);
7941  const float cephes_log_p4 (+ 1.4249322787E-1f);
7942  const float cephes_log_p5 (- 1.6668057665E-1f);
7943  const float cephes_log_p6 (+ 2.0000714765E-1f);
7944  const float cephes_log_p7 (- 2.4999993993E-1f);
7945  const float cephes_log_p8 (+ 3.3333331174E-1f);
7946  const float cephes_log_q1 (-2.12194440e-4f);
7947  const float cephes_log_q2 (0.693359375f);
7948  T y = cephes_log_p0;
7949  y = madd (y, x, T(cephes_log_p1));
7950  y = madd (y, x, T(cephes_log_p2));
7951  y = madd (y, x, T(cephes_log_p3));
7952  y = madd (y, x, T(cephes_log_p4));
7953  y = madd (y, x, T(cephes_log_p5));
7954  y = madd (y, x, T(cephes_log_p6));
7955  y = madd (y, x, T(cephes_log_p7));
7956  y = madd (y, x, T(cephes_log_p8));
7957  y = y * x;
7958  y = y * z;
7959  y = madd(e, T(cephes_log_q1), y);
7960  y = nmadd (z, 0.5f, y);
7961  x = x + y;
7962  x = madd (e, T(cephes_log_q2), x);
7963  x = bitcast_to_float (bitcast_to_int(x) | bitcast_to_int(invalid_mask)); // negative arg will be NAN
7964  return x;
7965 #else
7966  SIMD_RETURN (T, logf(v[i]));
7967 #endif
7968 }
7969 
7970 
7971 
7973 {
7974 #if OIIO_SIMD_SSE
7975  _MM_TRANSPOSE4_PS (a.simd(), b.simd(), c.simd(), d.simd());
7976 #else
7977  vfloat4 A (a[0], b[0], c[0], d[0]);
7978  vfloat4 B (a[1], b[1], c[1], d[1]);
7979  vfloat4 C (a[2], b[2], c[2], d[2]);
7980  vfloat4 D (a[3], b[3], c[3], d[3]);
7981  a = A; b = B; c = C; d = D;
7982 #endif
7983 }
7984 
7985 
// Transpose the 4x4 matrix whose rows are {a,b,c,d}, writing the result
// into r0..r3 (r0 receives element 0 of each input, etc.). Unlike the
// in-place transpose overload, the inputs are left unmodified.
7986 OIIO_FORCEINLINE void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
7987  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3)
7988 {
7989 #if OIIO_SIMD_SSE
7990  //_MM_TRANSPOSE4_PS (a, b, c, d);
 // Classic two-stage transpose: interleave rows (a,c) and (b,d) into
 // low/high halves, then interleave those results. This is the same
 // shuffle dance _MM_TRANSPOSE4_PS performs, but without clobbering
 // the inputs.
7991  auto l02 = _mm_unpacklo_ps (a, c);
7992  auto h02 = _mm_unpackhi_ps (a, c);
7993  auto l13 = _mm_unpacklo_ps (b, d);
7994  auto h13 = _mm_unpackhi_ps (b, d);
7995  r0 = vfloat4(_mm_unpacklo_ps (l02, l13));
7996  r1 = vfloat4(_mm_unpackhi_ps (l02, l13));
7997  r2 = vfloat4(_mm_unpacklo_ps (h02, h13));
7998  r3 = vfloat4(_mm_unpackhi_ps (h02, h13));
7999 #else
 // Scalar fallback: column i of the inputs becomes row ri.
8000  r0.load (a[0], b[0], c[0], d[0]);
8001  r1.load (a[1], b[1], c[1], d[1]);
8002  r2.load (a[2], b[2], c[2], d[2]);
8003  r3.load (a[3], b[3], c[3], d[3]);
8004 #endif
8005 }
8006 
8007 
8009 {
8010 #if OIIO_SIMD_SSE
8011  __m128 A = _mm_castsi128_ps (a);
8012  __m128 B = _mm_castsi128_ps (b);
8013  __m128 C = _mm_castsi128_ps (c);
8014  __m128 D = _mm_castsi128_ps (d);
8015  _MM_TRANSPOSE4_PS (A, B, C, D);
8016  a = _mm_castps_si128 (A);
8017  b = _mm_castps_si128 (B);
8018  c = _mm_castps_si128 (C);
8019  d = _mm_castps_si128 (D);
8020 #else
8021  vint4 A (a[0], b[0], c[0], d[0]);
8022  vint4 B (a[1], b[1], c[1], d[1]);
8023  vint4 C (a[2], b[2], c[2], d[2]);
8024  vint4 D (a[3], b[3], c[3], d[3]);
8025  a = A; b = B; c = C; d = D;
8026 #endif
8027 }
8028 
// Transpose the 4x4 integer matrix whose rows are {a,b,c,d}, writing the
// result into r0..r3. The inputs are not modified.
8029 OIIO_FORCEINLINE void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
8030  vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3)
8031 {
8032 #if OIIO_SIMD_SSE
8033  //_MM_TRANSPOSE4_PS (a, b, c, d);
 // Reinterpret the int lanes as float bit patterns so we can reuse the
 // float transpose macro; the cast intrinsics preserve bits exactly.
8034  __m128 A = _mm_castsi128_ps (a);
8035  __m128 B = _mm_castsi128_ps (b);
8036  __m128 C = _mm_castsi128_ps (c);
8037  __m128 D = _mm_castsi128_ps (d);
8038  _MM_TRANSPOSE4_PS (A, B, C, D);
8039  r0 = _mm_castps_si128 (A);
8040  r1 = _mm_castps_si128 (B);
8041  r2 = _mm_castps_si128 (C);
8042  r3 = _mm_castps_si128 (D);
8043 #else
 // Scalar fallback: column i of the inputs becomes row ri.
8044  r0.load (a[0], b[0], c[0], d[0]);
8045  r1.load (a[1], b[1], c[1], d[1]);
8046  r2.load (a[2], b[2], c[2], d[2]);
8047  r3.load (a[3], b[3], c[3], d[3]);
8048 #endif
8049 }
8050 
8051 
8053  const vfloat4& c, const vfloat4& d)
8054 {
8055 #if OIIO_SIMD_SSE
8056  vfloat4 l02 = _mm_unpacklo_ps (a, c);
8057  vfloat4 l13 = _mm_unpacklo_ps (b, d);
8058  return _mm_unpacklo_ps (l02, l13);
8059 #else
8060  return vfloat4 (a[0], b[0], c[0], d[0]);
8061 #endif
8062 }
8063 
8064 
8066  const vint4& c, const vint4& d)
8067 {
8068 #if OIIO_SIMD_SSE
8069  vint4 l02 = _mm_unpacklo_epi32 (a, c);
8070  vint4 l13 = _mm_unpacklo_epi32 (b, d);
8071  return _mm_unpacklo_epi32 (l02, l13);
8072 #else
8073  return vint4 (a[0], b[0], c[0], d[0]);
8074 #endif
8075 }
8076 
8077 
8078 
8079 //////////////////////////////////////////////////////////////////////
8080 // vfloat3 implementation
8081 
8083 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
8084  m_simd = other.m_simd;
8085 #else
8086  SIMD_CONSTRUCT_PAD (other[i]);
8087 #endif
8088 }
8089 
8091 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
8092  m_simd = other.simd();
8093 #else
8094  SIMD_CONSTRUCT_PAD (other[i]);
8095 #endif
8096 }
8097 
8099 
8101 
8102 OIIO_FORCEINLINE const vfloat3 vfloat3::Iota (float start, float step) {
8103  return vfloat3 (start+0.0f*step, start+1.0f*step, start+2.0f*step);
8104 }
8105 
8106 
// Splat a single value into all 3 components (the padding 4th lane is 0).
8107 OIIO_FORCEINLINE void vfloat3::load (float val) { vfloat4::load (val, val, val, 0.0f); }
8108 
// Load 3 contiguous floats from memory.
8109 OIIO_FORCEINLINE void vfloat3::load (const float *values) { vfloat4::load (values, 3); }
8110 
// Load the first n floats from memory (n presumably <= 3 -- the base
// vfloat4::load handles the partial fill; TODO confirm caller contract).
8111 OIIO_FORCEINLINE void vfloat3::load (const float *values, int n) {
8112  vfloat4::load (values, n);
8113 }
8114 
// Load 3 unsigned shorts, converting each to float.
8115 OIIO_FORCEINLINE void vfloat3::load (const unsigned short *values) {
8116  vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
8117 }
8118 
// Load 3 signed shorts, converting each to float.
8119 OIIO_FORCEINLINE void vfloat3::load (const short *values) {
8120  vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
8121 }
8122 
// Load 3 unsigned chars, converting each to float.
8123 OIIO_FORCEINLINE void vfloat3::load (const unsigned char *values) {
8124  vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
8125 }
8126 
// Load 3 signed chars, converting each to float.
8127 OIIO_FORCEINLINE void vfloat3::load (const char *values) {
8128  vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
8129 }
8130 
// Load 3 half-precision floats (only compiled when an Imath/half header
// has already been included, which defines the `half` type).
8131 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8132 OIIO_FORCEINLINE void vfloat3::load (const half *values) {
8133  vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
8134 }
8135 #endif /* _HALF_H_ or _IMATH_H_ */
8136 
// Store the 3 components to contiguous floats at `values`.
8137 OIIO_FORCEINLINE void vfloat3::store (float *values) const {
8138  vfloat4::store (values, 3);
8139 }
8140 
// Store the first n components (n presumably <= 3; partial store is
// delegated to vfloat4::store -- TODO confirm caller contract).
8141 OIIO_FORCEINLINE void vfloat3::store (float *values, int n) const {
8142  vfloat4::store (values, n);
8143 }
8144 
// Store components as half-precision floats (only when a half header is
// included). SIMD_DO expands to a per-element loop; each float is
// converted via half's assignment.
8145 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8146 OIIO_FORCEINLINE void vfloat3::store (half *values) const {
8147  SIMD_DO (values[i] = m_val[i]);
8148 }
8149 #endif
8150 
8151 
8153  return vfloat3 (vfloat4(a) + vfloat4(b));
8154 }
8155 
8157  *this = *this + a; return *this;
8158 }
8159 
8161  return vfloat3 (-vfloat4(*this));
8162 }
8163 
8165  return vfloat3 (vfloat4(a) - vfloat4(b));
8166 }
8167 
8169  *this = *this - a; return *this;
8170 }
8171 
8173  return vfloat3 (vfloat4(a) * vfloat4(b));
8174 }
8175 
8177  return vfloat3 (vfloat4(a) * b);
8178 }
8179 
8181  return b * a;
8182 }
8183 
8185  *this = *this * a; return *this;
8186 }
8187 
8189  *this = *this * a; return *this;
8190 }
8191 
8193  return vfloat3 (vfloat4(a) / b.xyz1()); // Avoid divide by zero!
8194 }
8195 
8197  *this = *this / a; return *this;
8198 }
8199 
8201  *this = *this / a; return *this;
8202 }
8203 
8204 
8205 inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val) {
8206  cout << val[0];
8207  for (int i = 1; i < val.elements; ++i)
8208  cout << ' ' << val[i];
8209  return cout;
8210 }
8211 
8212 
8214 {
8215 #if OIIO_SIMD_SSE
8216  // Just clear the sign bit for cheap fabsf
8217  return vfloat3(_mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
8218 #elif OIIO_SIMD_NEON
8219  return vfloat3(vabsq_f32(a.simd()));
8220 #else
8221  SIMD_RETURN (vfloat3, fabsf(a[i]));
8222 #endif
8223 }
8224 
8225 
8227 {
8228  vfloat3 one(1.0f);
8229  return vfloat3(blend (one, -one, a < vfloat3::Zero()));
8230 }
8231 
8232 
8234 {
8235 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
8236  return vfloat3(_mm_ceil_ps (a));
8237 #else
8238  SIMD_RETURN (vfloat3, ceilf(a[i]));
8239 #endif
8240 }
8241 
8243 {
8244 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
8245  return vfloat3(_mm_floor_ps (a));
8246 #else
8247  SIMD_RETURN (vfloat3, floorf(a[i]));
8248 #endif
8249 }
8250 
8252 {
8253 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
8254  return vfloat3(_mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)));
8255 #else
8256  SIMD_RETURN (vfloat3, roundf(a[i]));
8257 #endif
8258 }
8259 
8260 
8262 #if OIIO_SIMD_SSE
8263  return vfloat3 ((vreduce_add(vfloat4(v))).xyz0());
8264 #else
8265  return vfloat3 (v[0] + v[1] + v[2]);
8266 #endif
8267 }
8268 
8269 
8271 #if OIIO_SIMD_SSE >= 4
8272  return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77));
8273 #else
8274  return vreduce_add (a*b);
8275 #endif
8276 }
8277 
8278 
// 3-component dot product of a and b, returned as a scalar float.
8279 OIIO_FORCEINLINE float dot (const vfloat3 &a, const vfloat3 &b) {
8280 #if OIIO_SIMD_SSE >= 4
 // dp_ps mask 0x77: multiply and sum lanes 0-2 only (ignoring the
 // padding lane 3), broadcast the sum into lanes 0-2; extract lane 0.
8281  return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
8282 #elif OIIO_SIMD
 // Element-wise product then horizontal add. Relies on vfloat3's
 // padding lane being 0 so it cannot contaminate the sum.
8283  return reduce_add (a*b);
8284 #else
8285  return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
8286 #endif
8287 }
8288 
8289 
8291 #if OIIO_SIMD_SSE >= 4
8292  return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77));
8293 #else
8294  return vfloat3 (vreduce_add((a*b).xyz0()).xyz0());
8295 #endif
8296 }
8297 
8298 
8300 {
8301  return dot(*this, *this);
8302 }
8303 
8304 
8306 {
8307  return sqrtf(dot(*this, *this));
8308 }
8309 
8310 
8312 {
8313 #if OIIO_SIMD
8314  vfloat3 len2 = vdot3 (*this, *this);
8315  return vfloat3 (safe_div (*this, sqrt(len2)));
8316 #else
8317  float len2 = dot (*this, *this);
8318  return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero();
8319 #endif
8320 }
8321 
8322 
8324 {
8325 #if OIIO_SIMD
8326  vfloat3 len2 = vdot3 (*this, *this);
8327  vfloat4 invlen = blend0not (rsqrt_fast (len2), len2 == vfloat4::Zero());
8328  return vfloat3 ((*this) * invlen);
8329 #else
8330  float len2 = dot (*this, *this);
8331  return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero();
8332 #endif
8333 }
8334 
8335 
8336 
8337 //////////////////////////////////////////////////////////////////////
8338 // matrix44 implementation
8339 
8340 
8341 #ifdef INCLUDED_IMATHMATRIX_H
8343  return *(Imath::M44f*)this;
8344 }
8345 #endif
8346 
8347 
8349  return m_row[i];
8350 }
8351 
8352 
8354 {
8355  m_row[0] = m[0];
8356  m_row[1] = m[1];
8357  m_row[2] = m[2];
8358  m_row[3] = m[3];
8359  return *this;
8360 }
8361 
8362 
8364 #if OIIO_SIMD_SSE
8365  matrix44 T;
8366  simd::transpose (m_row[0], m_row[1], m_row[2], m_row[3],
8367  T.m_row[0], T.m_row[1], T.m_row[2], T.m_row[3]);
8368  return T;
8369 #else
8370  return matrix44(m_vals[0][0], m_vals[1][0], m_vals[2][0], m_vals[3][0],
8371  m_vals[0][1], m_vals[1][1], m_vals[2][1], m_vals[3][1],
8372  m_vals[0][2], m_vals[1][2], m_vals[2][2], m_vals[3][2],
8373  m_vals[0][3], m_vals[1][3], m_vals[2][3], m_vals[3][3]);
8374 #endif
8375 }
8376 
8378 #if OIIO_SIMD_SSE
8379  vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8380  shuffle<2>(V) * m_row[2] + m_row[3];
8381  R = R / shuffle<3>(R);
8382  return vfloat3 (R.xyz0());
8383 #else
8384  value_t a, b, c, w;
8385  a = V[0] * m_vals[0][0] + V[1] * m_vals[1][0] + V[2] * m_vals[2][0] + m_vals[3][0];
8386  b = V[0] * m_vals[0][1] + V[1] * m_vals[1][1] + V[2] * m_vals[2][1] + m_vals[3][1];
8387  c = V[0] * m_vals[0][2] + V[1] * m_vals[1][2] + V[2] * m_vals[2][2] + m_vals[3][2];
8388  w = V[0] * m_vals[0][3] + V[1] * m_vals[1][3] + V[2] * m_vals[2][3] + m_vals[3][3];
8389  return vfloat3(a / w, b / w, c / w);
8390 #endif
8391 }
8392 
8394 #if OIIO_SIMD_SSE
8395  vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8396  shuffle<2>(V) * m_row[2];
8397  return vfloat3 (R.xyz0());
8398 #else
8399  value_t a, b, c;
8400  a = V[0] * m_vals[0][0] + V[1] * m_vals[1][0] + V[2] * m_vals[2][0];
8401  b = V[0] * m_vals[0][1] + V[1] * m_vals[1][1] + V[2] * m_vals[2][1];
8402  c = V[0] * m_vals[0][2] + V[1] * m_vals[1][2] + V[2] * m_vals[2][2];
8403  return vfloat3(a, b, c);
8404 #endif
8405 }
8406 
8408 #if OIIO_SIMD_SSE
8409  matrix44 T = transposed();
8410  vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8411  shuffle<2>(V) * T[2];
8412  return vfloat3 (R.xyz0());
8413 #else
8414  value_t a, b, c;
8415  a = V[0] * m_vals[0][0] + V[1] * m_vals[0][1] + V[2] * m_vals[0][2];
8416  b = V[0] * m_vals[1][0] + V[1] * m_vals[1][1] + V[2] * m_vals[1][2];
8417  c = V[0] * m_vals[2][0] + V[1] * m_vals[2][1] + V[2] * m_vals[2][2];
8418  return vfloat3(a, b, c);
8419 #endif
8420 }
8421 
8423 {
8424 #if OIIO_SIMD_SSE
8425  return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8426  shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8427 #else
8428  float a, b, c, w;
8429  a = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0] + V[3] * M[3][0];
8430  b = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1] + V[3] * M[3][1];
8431  c = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2] + V[3] * M[3][2];
8432  w = V[0] * M[0][3] + V[1] * M[1][3] + V[2] * M[2][3] + V[3] * M[3][3];
8433  return vfloat4(a, b, c, w);
8434 #endif
8435 }
8436 
8438 {
8439 #if OIIO_SIMD_SSE >= 3
8440  vfloat4 m0v = M[0] * V; // [ M00*Vx, M01*Vy, M02*Vz, M03*Vw ]
8441  vfloat4 m1v = M[1] * V; // [ M10*Vx, M11*Vy, M12*Vz, M13*Vw ]
8442  vfloat4 m2v = M[2] * V; // [ M20*Vx, M21*Vy, M22*Vz, M23*Vw ]
8443  vfloat4 m3v = M[3] * V; // [ M30*Vx, M31*Vy, M32*Vz, M33*Vw ]
8444  vfloat4 s01 = _mm_hadd_ps(m0v, m1v);
8445  // [ M00*Vx + M01*Vy, M02*Vz + M03*Vw, M10*Vx + M11*Vy, M12*Vz + M13*Vw ]
8446  vfloat4 s23 = _mm_hadd_ps(m2v, m3v);
8447  // [ M20*Vx + M21*Vy, M22*Vz + M23*Vw, M30*Vx + M31*Vy, M32*Vz + M33*Vw ]
8448  vfloat4 result = _mm_hadd_ps(s01, s23);
8449  // [ M00*Vx + M01*Vy + M02*Vz + M03*Vw,
8450  // M10*Vx + M11*Vy + M12*Vz + M13*Vw,
8451  // M20*Vx + M21*Vy + M22*Vz + M23*Vw,
8452  // M30*Vx + M31*Vy + M32*Vz + M33*Vw ]
8453  return result;
8454 #else
8455  return vfloat4(dot(M[0], V), dot(M[1], V), dot(M[2], V), dot(M[3], V));
8456 #endif
8457 }
8458 
8459 
8461 #if OIIO_SIMD_SSE
8462  vbool4 b0 = (m_row[0] == m[0]);
8463  vbool4 b1 = (m_row[1] == m[1]);
8464  vbool4 b2 = (m_row[2] == m[2]);
8465  vbool4 b3 = (m_row[3] == m[3]);
8466  return simd::all (b0 & b1 & b2 & b3);
8467 #else
8468  return memcmp(this, &m, 16*sizeof(float)) == 0;
8469 #endif
8470 }
8471 
8473 #if OIIO_SIMD_SSE
8474  vbool4 b0 = (m_row[0] != m[0]);
8475  vbool4 b1 = (m_row[1] != m[1]);
8476  vbool4 b2 = (m_row[2] != m[2]);
8477  vbool4 b3 = (m_row[3] != m[3]);
8478  return simd::any (b0 | b1 | b2 | b3);
8479 #else
8480  return memcmp(this, &m, 16*sizeof(float)) != 0;
8481 #endif
8482 }
8483 
8484 
8486  return memcmp(data(), m.data(), 16*sizeof(float)) == 0;
8487 }
8488 
8490  return (b == a);
8491 }
8492 
8494  return memcmp(data(), m.data(), 16*sizeof(float)) != 0;
8495 }
8496 
8498  return (b != a);
8499 }
8500 
8501 
8502 
8504 {
8505  // Adapted from this code from Intel:
8506  // ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
8507  vfloat4 minor0, minor1, minor2, minor3;
8508  vfloat4 det, tmp1;
8509 #if 0
8510  // Original code looked like this:
8511  vfloat4 row0, row1, row2, row3;
8512  const float *src = (const float *)&msrc;
8513  tmp1.load_pairs(src, src+ 4);
8514  row1.load_pairs(src+8, src+12);
8515  row0 = shuffle<0x88>(tmp1, row1);
8516  row1 = shuffle<0xDD>(row1, tmp1);
8517  tmp1.load_pairs(src+ 2, src+ 6);
8518  row3.load_pairs(src+10, src+14);
8519  row2 = shuffle<0x88>(tmp1, row3);
8520  row3 = shuffle<0xDD>(row3, tmp1);
8521 #else
8522  // But this is simpler and easier to understand:
8523  matrix44 Mt = this->transposed();
8524  vfloat4 row0 = Mt[0];
8525  vfloat4 row1 = shuffle<2,3,0,1>(Mt[1]);
8526  vfloat4 row2 = Mt[2];
8527  vfloat4 row3 = shuffle<2,3,0,1>(Mt[3]);
8528 #endif
8529  // At this point, the row variables should contain the following indices
8530  // of the original input matrix:
8531  // row0 = 0 4 8 12
8532  // row1 = 9 13 1 5
8533  // row2 = 2 6 10 14
8534  // row3 = 11 15 3 7
8535 
8536  // -----------------------------------------------
8537  tmp1 = row2 * row3;
8538  tmp1 = shuffle<1,0,3,2>(tmp1);
8539  minor0 = row1 * tmp1;
8540  minor1 = row0 * tmp1;
8541  tmp1 = shuffle<2,3,0,1>(tmp1);
8542  minor0 = (row1 * tmp1) - minor0;
8543  minor1 = (row0 * tmp1) - minor1;
8544  minor1 = shuffle<2,3,0,1>(minor1);
8545  // -----------------------------------------------
8546  tmp1 = row1 * row2;
8547  tmp1 = shuffle<1,0,3,2>(tmp1);
8548  minor0 = (row3 * tmp1) + minor0;
8549  minor3 = row0 * tmp1;
8550  tmp1 = shuffle<2,3,0,1>(tmp1);
8551  minor0 = minor0 - (row3 * tmp1);
8552  minor3 = (row0 * tmp1) - minor3;
8553  minor3 = shuffle<2,3,0,1>(minor3);
8554  // -----------------------------------------------
8555  tmp1 = shuffle<2,3,0,1>(row1) * row3;
8556  tmp1 = shuffle<1,0,3,2>(tmp1);
8557  row2 = shuffle<2,3,0,1>(row2);
8558  minor0 = (row2 * tmp1) + minor0;
8559  minor2 = row0 * tmp1;
8560  tmp1 = shuffle<2,3,0,1>(tmp1);
8561  minor0 = minor0 - (row2 * tmp1);
8562  minor2 = (row0 * tmp1) - minor2;
8563  minor2 = shuffle<2,3,0,1>(minor2);
8564  // -----------------------------------------------
8565  tmp1 = row0 * row1;
8566  tmp1 = shuffle<1,0,3,2>(tmp1);
8567  minor2 = (row3 * tmp1) + minor2;
8568  minor3 = (row2 * tmp1) - minor3;
8569  tmp1 = shuffle<2,3,0,1>(tmp1);
8570  minor2 = (row3 * tmp1) - minor2;
8571  minor3 = minor3 - (row2 * tmp1);
8572  // -----------------------------------------------
8573  tmp1 = row0 * row3;
8574  tmp1 = shuffle<1,0,3,2>(tmp1);
8575  minor1 = minor1 - (row2 * tmp1);
8576  minor2 = (row1 * tmp1) + minor2;
8577  tmp1 = shuffle<2,3,0,1>(tmp1);
8578  minor1 = (row2 * tmp1) + minor1;
8579  minor2 = minor2 - (row1 * tmp1);
8580  // -----------------------------------------------
8581  tmp1 = row0 * row2;
8582  tmp1 = shuffle<1,0,3,2>(tmp1);
8583  minor1 = (row3 * tmp1) + minor1;
8584  minor3 = minor3 - (row1 * tmp1);
8585  tmp1 = shuffle<2,3,0,1>(tmp1);
8586  minor1 = minor1 - (row3 * tmp1);
8587  minor3 = (row1 * tmp1) + minor3;
8588  // -----------------------------------------------
8589  det = row0 * minor0;
8590  float det0 = reduce_add(det);
8591  float tmp1_0 = 1.0f / det0;
8592  det0 = (tmp1_0 + tmp1_0) - (det0 * tmp1_0 * tmp1_0);
8593  det = vfloat4(det0);
8594  return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
8595 }
8596 
8597 
8598 
8599 inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {
8600  const float *m = (const float *)&M;
8601  cout << m[0];
8602  for (int i = 1; i < 16; ++i)
8603  cout << ' ' << m[i];
8604  return cout;
8605 }
8606 
8607 
8608 
8610  return M.transformp (V);
8611 }
8612 
8614  return M.transformv (V);
8615 }
8616 
8618 {
8619  return M.transformvT (V);
8620 }
8621 
8622 
8624 {
8625  return matrix44(M).transformp(V);
8626 }
8627 
8629 {
8630  return matrix44(M).transformv(V);
8631 }
8632 
8634 {
8635  return matrix44(M).transformvT(V);
8636 }
8637 
8638 
8639 
8640 //////////////////////////////////////////////////////////////////////
8641 // vfloat8 implementation
8642 
8645  return m_val[i];
8646 }
8647 
8650  return m_val[i];
8651 }
8652 
8653 
8654 inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val) {
8655  cout << val[0];
8656  for (int i = 1; i < val.elements; ++i)
8657  cout << ' ' << val[i];
8658  return cout;
8659 }
8660 
8661 
8663 #if OIIO_SIMD_AVX
8664  return _mm256_castps256_ps128 (simd());
8665 #else
8666  return m_4[0];
8667 #endif
8668 }
8669 
8671 #if OIIO_SIMD_AVX
8672  return _mm256_extractf128_ps (simd(), 1);
8673 #else
8674  return m_4[1];
8675 #endif
8676 }
8677 
8678 
8680 #if OIIO_SIMD_AVX
8681  __m256 r = _mm256_castps128_ps256 (lo);
8682  m_simd = _mm256_insertf128_ps (r, hi, 1);
8683  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
8684  // FIXME: when would that not be available?
8685 #else
8686  m_4[0] = lo;
8687  m_4[1] = hi;
8688 #endif
8689 }
8690 
8691 
8693 #if OIIO_SIMD_AVX
8694  m_simd = _mm256_cvtepi32_ps (ival);
8695 #else
8696  SIMD_CONSTRUCT (float(ival[i]));
8697 #endif
8698 }
8699 
8700 
8702 #if OIIO_SIMD_AVX
8703  return _mm256_setzero_ps();
8704 #else
8705  return vfloat8(0.0f);
8706 #endif
8707 }
8708 
8710  return vfloat8(1.0f);
8711 }
8712 
8713 OIIO_FORCEINLINE const vfloat8 vfloat8::Iota (float start, float step) {
8714  return vfloat8 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
8715  start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step);
8716 }
8717 
8718 /// Set all components to 0.0
8720 #if OIIO_SIMD_AVX
8721  m_simd = _mm256_setzero_ps();
8722 #else
8723  load (0.0f);
8724 #endif
8725 }
8726 
8727 
8728 
8730 #if OIIO_SIMD_AVX
8731  m_simd = _mm256_set1_ps (val);
8732 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8733  m_4[0].load(val);
8734  m_4[1].load(val);
8735 #else
8736  SIMD_CONSTRUCT (val);
8737 #endif
8738 }
8739 
// Load 8 individual scalar component values, a..h in lane order 0..7.
8740 OIIO_FORCEINLINE void vfloat8::load (float a, float b, float c, float d,
8741  float e, float f, float g, float h) {
8742 #if OIIO_SIMD_AVX
 // N.B. _mm256_set_ps takes its arguments from high lane to low lane,
 // hence the reversed order here.
8743  m_simd = _mm256_set_ps (h, g, f, e, d, c, b, a);
8744 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
 // Fill the two 4-wide halves separately.
8745  m_4[0].load(a, b, c, d);
8746  m_4[1].load(e, f, g, h);
8747 #else
8748  m_val[0] = a;
8749  m_val[1] = b;
8750  m_val[2] = c;
8751  m_val[3] = d;
8752  m_val[4] = e;
8753  m_val[5] = f;
8754  m_val[6] = g;
8755  m_val[7] = h;
8756 #endif
8757 }
8758 
8759 
// Load 8 contiguous floats from (possibly unaligned) memory.
8760 OIIO_FORCEINLINE void vfloat8::load (const float *values) {
8761 #if OIIO_SIMD_AVX
8762  m_simd = _mm256_loadu_ps (values);
8763 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8764  m_4[0].load(values);
8765  m_4[1].load(values+4);
8766 #else
8767  SIMD_CONSTRUCT (values[i]);
8768 #endif
8769 }
8770 
8771 
// Load only the first n floats (0 <= n <= 8); remaining lanes are zeroed.
8772 OIIO_FORCEINLINE void vfloat8::load (const float *values, int n) {
8773  OIIO_DASSERT (n >= 0 && n <= elements);
// The AVX512 masked-load path is deliberately disabled ("#if 0"): it
// benchmarked slower than the split-halves path on the author's hardware.
8774 #if 0 && OIIO_AVX512VL_ENABLED
8775  // This SHOULD be fast, but in my benchmarks, it is slower!
8776  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
8777  // Re-test this periodically with new Intel hardware.
8778  m_simd = _mm256_maskz_loadu_ps ((~(0xff << n)), values);
8779 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
 // Split into two vfloat4 halves: full low half + partial high half,
 // or partial low half + cleared high half.
8780  if (n > 4) {
8781  vfloat4 lo, hi;
8782  lo.load (values);
8783  hi.load (values+4, n-4);
8784  m_4[0] = lo;
8785  m_4[1] = hi;
8786  } else {
8787  vfloat4 lo, hi;
8788  lo.load (values, n);
8789  hi.clear();
8790  m_4[0] = lo;
8791  m_4[1] = hi;
8792  }
8793 #else
8794  for (int i = 0; i < n; ++i)
8795  m_val[i] = values[i];
8796  for (int i = n; i < paddedelements; ++i)
8797  m_val[i] = 0;
8798 #endif
8799 }
8800 
8801 
// Load 8 unsigned shorts, converting each to float.
8802 OIIO_FORCEINLINE void vfloat8::load (const unsigned short *values) {
8803 #if OIIO_SIMD_AVX
8804  // Rely on the ushort->int conversion, then convert to float
8805  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8806 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8807  m_4[0].load(values);
8808  m_4[1].load(values+4);
8809 #else
8810  SIMD_CONSTRUCT (values[i]);
8811 #endif
8812 }
8813 
8814 
// Load 8 signed shorts, converting each to float.
8815 OIIO_FORCEINLINE void vfloat8::load (const short *values) {
8816 #if OIIO_SIMD_AVX
8817  // Rely on the short->int conversion, then convert to float
8818  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8819 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8820  m_4[0].load(values);
8821  m_4[1].load(values+4);
8822 #else
8823  SIMD_CONSTRUCT (values[i]);
8824 #endif
8825 }
8826 
8827 
// Load 8 unsigned chars, converting each to float (via vint8's
// uchar->int conversion on the AVX path).
8828 OIIO_FORCEINLINE void vfloat8::load (const unsigned char *values) {
8829 #if OIIO_SIMD_AVX
8830  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8831 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8832  m_4[0].load(values);
8833  m_4[1].load(values+4);
8834 #else
8835  SIMD_CONSTRUCT (values[i]);
8836 #endif
8837 }
8838 
8839 
// Load 8 signed chars, converting each to float.
8840 OIIO_FORCEINLINE void vfloat8::load (const char *values) {
8841 #if OIIO_SIMD_AVX
8842  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8843 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8844  m_4[0].load(values);
8845  m_4[1].load(values+4);
8846 #else
8847  SIMD_CONSTRUCT (values[i]);
8848 #endif
8849 }
8850 
// Load 8 half-precision floats, converting to float. Only compiled when
// an Imath/half header has already been included.
8851 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8852 OIIO_FORCEINLINE void vfloat8::load (const half *values) {
8853 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8854  /* Enabled 16 bit float instructions! */
 // The 8 halfs occupy 16 bytes; load them as 4 ints, then use the F16C
 // hardware conversion to expand to 8 floats.
8855  vint4 a ((const int *)values);
8856  m_simd = _mm256_cvtph_ps (a);
8857 #elif OIIO_SIMD_SSE >= 2 || OIIO_SIMD_NEON
 // Convert each 4-wide half separately.
8858  m_4[0] = vfloat4(values);
8859  m_4[1] = vfloat4(values+4);
8860 #else /* No SIMD defined: */
8861  SIMD_CONSTRUCT (values[i]);
8862 #endif
8863 }
8864 #endif /* _HALF_H_ or _IMATH_H_ */
8865 
8866 
// Store all 8 components to (possibly unaligned) memory.
8867 OIIO_FORCEINLINE void vfloat8::store (float *values) const {
8868 #if OIIO_SIMD_AVX
8869  // Use an unaligned store -- it's just as fast when the memory turns
8870  // out to be aligned, nearly as fast even when unaligned. Not worth
8871  // the headache of using stores that require alignment.
8872  _mm256_storeu_ps (values, m_simd);
8873 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8874  m_4[0].store(values);
8875  m_4[1].store(values+4);
8876 #else
8877  SIMD_DO (values[i] = m_val[i]);
8878 #endif
8879 }
8880 
8881 
// Store only the first n components (0 <= n <= 8); memory past values[n-1]
// is untouched.
8882 OIIO_FORCEINLINE void vfloat8::store (float *values, int n) const {
8883  OIIO_DASSERT (n >= 0 && n <= elements);
// AVX512 masked-store path deliberately disabled ("#if 0"); see the
// benchmark note below.
8884 #if 0 && OIIO_AVX512VL_ENABLED
8885  // This SHOULD be fast, but in my benchmarks, it is slower!
8886  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
8887  // Re-test this periodically with new Intel hardware.
8888  _mm256_mask_storeu_ps (values, __mmask8(~(0xff << n)), m_simd);
8889 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
 // Partial store of the low half alone, or full low half plus partial
 // high half.
8890  if (n <= 4) {
8891  lo().store (values, n);
8892  } else if (n <= 8) {
8893  lo().store (values);
8894  hi().store (values+4, n-4);
8895  }
8896 #else
8897  for (int i = 0; i < n; ++i)
8898  values[i] = m_val[i];
8899 #endif
8900 }
8901 
// Store all 8 components as half-precision floats (only when a half
// header is included).
8902 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8903 OIIO_FORCEINLINE void vfloat8::store (half *values) const {
8904 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
 // Hardware F16C conversion: 8 floats -> 8 halfs (16 bytes), written
 // with an unaligned 128-bit store.
8905  __m128i h = _mm256_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
8906  _mm_storeu_si128 ((__m128i *)values, h);
8907 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8908  m_4[0].store(values);
8909  m_4[1].store(values+4);
8910 #else
8911  SIMD_DO (values[i] = m_val[i]);
8912 #endif
8913 }
8914 #endif
8915 
8916 
// Masked load from an integer bitmask: lane i gets values[i] when bit i
// of `mask` is set, else 0.
8917 OIIO_FORCEINLINE void vfloat8::load_mask (int mask, const float *values) {
8918 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8919  m_simd = _mm256_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
8920 #elif OIIO_SIMD_AVX
 // AVX1 has no mask registers: expand the bitmask to a per-lane vector
 // mask for maskload.
8921  m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)));
8922 #else
8923  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
8924 #endif
8925 }
8926 
8927 
// Masked load from a vbool8: lane i gets values[i] when mask[i] is true,
// else 0.
8928 OIIO_FORCEINLINE void vfloat8::load_mask (const vbool8& mask, const float *values) {
8929 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8930  m_simd = _mm256_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
8931 #elif OIIO_SIMD_AVX
8932  m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(mask));
8933 #else
8934  SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
8935 #endif
8936 }
8937 
8938 
// Masked store from an integer bitmask: only lanes whose mask bit is set
// are written; other memory locations are untouched.
8939 OIIO_FORCEINLINE void vfloat8::store_mask (int mask, float *values) const {
8940 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8941  _mm256_mask_storeu_ps (values, __mmask8(mask), m_simd);
8942 #elif OIIO_SIMD_AVX
8943  _mm256_maskstore_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd);
8944 #else
8945  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
8946 #endif
8947 }
8948 
8949 
// Masked store from a vbool8: only lanes where mask[i] is true are written.
8950 OIIO_FORCEINLINE void vfloat8::store_mask (const vbool8& mask, float *values) const {
8951 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8952  _mm256_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
8953 #elif OIIO_SIMD_AVX
8954  _mm256_maskstore_ps (values, _mm256_castps_si256(mask.simd()), m_simd);
8955 #else
8956  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
8957 #endif
8958 }
8959 
8960 
// Gather: lane i is loaded from baseptr + vindex[i]*scale bytes. `scale`
// is a compile-time template parameter because the AVX2 gather intrinsic
// requires an immediate scale.
8961 template <int scale>
8962 OIIO_FORCEINLINE void
8963 vfloat8::gather (const value_t *baseptr, const vint_t& vindex)
8964 {
8965 #if OIIO_SIMD_AVX >= 2
8966  m_simd = _mm256_i32gather_ps (baseptr, vindex, scale);
8967 #else
8968  SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
8969 #endif
8970 }
8971 
// Masked gather: lane i is loaded only where mask[i] is true; other
// lanes keep their current value.
8972 template<int scale>
8973 OIIO_FORCEINLINE void
8974 vfloat8::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
8975 {
8976 #if OIIO_SIMD_AVX >= 2
8977  m_simd = _mm256_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
8978 #else
8979  SIMD_DO (if (mask[i]) m_val[i] = *(const value_t *)((const char *)baseptr + vindex[i]*scale));
8980 #endif
8981 }
8982 
// Scatter: lane i is stored to baseptr + vindex[i]*scale bytes. Hardware
// scatter requires AVX-512VL; otherwise fall back to a scalar loop.
// NOTE(review): behavior with duplicate indices follows the intrinsic /
// loop order -- callers should avoid colliding indices.
8983 template<int scale>
8984 OIIO_FORCEINLINE void
8985 vfloat8::scatter (value_t *baseptr, const vint_t& vindex) const
8986 {
8987 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8988  _mm256_i32scatter_ps (baseptr, vindex, m_simd, scale);
8989 #else
8990  SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
8991 #endif
8992 }
8993 
8994 template<int scale>
8995 OIIO_FORCEINLINE void
8997  const vint_t& vindex) const
8998 {
8999 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9000  _mm256_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
9001 #else
9002  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
9003 #endif
9004 }
9005 
9006 
9007 
9009 #if OIIO_SIMD_AVX
9010  return _mm256_add_ps (a, b);
9011 #else
9012  return vfloat8 (a.lo()+b.lo(), a.hi()+b.hi());
9013 #endif
9014 }
9015 
9017  return a = a + b;
9018 }
9019 
9021 #if OIIO_SIMD_AVX
9022  return _mm256_sub_ps (_mm256_setzero_ps(), a);
9023 #else
9024  return vfloat8 (-a.lo(), -a.hi());
9025 #endif
9026 }
9027 
9029 #if OIIO_SIMD_AVX
9030  return _mm256_sub_ps (a, b);
9031 #else
9032  return vfloat8 (a.lo()-b.lo(), a.hi()-b.hi());
9033 #endif
9034 }
9035 
9037  return a = a - b;
9038 }
9039 
9041 #if OIIO_SIMD_AVX
9042  return _mm256_mul_ps (a.m_simd, _mm256_set1_ps(b));
9043 #else
9044  return vfloat8 (a.lo()*b, a.hi()*b);
9045 #endif
9046 }
9047 
9049  return b * a;
9050 }
9051 
9053 #if OIIO_SIMD_AVX
9054  return _mm256_mul_ps (a, b);
9055 #else
9056  return vfloat8 (a.lo()*b.lo(), a.hi()*b.hi());
9057 #endif
9058 }
9059 
9061  return a = a * b;
9062 }
9063 
9065 #if OIIO_SIMD_AVX
9066  return _mm256_div_ps (a, b);
9067 #else
9068  return vfloat8 (a.lo()/b.lo(), a.hi()/b.hi());
9069 #endif
9070 }
9071 
9073  return a = a / b;
9074 }
9075 
9077 #if OIIO_SIMD_AVX
9078  return _mm256_cmp_ps (a, b, _CMP_EQ_OQ);
9079 #else
9080  return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
9081 #endif
9082 }
9083 
9085 #if OIIO_SIMD_AVX
9086  return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ);
9087 #else
9088  return vbool8 (a.lo() != b.lo(), a.hi() != b.hi());
9089 #endif
9090 }
9091 
9093 #if OIIO_SIMD_AVX
9094  return _mm256_cmp_ps (a, b, _CMP_LT_OQ);
9095 #else
9096  return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
9097 #endif
9098 }
9099 
9101 #if OIIO_SIMD_AVX
9102  return _mm256_cmp_ps (a, b, _CMP_GT_OQ);
9103 #else
9104  return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
9105 #endif
9106 }
9107 
9109 #if OIIO_SIMD_AVX
9110  return _mm256_cmp_ps (a, b, _CMP_GE_OQ);
9111 #else
9112  return vbool8 (a.lo() >= b.lo(), a.hi() >= b.hi());
9113 #endif
9114 }
9115 
9117 #if OIIO_SIMD_AVX
9118  return _mm256_cmp_ps (a, b, _CMP_LE_OQ);
9119 #else
9120  return vbool8 (a.lo() <= b.lo(), a.hi() <= b.hi());
9121 #endif
9122 }
9123 
9124 
9125 // Implementation had to be after the definition of vfloat8.
9127 {
9128 #if OIIO_SIMD_AVX
9129  m_simd = _mm256_cvttps_epi32(f);
9130 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9131  *this = vint8 (vint4(f.lo()), vint4(f.hi()));
9132 #else
9133  SIMD_CONSTRUCT ((int) f[i]);
9134 #endif
9135 }
9136 
9137 
9138 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
9140 #if OIIO_SIMD_AVX >= 2
9141  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
9142  return _mm256_permutevar8x32_ps (a, index);
9143 #else
9144  return vfloat8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
9145 #endif
9146 }
9147 
// Single-index shuffle: broadcast element i of `a` into all 8 lanes.
9148 template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
9149 #if OIIO_SIMD_AVX >= 2
 // permutevar with a constant index vector of all i's.
9150  return _mm256_permutevar8x32_ps (a, vint8(i));
9151 #else
 // Delegate to the general 8-index shuffle with every index = i.
9152  return shuffle<i,i,i,i,i,i,i,i>(a);
9153 #endif
9154 }
9155 
9156 
9157 template<int i>
9159 #if OIIO_SIMD_AVX_NO_FIXME
9160  // Looks like the fastest we can do it is to extract a vfloat4,
9161  // shuffle its one element everywhere, then extract element 0.
9162  _m128 f4 = _mm256_extractf128_ps (i >> 2);
9163  int j = i & 3;
9164  return _mm_cvtss_f32(shuffle_sse<j,j,j,j>(a.simd()));
9165 #else
9166  return v[i];
9167 #endif
9168 }
9169 
9170 
9171 template<int i>
9173 #if OIIO_SIMD_AVX_NO_FIXME
9174  return _mm256_insert_epi32 (a, val, i);
9175 #else
9176  vfloat8 tmp = a;
9177  tmp[i] = val;
9178  return tmp;
9179 #endif
9180 }
9181 
9182 
// Named accessors and mutators for the first four components (lanes 0-3)
// of a vfloat8, implemented via the generic extract/insert helpers.
9183 OIIO_FORCEINLINE float vfloat8::x () const { return extract<0>(*this); }
9184 OIIO_FORCEINLINE float vfloat8::y () const { return extract<1>(*this); }
9185 OIIO_FORCEINLINE float vfloat8::z () const { return extract<2>(*this); }
9186 OIIO_FORCEINLINE float vfloat8::w () const { return extract<3>(*this); }
9187 OIIO_FORCEINLINE void vfloat8::set_x (float val) { *this = insert<0>(*this, val); }
9188 OIIO_FORCEINLINE void vfloat8::set_y (float val) { *this = insert<1>(*this, val); }
9189 OIIO_FORCEINLINE void vfloat8::set_z (float val) { *this = insert<2>(*this, val); }
9190 OIIO_FORCEINLINE void vfloat8::set_w (float val) { *this = insert<3>(*this, val); }
9191 
9192 
9194 {
9195 #if OIIO_SIMD_AVX
9196  return _mm256_castps_si256 (x.simd());
9197 #else
9198  return vint8(bitcast_to_int(x.lo()), bitcast_to_int(x.hi()));
9199 #endif
9200 }
9201 
9203 {
9204 #if OIIO_SIMD_AVX
9205  return _mm256_castsi256_ps (x.simd());
9206 #else
9207  return vfloat8(bitcast_to_float(x.lo()), bitcast_to_float(x.hi()));
9208 #endif
9209 }
9210 
9211 
9213 #if OIIO_SIMD_AVX
9214  // From Syrah:
9215  vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
9216  vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
9217  // get efgh in the 0-idx slot
9218  vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
9219  vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
9220  return shuffle<0>(final_sum);
9221 #else
9222  vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
9223  return vfloat8(hadd4, hadd4);
9224 #endif
9225 }
9226 
9227 
9229 #if OIIO_SIMD_AVX >= 2
9230  return extract<0>(vreduce_add(v));
9231 #else
9232  return reduce_add(v.lo()) + reduce_add(v.hi());
9233 #endif
9234 }
9235 
9236 
9238 {
9239 #if OIIO_SIMD_AVX
9240  return _mm256_blendv_ps (a, b, mask);
9241 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9242  return vfloat8 (blend (a.lo(), b.lo(), mask.lo()),
9243  blend (a.hi(), b.hi(), mask.hi()));
9244 #else
9245  SIMD_RETURN (vfloat8, mask[i] ? b[i] : a[i]);
9246 #endif
9247 }
9248 
9249 
9251 {
9252 #if OIIO_SIMD_AVX
9253  return _mm256_and_ps(mask, a);
9254 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9255  return vfloat8 (blend0 (a.lo(), mask.lo()),
9256  blend0 (a.hi(), mask.hi()));
9257 #else
9258  SIMD_RETURN (vfloat8, mask[i] ? a[i] : 0.0f);
9259 #endif
9260 }
9261 
9262 
9264 {
9265 #if OIIO_SIMD_AVX
9266  return _mm256_andnot_ps(mask, a);
9267 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9268  return vfloat8 (blend0not (a.lo(), mask.lo()),
9269  blend0not (a.hi(), mask.hi()));
9270 #else
9271  SIMD_RETURN (vfloat8, mask[i] ? 0.0f : a[i]);
9272 #endif
9273 }
9274 
9275 
9277 {
9278  return blend (b, a, mask);
9279 }
9280 
9281 
9283 #if OIIO_SIMD_SSE
9284  return blend0not (a/b, b == vfloat8::Zero());
9285 #else
9286  SIMD_RETURN (vfloat8, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
9287 #endif
9288 }
9289 
9290 
9292 {
9293 #if OIIO_SIMD_AVX
9294  // Just clear the sign bit for cheap fabsf
9295  return _mm256_and_ps (a.simd(), _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
9296 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9297  return vfloat8(abs(a.lo()), abs(a.hi()));
9298 #else
9299  SIMD_RETURN (vfloat8, fabsf(a[i]));
9300 #endif
9301 }
9302 
9303 
9305 {
9306  vfloat8 one(1.0f);
9307  return blend (one, -one, a < vfloat8::Zero());
9308 }
9309 
9310 
9312 {
9313 #if OIIO_SIMD_AVX
9314  return _mm256_ceil_ps (a);
9315 #else
9316  SIMD_RETURN (vfloat8, ceilf(a[i]));
9317 #endif
9318 }
9319 
9321 {
9322 #if OIIO_SIMD_AVX
9323  return _mm256_floor_ps (a);
9324 #else
9325  SIMD_RETURN (vfloat8, floorf(a[i]));
9326 #endif
9327 }
9328 
9330 {
9331 #if OIIO_SIMD_AVX
9332  return _mm256_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9333 #else
9334  SIMD_RETURN (vfloat8, roundf(a[i]));
9335 #endif
9336 }
9337 
9339 {
9340  // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
9341 #if OIIO_SIMD_AVX
9342  return vint8(floor(a));
9343 #elif OIIO_SIMD_SSE /* SSE2/3 */
9344  return vint8 (ifloor(a.lo()), ifloor(a.hi()));
9345 #else
9346  SIMD_RETURN (vint8, (int)floorf(a[i]));
9347 #endif
9348 }
9349 
9350 
9352 {
9353  return vint8 (round(a));
9354 }
9355 
9356 
9357 
9359 {
9360 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
9361  vfloat8 r = _mm256_rcp14_ps(a);
9362  return r * nmadd(r,a,vfloat8(2.0f));
9363 #elif OIIO_SIMD_AVX
9364  vfloat8 r = _mm256_rcp_ps(a);
9365  return r * nmadd(r,a,vfloat8(2.0f));
9366 #else
9367  return vfloat8(rcp_fast(a.lo()), rcp_fast(a.hi()));
9368 #endif
9369 }
9370 
9371 
9373 {
9374 #if OIIO_SIMD_AVX
9375  return _mm256_sqrt_ps (a.simd());
9376 #else
9377  SIMD_RETURN (vfloat8, sqrtf(a[i]));
9378 #endif
9379 }
9380 
9381 
9382 
9384 {
9385 #if OIIO_SIMD_AVX
9386  return _mm256_div_ps (_mm256_set1_ps(1.0f), _mm256_sqrt_ps (a.simd()));
9387 #else
9388  SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
9389 #endif
9390 }
9391 
9392 
9393 
9395 {
9396 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9397  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
9398  return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
9399 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9400  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
9401  return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
9402 #elif OIIO_SIMD_AVX
9403  return _mm256_rsqrt_ps (a.simd());
9404 #elif OIIO_SIMD_SSE
9405  return vfloat8 (rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
9406 #else
9407  SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
9408 #endif
9409 }
9410 
9411 
9412 
9414 {
9415 #if OIIO_SIMD_AVX
9416  return _mm256_min_ps (a, b);
9417 #else
9418  return vfloat8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
9419 #endif
9420 }
9421 
9423 {
9424 #if OIIO_SIMD_AVX
9425  return _mm256_max_ps (a, b);
9426 #else
9427  return vfloat8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
9428 #endif
9429 }
9430 
9431 
9433 #if OIIO_SIMD_AVX
9434  return _mm256_andnot_ps (a.simd(), b.simd());
9435 #else
9436  vint8 ai = bitcast_to_int(a);
9437  vint8 bi = bitcast_to_int(b);
9438  return bitcast_to_float(vint8(~(ai[0]) & bi[0],
9439  ~(ai[1]) & bi[1],
9440  ~(ai[2]) & bi[2],
9441  ~(ai[3]) & bi[3],
9442  ~(ai[4]) & bi[4],
9443  ~(ai[5]) & bi[5],
9444  ~(ai[6]) & bi[6],
9445  ~(ai[7]) & bi[7]));
9446 #endif
9447 }
9448 
9449 
9451  const simd::vfloat8& c)
9452 {
9453 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9454  // If we are sure _mm256_fmadd_ps intrinsic is available, use it.
9455  return _mm256_fmadd_ps (a, b, c);
9456 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9457  return vfloat8 (madd(a.lo(), b.lo(), c.lo()),
9458  madd(a.hi(), b.hi(), c.hi()));
9459 #else
9460  // Fallback: just use regular math and hope for the best.
9461  return a * b + c;
9462 #endif
9463 }
9464 
9465 
9467  const simd::vfloat8& c)
9468 {
9469 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9470  // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
9471  return _mm256_fmsub_ps (a, b, c);
9472 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9473  return vfloat8 (msub(a.lo(), b.lo(), c.lo()),
9474  msub(a.hi(), b.hi(), c.hi()));
9475 #else
9476  // Fallback: just use regular math and hope for the best.
9477  return a * b - c;
9478 #endif
9479 }
9480 
9481 
9482 
9484  const simd::vfloat8& c)
9485 {
9486 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9487  // If we are sure _mm256_fnmadd_ps intrinsic is available, use it.
9488  return _mm256_fnmadd_ps (a, b, c);
9489 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9490  return vfloat8 (nmadd(a.lo(), b.lo(), c.lo()),
9491  nmadd(a.hi(), b.hi(), c.hi()));
9492 #else
9493  // Fallback: just use regular math and hope for the best.
9494  return c - a * b;
9495 #endif
9496 }
9497 
9498 
9499 
9501  const simd::vfloat8& c)
9502 {
9503 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9504  // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
9505  return _mm256_fnmsub_ps (a, b, c);
9506 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9507  return vfloat8 (nmsub(a.lo(), b.lo(), c.lo()),
9508  nmsub(a.hi(), b.hi(), c.hi()));
9509 #else
9510  // Fallback: just use regular math and hope for the best.
9511  return -(a * b) - c;
9512 #endif
9513 }
9514 
9515 
9516 
9517 
9518 //////////////////////////////////////////////////////////////////////
9519 // vfloat16 implementation
9520 
9523  return m_val[i];
9524 }
9525 
9528  return m_val[i];
9529 }
9530 
9531 
9532 inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val) {
9533  cout << val[0];
9534  for (int i = 1; i < val.elements; ++i)
9535  cout << ' ' << val[i];
9536  return cout;
9537 }
9538 
9539 
9541 #if OIIO_SIMD_AVX >= 512
9542  return _mm512_castps512_ps256 (simd());
9543 #else
9544  return m_8[0];
9545 #endif
9546 }
9547 
9549 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512DQ_ENABLED
9550  return _mm512_extractf32x8_ps (simd(), 1);
9551 #else
9552  return m_8[1];
9553 #endif
9554 }
9555 
9556 
// Construct from 16 individual float values; delegates to the matching
// 16-argument load() so the SIMD/scalar dispatch lives in one place.
 9557 OIIO_FORCEINLINE vfloat16::vfloat16 (float v0, float v1, float v2, float v3,
 9558  float v4, float v5, float v6, float v7,
 9559  float v8, float v9, float v10, float v11,
 9560  float v12, float v13, float v14, float v15) {
 9561  load (v0, v1, v2, v3, v4, v5, v6, v7,
 9562  v8, v9, v10, v11, v12, v13, v14, v15);
 9563 }
9564 
9566 #if OIIO_SIMD_AVX >= 512
9567  __m512 r = _mm512_castps256_ps512 (lo);
9568  m_simd = _mm512_insertf32x8 (r, hi, 1);
9569 #else
9570  m_8[0] = lo;
9571  m_8[1] = hi;
9572 #endif
9573 }
9574 
// Construct a vfloat16 by concatenating four vfloat4's:
// a -> elements 0-3, b -> 4-7, c -> 8-11, d -> 12-15.
 9575 OIIO_FORCEINLINE vfloat16::vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d) {
 9576 #if OIIO_SIMD_AVX >= 512
// Broadcast `a` to fill the 512-bit register, then overwrite the upper
// three 128-bit lanes with b, c, d.
 9577  m_simd = _mm512_broadcast_f32x4(a);
 9578  m_simd = _mm512_insertf32x4 (m_simd, b, 1);
 9579  m_simd = _mm512_insertf32x4 (m_simd, c, 2);
 9580  m_simd = _mm512_insertf32x4 (m_simd, d, 3);
 9581 #else
// Fallback: assemble the two vfloat8 halves.
 9582  m_8[0] = vfloat8(a,b);
 9583  m_8[1] = vfloat8(c,d);
 9584 #endif
 9585 }
9586 
9587 
9589 #if OIIO_SIMD_AVX >= 512
9590  m_simd = _mm512_cvtepi32_ps (ival);
9591 #else
9592  SIMD_CONSTRUCT (float(ival[i]));
9593 #endif
9594 }
9595 
9596 
9598 #if OIIO_SIMD_AVX >= 512
9599  return _mm512_setzero_ps();
9600 #else
9601  return vfloat16(0.0f);
9602 #endif
9603 }
9604 
9606  return vfloat16(1.0f);
9607 }
9608 
// Return (start, start+step, start+2*step, ..., start+15*step).
// Each element is computed as start + i*step (not repeated addition),
// so float rounding does not accumulate across elements.
 9609 OIIO_FORCEINLINE const vfloat16 vfloat16::Iota (float start, float step) {
 9610  return vfloat16 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
 9611  start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step,
 9612  start+8.0f*step, start+9.0f*step, start+10.0f*step, start+11.0f*step,
 9613  start+12.0f*step, start+13.0f*step, start+14.0f*step, start+15.0f*step);
 9614 }
9615 
9616 /// Set all components to 0.0
9618 #if OIIO_SIMD_AVX >= 512
9619  m_simd = _mm512_setzero_ps();
9620 #else
9621  load (0.0f);
9622 #endif
9623 }
9624 
9625 
9627 #if OIIO_SIMD_AVX >= 512
9628  m_simd = _mm512_set1_ps (a);
9629 #else
9630  m_8[0].load (a);
9631  m_8[1].load (a);
9632 #endif
9633 }
9634 
9635 
// Set the 16 components from 16 individual float arguments (v0 -> element
// 0, ..., v15 -> element 15).
 9636 OIIO_FORCEINLINE void vfloat16::load (float v0, float v1, float v2, float v3,
 9637  float v4, float v5, float v6, float v7,
 9638  float v8, float v9, float v10, float v11,
 9639  float v12, float v13, float v14, float v15) {
 9640 #if OIIO_SIMD_AVX >= 512
// _mm512_setr_ps takes arguments in memory (low-to-high) order.
 9641  m_simd = _mm512_setr_ps (v0, v1, v2, v3, v4, v5, v6, v7,
 9642  v8, v9, v10, v11, v12, v13, v14, v15);
 9643 #else
// Scalar fallback: assign each element directly.
 9644  m_val[ 0] = v0;
 9645  m_val[ 1] = v1;
 9646  m_val[ 2] = v2;
 9647  m_val[ 3] = v3;
 9648  m_val[ 4] = v4;
 9649  m_val[ 5] = v5;
 9650  m_val[ 6] = v6;
 9651  m_val[ 7] = v7;
 9652  m_val[ 8] = v8;
 9653  m_val[ 9] = v9;
 9654  m_val[10] = v10;
 9655  m_val[11] = v11;
 9656  m_val[12] = v12;
 9657  m_val[13] = v13;
 9658  m_val[14] = v14;
 9659  m_val[15] = v15;
 9660 #endif
 9661 }
9662 
9663 
// Load 16 contiguous floats from `values`; no alignment requirement
// (uses an unaligned load on the AVX-512 path).
 9664 OIIO_FORCEINLINE void vfloat16::load (const float *values) {
 9665 #if OIIO_SIMD_AVX >= 512
 9666  m_simd = _mm512_loadu_ps (values);
 9667 #else
// Fallback: fill the two vfloat8 halves separately.
 9668  m_8[0].load (values);
 9669  m_8[1].load (values+8);
 9670 #endif
 9671 }
9672 
9673 
// Partial load: read only the first n floats (0 <= n <= 16); the
// remaining elements are zeroed.
 9674 OIIO_FORCEINLINE void vfloat16::load (const float *values, int n)
 9675 {
 9676  OIIO_DASSERT (n >= 0 && n <= elements);
 9677 #if OIIO_SIMD_AVX >= 512
// Masked zeroing load: ~(0xffff << n) has exactly the low n bits set.
 9678  m_simd = _mm512_maskz_loadu_ps (__mmask16(~(0xffff << n)), values);
 9679 #else
 9680  if (n > 8) {
 9681  m_8[0].load (values);
 9682  m_8[1].load (values+8, n-8);
 9683  } else {
 9684  m_8[0].load (values, n);
// 8 or fewer requested: the entire upper half must be zeroed.
 9685  m_8[1].clear ();
 9686  }
 9687 #endif
 9688 }
9689 
9690 
// Load 16 unsigned shorts and convert each to float.
 9691 OIIO_FORCEINLINE void vfloat16::load (const unsigned short *values) {
 9692 #if OIIO_SIMD_AVX >= 512
 9693  // Rely on the ushort->int conversion, then convert to float
 9694  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
 9695 #else
 9696  m_8[0].load (values);
 9697  m_8[1].load (values+8);
 9698 #endif
 9699 }
9700 
9701 
// Load 16 signed shorts and convert each to float.
 9702 OIIO_FORCEINLINE void vfloat16::load (const short *values) {
 9703 #if OIIO_SIMD_AVX >= 512
 9704  // Rely on the short->int conversion, then convert to float
 9705  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
 9706 #else
 9707  m_8[0].load (values);
 9708  m_8[1].load (values+8);
 9709 #endif
 9710 }
9711 
9712 
// Load 16 unsigned chars and convert each to float (via int widening).
 9713 OIIO_FORCEINLINE void vfloat16::load (const unsigned char *values) {
 9714 #if OIIO_SIMD_AVX >= 512
 9715  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
 9716 #else
 9717  m_8[0].load (values);
 9718  m_8[1].load (values+8);
 9719 #endif
 9720 }
9721 
9722 
// Load 16 (signed) chars and convert each to float (via int widening).
 9723 OIIO_FORCEINLINE void vfloat16::load (const char *values) {
 9724 #if OIIO_SIMD_AVX >= 512
 9725  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
 9726 #else
 9727  m_8[0].load (values);
 9728  m_8[1].load (values+8);
 9729 #endif
 9730 }
9731 
9732 
 9733 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
// Load 16 half-precision floats and widen to float. Only compiled when a
// half type header (Imath/OpenEXR) has already been included.
 9734 OIIO_FORCEINLINE void vfloat16::load (const half *values) {
 9735 #if OIIO_SIMD_AVX >= 512
 9736  /* Enabled 16 bit float instructions! */
// Reinterpret the 16 halfs as 8 ints (256 bits), then hardware-convert.
 9737  vint8 a ((const int *)values);
 9738  m_simd = _mm512_cvtph_ps (a);
 9739 #else
 9740  m_8[0].load (values);
 9741  m_8[1].load (values+8);
 9742 #endif
 9743 }
 9744 #endif /* _HALF_H_ or IMATH_HALF_H_ */
9745 
9746 
9747 
// Store all 16 floats to `values`; no alignment requirement.
 9748 OIIO_FORCEINLINE void vfloat16::store (float *values) const {
 9749 #if OIIO_SIMD_AVX >= 512
 9750  // Use an unaligned store -- it's just as fast when the memory turns
 9751  // out to be aligned, nearly as fast even when unaligned. Not worth
 9752  // the headache of using stores that require alignment.
 9753  _mm512_storeu_ps (values, m_simd);
 9754 #else
 9755  m_8[0].store (values);
 9756  m_8[1].store (values+8);
 9757 #endif
 9758 }
9759 
9760 
// Partial store: write only the first n floats (0 <= n <= 16).
 9761 OIIO_FORCEINLINE void vfloat16::store (float *values, int n) const {
 9762  OIIO_DASSERT (n >= 0 && n <= elements);
 9763  // FIXME: is this faster with AVX masked stores?
 9764 #if 0 && OIIO_SIMD_AVX >= 512
 9765  // This SHOULD be fast, but in my benchmarks, it is slower!
 9766  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
 9767  // Re-test this periodically with new Intel hardware.
 9768  _mm512_mask_storeu_ps (values, __mmask16(~(0xffff << n)), m_simd);
 9769 #else
// Split across the two vfloat8 halves; n == 16 uses the full store.
 9770  if (n <= 8) {
 9771  lo().store (values, n);
 9772  } else if (n < 16) {
 9773  lo().store (values);
 9774  hi().store (values+8, n-8);
 9775  } else {
 9776  store (values);
 9777  }
 9778 #endif
 9779 }
9780 
 9781 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
// Store all 16 elements as half-precision floats (round-to-nearest).
 9782 OIIO_FORCEINLINE void vfloat16::store (half *values) const {
 9783 #if OIIO_SIMD_AVX >= 512
// Hardware float->half conversion packs into a 256-bit integer register.
 9784  __m256i h = _mm512_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
 9785  _mm256_storeu_si256 ((__m256i *)values, h);
 9786 #else
 9787  m_8[0].store (values);
 9788  m_8[1].store (values+8);
 9789 #endif
 9790 }
 9791 #endif
9792 
9793 
// Masked load: for each element, load values[i] where mask[i] is true,
// otherwise zero that element.
 9794 OIIO_FORCEINLINE void vfloat16::load_mask (const vbool16 &mask, const float *values) {
 9795 #if OIIO_SIMD_AVX >= 512
 9796  m_simd = _mm512_maskz_loadu_ps (mask, (const simd_t *)values);
 9797 #else
 9798  m_8[0].load_mask (mask.lo(), values);
 9799  m_8[1].load_mask (mask.hi(), values+8);
 9800 #endif
 9801 }
9802 
9803 
// Masked store: write element i to values[i] only where mask[i] is true;
// other destination slots are left untouched.
 9804 OIIO_FORCEINLINE void vfloat16::store_mask (const vbool16 &mask, float *values) const {
 9805 #if OIIO_SIMD_AVX >= 512
 9806  _mm512_mask_storeu_ps (values, mask.bitmask(), m_simd);
 9807 #else
 9808  lo().store_mask (mask.lo(), values);
 9809  hi().store_mask (mask.hi(), values+8);
 9810 #endif
 9811 }
9812 
9813 
9814 
 9815 template <int scale>
 9816 OIIO_FORCEINLINE void
// Gather: element i is loaded from (char*)baseptr + vindex[i]*scale.
 9817 vfloat16::gather (const value_t *baseptr, const vint_t& vindex)
 9818 {
 9819 #if OIIO_SIMD_AVX >= 512
 9820  m_simd = _mm512_i32gather_ps (vindex, baseptr, scale);
 9821 #else
// Fallback: gather each vfloat8 half with the matching index half.
 9822  m_8[0].gather<scale> (baseptr, vindex.lo());
 9823  m_8[1].gather<scale> (baseptr, vindex.hi());
 9824 #endif
 9825 }
9826 
 9827 template<int scale>
 9828 OIIO_FORCEINLINE void
// Masked gather: load only the elements whose mask bit is set; unmasked
// elements keep their previous contents (m_simd is the pass-through src).
 9829 vfloat16::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
 9830 {
 9831 #if OIIO_SIMD_AVX >= 512
 9832  m_simd = _mm512_mask_i32gather_ps (m_simd, mask, vindex, baseptr, scale);
 9833 #else
 9834  m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
 9835  m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
 9836 #endif
 9837 }
9838 
 9839 template<int scale>
 9840 OIIO_FORCEINLINE void
// Scatter: element i is stored to (char*)baseptr + vindex[i]*scale.
 9841 vfloat16::scatter (value_t *baseptr, const vint_t& vindex) const
 9842 {
 9843 #if OIIO_SIMD_AVX >= 512
 9844  _mm512_i32scatter_ps (baseptr, vindex, m_simd, scale);
 9845 #else
 9846  lo().scatter<scale> (baseptr, vindex.lo());
 9847  hi().scatter<scale> (baseptr, vindex.hi());
 9848 #endif
 9849 }
9850 
9851 template<int scale>
9852 OIIO_FORCEINLINE void
9854  const vint_t& vindex) const
9855 {
9856 #if OIIO_SIMD_AVX >= 512
9857  _mm512_mask_i32scatter_ps (baseptr, mask, vindex, m_simd, scale);
9858 #else
9859  lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
9860  hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
9861 #endif
9862 }
9863 
9864 
9865 
9867 #if OIIO_SIMD_AVX >= 512
9868  return _mm512_add_ps (a.m_simd, b.m_simd);
9869 #else
9870  return vfloat16 (a.lo()+b.lo(), a.hi()+b.hi());
9871 #endif
9872 }
9873 
9875  return a = a + b;
9876 }
9877 
9879 #if OIIO_SIMD_AVX >= 512
9880  return _mm512_sub_ps (_mm512_setzero_ps(), a.simd());
9881 #else
9882  return vfloat16 (-a.lo(), -a.hi());
9883 #endif
9884 }
9885 
9887 #if OIIO_SIMD_AVX >= 512
9888  return _mm512_sub_ps (a.m_simd, b.m_simd);
9889 #else
9890  return vfloat16 (a.lo()-b.lo(), a.hi()-b.hi());
9891 #endif
9892 }
9893 
9895  return a = a - b;
9896 }
9897 
9898 
9900 #if OIIO_SIMD_AVX >= 512
9901  return _mm512_mul_ps (a.m_simd, _mm512_set1_ps(b));
9902 #else
9903  return vfloat16 (a.lo()*b, a.hi()*b);
9904 #endif
9905 }
9906 
9908  return b * a;
9909 }
9910 
9912 #if OIIO_SIMD_AVX >= 512
9913  return _mm512_mul_ps (a.m_simd, b.m_simd);
9914 #else
9915  return vfloat16 (a.lo()*b.lo(), a.hi()*b.hi());
9916 #endif
9917 }
9918 
9920  return a = a * b;
9921 }
9922 
9924 #if OIIO_SIMD_AVX >= 512
9925  return _mm512_div_ps (a.m_simd, b.m_simd);
9926 #else
9927  return vfloat16 (a.lo()/b.lo(), a.hi()/b.hi());
9928 #endif
9929 }
9930 
9932  return a = a / b;
9933 }
9934 
9935 
9937 #if OIIO_SIMD_AVX >= 512
9938  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_EQ_OQ);
9939 #else /* Fall back to 8-wide */
9940  return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
9941 #endif
9942 }
9943 
9944 
9946 #if OIIO_SIMD_AVX >= 512
9947  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_NEQ_OQ);
9948 #else /* Fall back to 8-wide */
9949  return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
9950 #endif
9951 }
9952 
9953 
9955 #if OIIO_SIMD_AVX >= 512
9956  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LT_OQ);
9957 #else /* Fall back to 8-wide */
9958  return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
9959 #endif
9960 }
9961 
9962 
9964 #if OIIO_SIMD_AVX >= 512
9965  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GT_OQ);
9966 #else /* Fall back to 8-wide */
9967  return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
9968 #endif
9969 }
9970 
9971 
9973 #if OIIO_SIMD_AVX >= 512
9974  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GE_OQ);
9975 #else /* Fall back to 8-wide */
9976  return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
9977 #endif
9978 }
9979 
9980 
9982 #if OIIO_SIMD_AVX >= 512
9983  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LE_OQ);
9984 #else /* Fall back to 8-wide */
9985  return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
9986 #endif
9987 }
9988 
9989 
9990 // Implementation had to be after the definition of vfloat16.
9992 {
9993 #if OIIO_SIMD_AVX >= 512
9994  m_simd = _mm512_cvttps_epi32(f);
9995 #else
9996  *this = vint16 (vint8(f.lo()), vint8(f.hi()));
9997 #endif
9998 }
9999 
10000 
10001 
10002 // Shuffle groups of 4
10003 template<int i0, int i1, int i2, int i3>
10005 #if OIIO_SIMD_AVX >= 512
10006  return _mm512_shuffle_f32x4(a,a,_MM_SHUFFLE(i3,i2,i1,i0));
10007 #else
10008  vfloat4 x[4];
10009  a.store ((float *)x);
10010  return vfloat16 (x[i0], x[i1], x[i2], x[i3]);
10011 #endif
10012 }
10013 
// Broadcast one 4-float group: replicate 128-bit group i of `a` into all
// four groups (single-index form of the 4-index shuffle4).
 10014 template<int i> vfloat16 shuffle4 (const vfloat16& a) {
 10015  return shuffle4<i,i,i,i> (a);
 10016 }
10017 
10018 template<int i0, int i1, int i2, int i3>
10020 #if OIIO_SIMD_AVX >= 512
10021  return _mm512_permute_ps(a,_MM_SHUFFLE(i3,i2,i1,i0));
10022 #else
10023  vfloat4 x[4];
10024  a.store ((float *)x);
10025  return vfloat16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
10026  shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
10027 #endif
10028 }
10029 
// Within each 4-float group, broadcast element i (single-index form of the
// 4-index within-group shuffle).
 10030 template<int i> vfloat16 shuffle (const vfloat16& a) {
 10031  return shuffle<i,i,i,i> (a);
 10032 }
10033 
10034 
10035 template<int i>
10037  return a[i];
10038 }
10039 
10040 
10041 template<int i>
10043  vfloat16 tmp = a;
10044  tmp[i] = val;
10045  return tmp;
10046 }
10047 
10048 
10050 #if OIIO_SIMD_AVX >= 512
10051  return _mm_cvtss_f32(_mm512_castps512_ps128(m_simd));
10052 #else
10053  return m_val[0];
10054 #endif
10055 }
10056 
// Named accessors for elements 1-3 (read directly from the scalar view;
// element 0 has a separate SIMD-aware implementation of x()).
 10057 OIIO_FORCEINLINE float vfloat16::y () const { return m_val[1]; }
 10058 OIIO_FORCEINLINE float vfloat16::z () const { return m_val[2]; }
 10059 OIIO_FORCEINLINE float vfloat16::w () const { return m_val[3]; }
10064 
10065 
10067 {
10068 #if OIIO_SIMD_AVX >= 512
10069  return _mm512_castps_si512 (x.simd());
10070 #else
10071  return vint16(bitcast_to_int(x.lo()), bitcast_to_int(x.hi()));
10072 #endif
10073 }
10074 
10076 {
10077 #if OIIO_SIMD_AVX >= 512
10078  return _mm512_castsi512_ps (x.simd());
10079 #else
10080  return vfloat16(bitcast_to_float(x.lo()), bitcast_to_float(x.hi()));
10081 #endif
10082 }
10083 
10084 
10086 #if OIIO_SIMD_AVX >= 512
10087  // Nomenclature: ABCD are the vint4's comprising v
10088  // First, add the vint4's and make them all the same
10089  vfloat16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed
10090  vfloat16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
10091  // Now, add within each vint4
10092  vfloat16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w); // each adjacent int is summed
10093  return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
10094 #else
10095  vfloat8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
10096  return vfloat16 (sum, sum);
10097 #endif
10098 }
10099 
10100 
10102 #if OIIO_SIMD_AVX >= 512
10103  return vreduce_add(v).x();
10104 #else
10105  return reduce_add(v.lo()) + reduce_add(v.hi());
10106 #endif
10107 }
10108 
10109 
10111 {
10112 #if OIIO_SIMD_AVX >= 512
10113  return _mm512_mask_blend_ps (mask, a, b);
10114 #else
10115  return vfloat16 (blend (a.lo(), b.lo(), mask.lo()),
10116  blend (a.hi(), b.hi(), mask.hi()));
10117 #endif
10118 }
10119 
10120 
10122 {
10123 #if OIIO_SIMD_AVX >= 512
10124  return _mm512_maskz_mov_ps (mask, a);
10125 #else
10126  return vfloat16 (blend0 (a.lo(), mask.lo()),
10127  blend0 (a.hi(), mask.hi()));
10128 #endif
10129 }
10130 
10131 
10133 {
10134 #if OIIO_SIMD_AVX >= 512
10135  return _mm512_maskz_mov_ps (!mask, a);
10136 #else
10137  return vfloat16 (blend0not (a.lo(), mask.lo()),
10138  blend0not (a.hi(), mask.hi()));
10139 #endif
10140 }
10141 
10142 
10144 {
10145  return blend (b, a, mask);
10146 }
10147 
10148 
10150 #if OIIO_SIMD_SSE
10151  return blend0not (a/b, b == vfloat16::Zero());
10152 #else
10153  SIMD_RETURN (vfloat16, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
10154 #endif
10155 }
10156 
10157 
10159 {
10160 #if OIIO_SIMD_AVX >= 512
10161  // Not available? return _mm512_abs_ps (a.simd());
10162  // Just clear the sign bit for cheap fabsf
10163  return _mm512_castsi512_ps (_mm512_and_epi32 (_mm512_castps_si512(a.simd()),
10164  _mm512_set1_epi32(0x7fffffff)));
10165 #else
10166  return vfloat16(abs(a.lo()), abs(a.hi()));
10167 #endif
10168 }
10169 
10170 
10172 {
10173  vfloat16 one(1.0f);
10174  return blend (one, -one, a < vfloat16::Zero());
10175 }
10176 
10177 
10179 {
10180 #if OIIO_SIMD_AVX >= 512
10181  return _mm512_ceil_ps (a);
10182 #else
10183  return vfloat16(ceil(a.lo()), ceil(a.hi()));
10184 #endif
10185 }
10186 
10188 {
10189 #if OIIO_SIMD_AVX >= 512
10190  return _mm512_floor_ps (a);
10191 #else
10192  return vfloat16(floor(a.lo()), floor(a.hi()));
10193 #endif
10194 }
10195 
10196 
10198 {
10199 #if OIIO_SIMD_AVX >= 512
10200  return _mm512_roundscale_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
10201 #else
10202  return vfloat16(round(a.lo()), round(a.hi()));
10203 #endif
10204 }
10205 
10207 {
10208 #if OIIO_SIMD_AVX >= 512
10209  return _mm512_cvt_roundps_epi32 (a, (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC));
10210 #else
10211  return vint16(floor(a));
10212 #endif
10213 }
10214 
10215 
10217 {
10218  return vint16(round(a));
10219 }
10220 
10221 
10223 {
10224 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
10225  return _mm512_rcp28_ps(a);
10226 #elif OIIO_SIMD_AVX >= 512
10227  vfloat16 r = _mm512_rcp14_ps(a);
10228  return r * nmadd (r, a, vfloat16(2.0f));
10229 #else
10230  return vfloat16(rcp_fast(a.lo()), rcp_fast(a.hi()));
10231 #endif
10232 }
10233 
10234 
10236 {
10237 #if OIIO_SIMD_AVX >= 512
10238  return _mm512_sqrt_ps (a);
10239 #else
10240  return vfloat16(sqrt(a.lo()), sqrt(a.hi()));
10241 #endif
10242 }
10243 
10244 
10246 {
10247 #if OIIO_SIMD_AVX >= 512
10248  return _mm512_div_ps (_mm512_set1_ps(1.0f), _mm512_sqrt_ps (a));
10249 #else
10250  return vfloat16(rsqrt(a.lo()), rsqrt(a.hi()));
10251 #endif
10252 }
10253 
10254 
10256 {
10257 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
10258  return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
10259 #elif OIIO_SIMD_AVX >= 512
10260  return _mm512_rsqrt14_ps (a);
10261 #else
10262  return vfloat16(rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
10263 #endif
10264 }
10265 
10266 
10268 {
10269 #if OIIO_SIMD_AVX >= 512
10270  return _mm512_min_ps (a, b);
10271 #else
10272  return vfloat16(min(a.lo(),b.lo()), min(a.hi(),b.hi()));
10273 #endif
10274 }
10275 
10277 {
10278 #if OIIO_SIMD_AVX >= 512
10279  return _mm512_max_ps (a, b);
10280 #else
10281  return vfloat16(max(a.lo(),b.lo()), max(a.hi(),b.hi()));
10282 #endif
10283 }
10284 
10285 
10287 #if OIIO_SIMD_AVX >= 512 && defined(__AVX512DQ__)
10288  return _mm512_andnot_ps (a, b);
10289 #else
10290  return vfloat16(andnot(a.lo(),b.lo()), andnot(a.hi(),b.hi()));
10291 #endif
10292 }
10293 
10294 
10296  const simd::vfloat16& c)
10297 {
10298 #if OIIO_SIMD_AVX >= 512
10299  return _mm512_fmadd_ps (a, b, c);
10300 #else
10301  return vfloat16 (madd(a.lo(), b.lo(), c.lo()),
10302  madd(a.hi(), b.hi(), c.hi()));
10303 #endif
10304 }
10305 
10306 
10308  const simd::vfloat16& c)
10309 {
10310 #if OIIO_SIMD_AVX >= 512
10311  return _mm512_fmsub_ps (a, b, c);
10312 #else
10313  return vfloat16 (msub(a.lo(), b.lo(), c.lo()),
10314  msub(a.hi(), b.hi(), c.hi()));
10315 #endif
10316 }
10317 
10318 
10319 
10321  const simd::vfloat16& c)
10322 {
10323 #if OIIO_SIMD_AVX >= 512
10324  return _mm512_fnmadd_ps (a, b, c);
10325 #else
10326  return vfloat16 (nmadd(a.lo(), b.lo(), c.lo()),
10327  nmadd(a.hi(), b.hi(), c.hi()));
10328 #endif
10329 }
10330 
10331 
10332 
10334  const simd::vfloat16& c)
10335 {
10336 #if OIIO_SIMD_AVX >= 512
10337  return _mm512_fnmsub_ps (a, b, c);
10338 #else
10339  return vfloat16 (nmsub(a.lo(), b.lo(), c.lo()),
10340  nmsub(a.hi(), b.hi(), c.hi()));
10341 #endif
10342 }
10343 
10344 
10345 
10346 
10347 } // end namespace simd
10348 
10350 
10351 
10352 /// Custom fmtlib formatters for our SIMD types.
10353 
// fmt::formatter specializations so OIIO SIMD types can be used directly
// as arguments to fmt::format / OIIO string formatting. Vector types use
// the element-indexing formatter; matrix44 formats its 16 floats.
 10354 namespace fmt {
 10355 template<> struct formatter<OIIO::simd::vfloat3>
 10356  : OIIO::pvt::index_formatter<OIIO::simd::vfloat3> {};
 10357 template<> struct formatter<OIIO::simd::vfloat4>
 10358  : OIIO::pvt::index_formatter<OIIO::simd::vfloat4> {};
 10359 template<> struct formatter<OIIO::simd::vfloat8>
 10360  : OIIO::pvt::index_formatter<OIIO::simd::vfloat8> {};
 10361 template<> struct formatter<OIIO::simd::vfloat16>
 10362  : OIIO::pvt::index_formatter<OIIO::simd::vfloat16> {};
 10363 template<> struct formatter<OIIO::simd::vint4>
 10364  : OIIO::pvt::index_formatter<OIIO::simd::vint4> {};
 10365 template<> struct formatter<OIIO::simd::vint8>
 10366  : OIIO::pvt::index_formatter<OIIO::simd::vint8> {};
 10367 template<> struct formatter<OIIO::simd::vint16>
 10368  : OIIO::pvt::index_formatter<OIIO::simd::vint16> {};
 10369 template<> struct formatter<OIIO::simd::matrix44>
 10370  : OIIO::pvt::array_formatter<OIIO::simd::matrix44, float, 16> {};
 10371 } // namespace fmt
10372 
10373 #undef SIMD_DO
10374 #undef SIMD_CONSTRUCT
10375 #undef SIMD_CONSTRUCT_PAD
10376 #undef SIMD_RETURN
10377 #undef SIMD_RETURN_REDUCE
friend const vfloat8 & operator/=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9072
friend vfloat8 operator+(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9008
static const char * type_name()
Definition: simd.h:524
static const char * type_name()
Definition: simd.h:2545
static const vbool4 True()
Return a vbool4 the is 'true' for all values.
Definition: simd.h:3367
simd_t simd() const
Definition: simd.h:2913
vint16()
Default constructor (contents undefined)
Definition: simd.h:1566
void set_x(value_t val)
Definition: simd.h:7423
type
Definition: core.h:556
OIIO_FORCEINLINE matrix44(M44fParam M)
Construct from an OIIO::M44fParam (including an Imath::M44f)
Definition: simd.h:2445
friend vint4 operator|(const vint4 &a, const vint4 &b)
Definition: simd.h:4623
simd_t & simd()
Definition: simd.h:1014
void set_x(value_t val)
Definition: simd.h:4867
vfloat16(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:2885
friend const vint8 & operator%=(vint8 &a, const vint8 &b)
Definition: simd.h:5461
static const vint4 NegOne()
Return an vint4 with all components set to -1 (aka 0xffffffff)
Definition: simd.h:4475
friend vbool8 operator!(const vbool8 &a)
Logical/bitwise operators, component-by-component.
Definition: simd.h:3786
friend const vint4 & operator%=(vint4 &a, const vint4 &b)
Definition: simd.h:4596
int operator[](int i) const
Component access (get)
Definition: simd.h:3927
friend const vfloat16 & operator*=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9919
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:5012
friend vfloat3 operator*(const vfloat3 &a, const vfloat3 &b)
Definition: simd.h:8172
simd_t & simd()
Definition: simd.h:859
static const char * type_name()
Definition: simd.h:2253
void set_y(value_t val)
Definition: simd.h:10061
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
vfloat4(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:1881
static vbool4 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool4.
Definition: simd.h:3344
bool none(const vbool4 &v)
Definition: simd.h:3601
void clear()
Set all components to 0.0.
Definition: simd.h:9617
vbool4(bool a)
Construct from a single value (store it in all slots)
Definition: simd.h:536
friend const vbool16 & operator|=(vbool16 &a, const vbool16 &b)
Definition: simd.h:4112
vfloat3 operator-() const
Definition: simd.h:8160
friend vbool8 operator!=(const vint8 &a, const vint8 &b)
Definition: simd.h:5562
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
const vfloat3 & operator-=(const vfloat3 &a)
Definition: simd.h:8168
void store(float *values) const
Definition: simd.h:9748
static const vfloat8 One()
Return a vfloat8 with all components set to 1.0.
Definition: simd.h:8709
friend vfloat4 operator*(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7167
simd_t m_simd
Definition: simd.h:1771
vint16 shuffle4(const vint16 &a)
Shuffle groups of 4.
Definition: simd.h:6455
friend vbool16 operator<=(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9981
OIIO_FORCEINLINE const vint4 & operator/=(vint4 &a, const vint4 &b)
Definition: simd.h:4587
friend vint8 operator~(const vint8 &a)
Definition: simd.h:5502
void store_mask(int mask, value_t *values) const
Definition: simd.h:4384
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vfloat16 operator-(const vfloat16 &a)
Definition: simd.h:9878
value_t z() const
Definition: simd.h:5701
vfloat8(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:2564
void set_x(value_t val)
Definition: simd.h:9187
OIIO_FORCEINLINE vbool4 shuffle(const vbool4 &a)
Definition: simd.h:3525
friend vfloat3 operator+(const vfloat3 &a, const vfloat3 &b)
Definition: simd.h:8152
friend vbool16 operator!=(const vint16 &a, const vint16 &b)
Definition: simd.h:6356
friend const vbool8 & operator|=(vbool8 &a, const vbool8 &b)
Definition: simd.h:3823
value_t x() const
Definition: simd.h:10049
static const char * name()
Definition: simd.h:402
SYS_API float expf(float x)
Matrix44< float > M44f
4x4 matrix of float
Definition: ImathMatrix.h:1137
simd_t simd() const
Definition: simd.h:2587
value_t m_val[elements]
Definition: simd.h:1467
friend vfloat16 operator%(const vfloat16 &a, const vfloat16 &b)
#define OIIO_FORCEINLINE
Definition: platform.h:403
friend std::ostream & operator<<(std::ostream &cout, const vfloat4 &val)
Stream output.
Definition: simd.h:7316
friend vint8 operator/(const vint8 &a, const vint8 &b)
Definition: simd.h:5448
vfloat4 bitcast_to_float(const vint4 &x)
Definition: simd.h:7440
vfloat4 m_4[2]
Definition: simd.h:2756
static const vint4 Giota()
Return an vint4 with "geometric" iota: (1, 2, 4, 8).
Definition: simd.h:4497
value_t y() const
Definition: simd.h:6510
OIIO_FORCEINLINE const vint4 & operator>>=(vint4 &a, const unsigned int bits)
Definition: simd.h:4685
friend const vint8 & operator>>=(vint8 &a, unsigned int bits)
Definition: simd.h:5536
void load_mask(const vbool_t &mask, const value_t *values)
Definition: simd.h:6039
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
Definition: simd.h:1729
friend vint4 operator&(const vint4 &a, const vint4 &b)
Definition: simd.h:4608
int operator[](int i) const
Component access (get)
Definition: simd.h:4182
void set_w(value_t val)
Definition: simd.h:4870
friend vbool8 operator>(const vint8 &a, const vint8 &b)
Definition: simd.h:5568
void set_x(value_t val)
Definition: simd.h:6513
static const vfloat3 Zero()
Return a vfloat3 with all components set to 0.0.
Definition: simd.h:8098
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7292
Vec4< float > V4f
Vec4 of float.
Definition: ImathVec.h:864
static const char * type_name()
Definition: simd.h:815
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
Definition: half.h:266
void store_mask(int mask, value_t *values) const
Definition: simd.h:8939
vint4 srl(const vint4 &val, const unsigned int bits)
Definition: simd.h:4690
OIIO_FORCEINLINE vint4 operator%(const vint4 &a, const vint4 &b)
Definition: simd.h:4589
void store(V &vec) const
Store into a generic subscripted or xyz 3-vector, including Imath::V3f.
Definition: simd.h:2370
vint4 bitcast_to_int4(const vfloat4 &x)
Definition: simd.h:7453
friend const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3434
friend const vint4 & operator>>=(vint4 &a, unsigned int bits)
Definition: simd.h:4685
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
friend vint8 operator|(const vint8 &a, const vint8 &b)
Definition: simd.h:5481
vfloat4 vfloat_t
SIMD int type.
Definition: simd.h:1868
friend vfloat16 operator*(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9911
int operator[](int i) const
Component access (get)
Definition: simd.h:5077
friend const vfloat8 & operator+=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9016
int bitmask() const
Extract the bitmask.
Definition: simd.h:3698
vfloat16 vfloat_t
float type of the same length
Definition: simd.h:1557
vfloat3(const float *f)
Construct from a pointer to 3 values.
Definition: simd.h:2268
value_t x() const
Definition: simd.h:9183
const GLdouble * v
Definition: glcorearb.h:837
friend vfloat4 operator/(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7199
vfloat16(const unsigned short *vals)
Construct from a pointer to unsigned short values.
Definition: simd.h:2921
int value_t
Underlying equivalent scalar value type.
Definition: simd.h:1252
OIIO_FORCEINLINE const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3442
GLuint start
Definition: glcorearb.h:475
void clear()
Sset all components to 0.
Definition: simd.h:4454
GLsizei const GLfloat * value
Definition: glcorearb.h:824
vint8()
Default constructor (contents undefined)
Definition: simd.h:1267
friend vbool16 operator==(const vbool16 &a, const vbool16 &b)
Comparison operators, component by component.
Definition: simd.h:4126
friend vint4 operator/(const vint4 &a, const vint4 &b)
Definition: simd.h:4581
vfloat8(const short *vals)
Construct from a pointer to short values.
Definition: simd.h:2598
vfloat3 transformv(const vfloat3 &V) const
Transform 3-vector V by 4x4 matrix M.
Definition: simd.h:8393
void set_w(value_t val)
Definition: simd.h:5706
const vfloat4 & operator[](int i) const
Return one row.
Definition: simd.h:8348
int bitmask() const
Extract the bitmask.
Definition: simd.h:3325
void setcomp(int i, bool value)
Component access (set).
Definition: simd.h:3936
void clear()
Set all components to false.
Definition: simd.h:4032
value_t m_val[elements]
Definition: simd.h:1167
static vbool16 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool16.
Definition: simd.h:864
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:8729
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7694
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:848
static const vfloat16 One()
Return a vfloat16 with all components set to 1.0.
Definition: simd.h:9605
GLboolean GLboolean g
Definition: glcorearb.h:1222
vfloat4 m_row[rows]
Definition: simd.h:2521
simd_t m_simd
Definition: simd.h:2118
static const char * name()
Definition: simd.h:405
vfloat8()
Default constructor (contents undefined)
Definition: simd.h:2561
vbool8 vbool_t
bool type of the same length
Definition: simd.h:1257
OIIO_FORCEINLINE vbool4 operator!(const vbool4 &a)
Definition: simd.h:3393
const vfloat4 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:1945
const vfloat4 & operator/=(const vfloat4 &a)
Definition: simd.h:7209
friend vint16 operator+(const vint16 &a, const vint16 &b)
Definition: simd.h:6195
friend const vint8 & operator*=(vint8 &a, const vint8 &b)
Definition: simd.h:5444
value_t z() const
Definition: simd.h:9185
friend const vint4 & operator*=(vint4 &a, const vint4 &b)
Definition: simd.h:4577
friend vint8 operator<<(const vint8 &a, unsigned int bits)
Definition: simd.h:5511
static const char * name()
Definition: simd.h:403
vfloat8(const float *f)
Construct from a pointer to 8 values.
Definition: simd.h:2571
vfloat4(const vfloat4 &other)
Copy construct from another vfloat4.
Definition: simd.h:1890
friend vbool8 operator!=(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9084
friend vbool8 operator>=(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9108
vfloat4(const V &v)
Definition: simd.h:1908
void clear()
Set all components to false.
Definition: simd.h:3350
GLint GLint i2
Definition: glad.h:2724
friend const vint16 & operator&=(vint16 &a, const vint16 &b)
Definition: simd.h:6276
simd_t simd() const
Definition: simd.h:1307
vfloat4 rsqrt_fast(const vfloat4 &a)
Fast, approximate 1/sqrt.
Definition: simd.h:7718
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
GLdouble s
Definition: glad.h:3009
vint16(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1603
friend const vbool16 & operator&=(vbool16 &a, const vbool16 &b)
Definition: simd.h:4108
static const vfloat4 Zero()
Return a vfloat4 with all components set to 0.0.
Definition: simd.h:6704
vbool4(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:555
float operator[](int i) const
Component access (get)
Definition: simd.h:6735
vfloat16 min(const vfloat16 &a, const vfloat16 &b)
Per-element min.
Definition: simd.h:10267
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:1863
vfloat3 transformvT(const vfloat3 &V) const
Transform 3-vector V by the transpose of 4x4 matrix M.
Definition: simd.h:8407
void set_z(value_t val)
Definition: simd.h:5705
int operator[](int i) const
Component access (get)
Definition: simd.h:3609
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:8107
simd_t simd() const
Definition: simd.h:1912
#define SIMD_CONSTRUCT_PAD(x)
Definition: simd.h:494
friend const vint16 & operator<<=(vint16 &a, unsigned int bits)
Definition: simd.h:6319
value_t z() const
Definition: simd.h:10058
OIIO_FORCEINLINE const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3438
friend vint16 operator*(const vint16 &a, const vint16 &b)
Definition: simd.h:6232
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 max(const vfloat16 &a, const vfloat16 &b)
Per-element max.
Definition: simd.h:10276
vfloat3()
Default constructor (contents undefined)
Definition: simd.h:2259
bool value_t
Underlying equivalent scalar value type.
Definition: simd.h:525
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Definition: simd.h:1256
static const vint8 Zero()
Return an vint8 with all components set to 0.
Definition: simd.h:5344
vbool8 vbool_t
SIMD bool type.
Definition: simd.h:2553
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
GLint y
Definition: glcorearb.h:103
vint16 vint_t
SIMD int type.
Definition: simd.h:2873
vfloat4(float a, float b, float c, float d=0.0f)
Construct from 3 or 4 values.
Definition: simd.h:1884
simd_t m_simd
Definition: simd.h:2754
#define OIIO_SIMD_UINT4_CONST(name, val)
Definition: simd.h:438
static const vint8 NegOne()
Return an vint8 with all components set to -1 (aka 0xffffffff)
Definition: simd.h:5354
value_t * data()
Definition: simd.h:1018
static const char * type_name()
Definition: simd.h:2866
bool reduce_or(const vbool4 &v)
Definition: simd.h:3586
friend vfloat8 operator-(const vfloat8 &a)
Definition: simd.h:9020
friend vint16 operator/(const vint16 &a, const vint16 &b)
Definition: simd.h:6245
**But if you need a result
Definition: thread.h:622
static const vint16 Giota()
Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
Definition: simd.h:6148
vfloat4()
Default constructor (contents undefined)
Definition: simd.h:1878
value_t * data()
Definition: simd.h:2592
void load_mask(int mask, const value_t *values)
Definition: simd.h:7009
GLfloat GLfloat GLfloat v2
Definition: glcorearb.h:818
value_t x() const
Definition: simd.h:6502
friend const vint16 & operator^=(vint16 &a, const vint16 &b)
Definition: simd.h:6296
const vfloat8 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:2612
uint16_t m_bits
Definition: simd.h:931
Integer 8-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1249
void clear()
Set all components to 0.0.
Definition: simd.h:6721
vfloat4(const char *vals)
Construct from a pointer to 4 char values.
Definition: simd.h:1937
simd_t simd() const
Definition: simd.h:1607
static const char * name()
Definition: simd.h:410
friend const vint4 & operator<<=(vint4 &a, unsigned int bits)
Definition: simd.h:4670
GLfloat GLfloat GLfloat GLfloat v3
Definition: glcorearb.h:819
static constexpr size_t size() noexcept
Definition: simd.h:2256
OIIO_FORCEINLINE vbool4 insert(const vbool4 &a, bool val)
Helper: substitute val for a[i].
Definition: simd.h:3556
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Definition: simd.h:1555
vfloat3 hdiv(const vfloat4 &a)
Homogeneous divide to turn a vfloat4 into a vfloat3.
Definition: simd.h:7591
vfloat3 transformv(const matrix44 &M, const vfloat3 &V)
Transform 3-vector V by 4x4 matrix M.
Definition: simd.h:8613
const vbool8 & operator=(bool a)
Assign one value to all components.
Definition: simd.h:3688
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
vbool8 lo() const
Extract the lower precision vbool8.
Definition: simd.h:4058
void load_bitmask(int a)
Helper: load all components from a bitmask in an int.
Definition: simd.h:3958
friend vbool4 operator!=(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3467
vint8(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1303
void set_y(value_t val)
Definition: simd.h:5704
void clear()
Sset all components to 0.
Definition: simd.h:5335
value_t y() const
Definition: simd.h:5700
friend vbool4 operator<(const vint4 &a, const vint4 &b)
Definition: simd.h:4728
vbool8(bool a)
Construct from a single value (store it in all slots)
Definition: simd.h:678
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1611
friend const vint4 & operator^=(vint4 &a, const vint4 &b)
Definition: simd.h:4647
OIIO_FORCEINLINE vbool4 operator>=(const vint4 &a, const vint4 &b)
Definition: simd.h:4738
int operator[](int i) const
Component access (get)
Definition: simd.h:5902
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vfloat16 operator/(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9923
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
OIIO_FORCEINLINE vbool4 operator~(const vbool4 &a)
Definition: simd.h:3446
float dot3(const vfloat4 &a, const vfloat4 &b)
Return the float 3-component dot (inner) product of a and b.
Definition: simd.h:7524
value_t m_val[paddedelements]
Definition: simd.h:2755
OutGridT const XformOp bool bool
void load_mask(int mask, const value_t *values)
Definition: simd.h:5245
friend const vfloat16 & operator+=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9874
vint8 vint_t
SIMD int type.
Definition: simd.h:2552
friend vbool4 operator|(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3413
friend const vint16 & operator%=(vint16 &a, const vint16 &b)
Definition: simd.h:6258
const matrix44 & operator=(const matrix44 &m)
Assignment.
Definition: simd.h:8353
simd_bool_t< 8 >::type simd_t
the native SIMD type used
Definition: simd.h:671
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:2421
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
static const char * type_name()
Definition: simd.h:1550
Template giving a printable name for each type.
Definition: simd.h:401
vint4 abs(const vint4 &a)
Definition: simd.h:4989
vfloat4 safe_div(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7579
void store(int *values) const
Store the values into memory.
Definition: simd.h:5230
friend vbool8 operator<=(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9116
friend std::ostream & operator<<(std::ostream &cout, const vfloat8 &val)
Stream output.
Definition: simd.h:8654
vbool16(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:854
vfloat8 lo() const
Extract the lower precision vfloat8.
Definition: simd.h:9540
value_t w() const
Definition: simd.h:4866
simd_t & simd()
Definition: simd.h:1308
vbool4(const vbool4 &other)
Copy construct from another vbool4.
Definition: simd.h:544
value_t m_vals[rows][cols]
Definition: simd.h:2522
static const vbool4 False()
Return a vbool4 the is 'false' for all values.
Definition: simd.h:3359
value_t y() const
Definition: simd.h:4864
vbool4 lo() const
Extract the lower precision vbool4.
Definition: simd.h:3757
static const vbool16 False()
Return a vbool16 the is 'false' for all values.
Definition: simd.h:4036
void setcomp(int i, bool value)
Component access (set).
Definition: simd.h:3618
value_t w() const
Definition: simd.h:6512
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vint4 operator~(const vint4 &a)
Definition: simd.h:4650
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
Definition: simd.h:3041
friend vbool4 operator==(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7231
simd_t & simd()
Definition: simd.h:2588
OIIO_FORCEINLINE bool extract(const vbool4 &a)
Definition: simd.h:3542
vfloat3 normalized() const
Return a normalized version of the vector.
Definition: simd.h:8311
vfloat4 floor(const vfloat4 &a)
Definition: simd.h:7638
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
Definition: simd.h:1718
OIIO_FORCEINLINE matrix44(const float *f)
Construct from a float array.
Definition: simd.h:2437
matrix44 inverse() const
Return the inverse of the matrix.
Definition: simd.h:8503
static const vint4 One()
Return an vint4 with all components set to 1.
Definition: simd.h:4473
vint4 blend(const vint4 &a, const vint4 &b, const vbool4 &mask)
Definition: simd.h:4949
int value_t
Underlying equivalent scalar value type.
Definition: simd.h:961
vbool16(const vbool16 &other)
Copy construct from another vbool16.
Definition: simd.h:838
vfloat8(const vfloat8 &other)
Copy construct from another vfloat8.
Definition: simd.h:2574
vint4 blend0not(const vint4 &a, const vbool4 &mask)
Definition: simd.h:4974
vint4 vint_t
SIMD int type.
Definition: simd.h:1869
GA_API const UT_StringHolder scale
friend const vbool16 & operator^=(vbool16 &a, const vbool16 &b)
Definition: simd.h:4116
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
value_t w() const
Definition: simd.h:7422
vbool4 hi() const
Extract the higher precision vbool4.
Definition: simd.h:3765
value_t * data()
Definition: simd.h:2918
OIIO_FORCEINLINE vbool4 operator>(const vint4 &a, const vint4 &b)
Definition: simd.h:4718
friend vbool16 operator!=(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9945
vfloat16(const float *f)
Construct from a pointer to 16 values.
Definition: simd.h:2894
simd_t & simd()
Definition: simd.h:1608
GLdouble n
Definition: glcorearb.h:2008
vbool8(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:698
bool operator!=(const matrix44 &m) const
Definition: simd.h:8472
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
Definition: simd.h:3393
vfloat8 hi() const
Extract the higher precision vfloat8.
Definition: simd.h:9548
OIIO_FORCEINLINE vbool4 operator<=(const vint4 &a, const vint4 &b)
Definition: simd.h:4742
friend const vint8 & operator+=(vint8 &a, const vint8 &b)
Definition: simd.h:5407
GLfloat f
Definition: glcorearb.h:1926
friend vfloat8 operator/(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9064
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:2591
value_t x() const
Definition: simd.h:7419
vfloat4 vdot3(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7516
friend const vint8 & operator-=(vint8 &a, const vint8 &b)
Definition: simd.h:5430
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:2546
vint8(const vint8 &other)
Copy construct from another vint8.
Definition: simd.h:1294
void set_y(value_t val)
Definition: simd.h:4868
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Definition: simd.h:965
Integer 4-vector, accelerated by SIMD instructions when available.
Definition: simd.h:958
#define SIMD_RETURN_REDUCE(T, init, op)
Definition: simd.h:497
friend vbool8 operator<(const vint8 &a, const vint8 &b)
Definition: simd.h:5580
vint8 vint_t
int type of the same length
Definition: simd.h:1259
OIIO_FORCEINLINE vbool4 operator==(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3457
friend vfloat3 operator/(const vfloat3 &a, const vfloat3 &b)
Definition: simd.h:8192
OIIO_DEPRECATED("use bitcast_to_int() (1.8)") inline vint4 bitcast_to_int4(const vbool4 &x)
Definition: simd.h:4883
vfloat4 lo() const
Extract the lower precision vfloat4.
Definition: simd.h:8662
vint16(const vint16 &other)
Copy construct from another vint16.
Definition: simd.h:1591
simd_raw_t< float, 8 >::type simd_t
the native SIMD type used
Definition: simd.h:2550
int value_t
Underlying equivalent scalar value type.
Definition: simd.h:1551
float length() const
Length of the vector.
Definition: simd.h:8305
bool any(const vbool4 &v)
Definition: simd.h:3600
const vfloat4 & operator-=(const vfloat4 &a)
Definition: simd.h:7142
void load(bool a)
Helper: load a single value into all components.
Definition: simd.h:3637
bool reduce_and(const vbool4 &v)
Logical reduction across all components.
Definition: simd.h:3573
vint16 vint_t
int type of the same length
Definition: simd.h:1558
vfloat8(const unsigned short *vals)
Construct from a pointer to unsigned short values.
Definition: simd.h:2595
friend vbool16 operator<=(const vint16 &a, const vint16 &b)
Definition: simd.h:6392
vfloat3 transformp(const matrix44 &M, const vfloat3 &V)
Transform 3-point V by 4x4 matrix M.
Definition: simd.h:8609
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend const vint4 & operator+=(vint4 &a, const vint4 &b)
Definition: simd.h:4512
vfloat8 vfloat_t
float type of the same length
Definition: simd.h:1258
OIIO_FORCEINLINE const vint4 & operator+=(vint4 &a, const vint4 &b)
Definition: simd.h:4512
void transpose(vint4 &a, vint4 &b, vint4 &c, vint4 &d)
Definition: simd.h:8008
vint4 select(const vbool4 &mask, const vint4 &a, const vint4 &b)
Definition: simd.h:4983
void set_w(value_t val)
Definition: simd.h:10063
void load(bool a)
Helper: load a single value into all components.
Definition: simd.h:3285
vfloat4(const short *vals)
Construct from a pointer to 4 short values.
Definition: simd.h:1931
static vbool8 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool8.
Definition: simd.h:3708
value_t y() const
Definition: simd.h:7420
vint4 rotl(const vint4 &x, const int s)
Circular bit rotate by s bits, for N values at once.
Definition: simd.h:5023
vint4 m_4[2]
Definition: simd.h:1468
vbool16(int bitmask)
Definition: simd.h:829
void load_pairs(const float *lo, const float *hi)
Definition: simd.h:6911
friend vbool16 operator>=(const vint16 &a, const vint16 &b)
Definition: simd.h:6383
value_t z() const
Definition: simd.h:6511
static constexpr size_t size() noexcept
Definition: simd.h:821
const vfloat4 & operator*=(const vfloat4 &a)
Definition: simd.h:7177
#define OIIO_DASSERT
Definition: dassert.h:55
vbool4 m_4[2]
Definition: simd.h:777
vfloat4 msub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7791
vbool16 vbool_t
bool type of the same length
Definition: simd.h:1556
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7771
friend vint8 operator&(const vint8 &a, const vint8 &b)
Definition: simd.h:5471
friend vbool16 operator^(const vbool16 &a, const vbool16 &b)
Definition: simd.h:4099
friend vbool8 operator<(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9092
value_t z() const
Definition: simd.h:7421
void store_mask(int mask, value_t *values) const
Definition: simd.h:1709
float operator[](int i) const
Component access (get)
Definition: simd.h:9526
simd_t m_simd
Definition: simd.h:1166
vfloat3 normalized_fast() const
Return a fast, approximate normalized version of the vector.
Definition: simd.h:8323
void load_mask(int mask, const value_t *values)
Definition: simd.h:3027
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1916
friend vbool4 operator<(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7252
static const char * type_name()
Definition: simd.h:1251
void setcomp(int i, int value)
Component access (set).
Definition: simd.h:4192
OIIO_FORCEINLINE T exp(const T &v)
Definition: simd.h:7850
void load_mask(int mask, const value_t *values)
Definition: simd.h:1704
static const vfloat3 One()
Return a vfloat3 with all components set to 1.0.
Definition: simd.h:8100
vfloat16(const vfloat16 &other)
Copy construct from another vfloat16.
Definition: simd.h:2897
void load(int a)
Helper: load a single int into all components.
Definition: simd.h:5093
bool set_denorms_zero_mode(bool on)
Definition: simd.h:3202
static const char * name()
Definition: simd.h:401
vbool16()
Default constructor (contents undefined)
Definition: simd.h:824
simd_t simd() const
Definition: simd.h:559
simd_bool_t< 16 >::type simd_t
the native SIMD type used
Definition: simd.h:820
friend vint16 operator<<(const vint16 &a, unsigned int bits)
Definition: simd.h:6308
void store_mask(int mask, value_t *values) const
Definition: simd.h:3032
vbool4(bool a, bool b, bool c, bool d)
Construct from 4 bool values.
Definition: simd.h:541
OIIO_FORCEINLINE std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
Definition: simd.h:3277
static const vfloat16 Zero()
Return a vfloat16 with all components set to 0.0.
Definition: simd.h:9597
vfloat4 operator-() const
Definition: simd.h:7122
simd_t & simd()
Definition: simd.h:560
value_t m_val[paddedelements]
Definition: simd.h:3087
vfloat4(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
Definition: simd.h:1928
friend vint4 operator%(const vint4 &a, const vint4 &b)
Definition: simd.h:4589
GLint GLuint mask
Definition: glcorearb.h:124
friend vbool8 operator!=(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3852
void set_x(value_t val)
Definition: simd.h:5703
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 vfloat_t
SIMD int type.
Definition: simd.h:2872
friend const vfloat16 & operator/=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9931
value_t y() const
Definition: simd.h:9184
static const vbool16 True()
Return a vbool16 the is 'true' for all values.
Definition: simd.h:4041
const vbool16 & operator=(bool a)
Assign one value to all components.
Definition: simd.h:4012
vfloat4 nmsub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7829
void set_y(value_t val)
Definition: simd.h:7424
vbool4()
Default constructor (contents undefined)
Definition: simd.h:533
static const vbool8 True()
Return a vbool8 the is 'true' for all values.
Definition: simd.h:3731
static const char * name()
Definition: simd.h:404
friend vbool8 operator==(const vbool8 &a, const vbool8 &b)
Comparison operators, component by component.
Definition: simd.h:3842
vbool8(const vbool8 &other)
Copy construct from another vbool8.
Definition: simd.h:686
friend vint4 operator^(const vint4 &a, const vint4 &b)
Definition: simd.h:4636
OIIO_FORCEINLINE vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7300
simd_t simd() const
Definition: simd.h:702
vint4 blend0(const vint4 &a, const vbool4 &mask)
Definition: simd.h:4963
vfloat3(const short *vals)
Construct from a pointer to 4 short values.
Definition: simd.h:2306
void store(float *values) const
Definition: simd.h:6923
friend vint8 operator*(const vint8 &a, const vint8 &b)
Definition: simd.h:5435
friend vfloat16 operator+(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9866
vfloat4(const float *f)
Construct from a pointer to 4 values.
Definition: simd.h:1887
const vint4 & operator=(int a)
Assign one value to all components.
Definition: simd.h:4345
GLint i1
Definition: glad.h:2724
static constexpr size_t size() noexcept
Definition: simd.h:672
void set_y(value_t val)
Definition: simd.h:9188
void store(bool *values) const
Helper: store the values into memory as bools.
Definition: simd.h:4046
static const vint4 Zero()
Return an vint4 with all components set to 0.
Definition: simd.h:4464
simd_t m_simd
Definition: simd.h:1466
friend vbool16 operator<(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9954
friend const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3442
vfloat3(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:2262
friend vint8 operator-(const vint8 &a)
Definition: simd.h:5412
static const char * name()
Definition: simd.h:406
friend const vint16 & operator+=(vint16 &a, const vint16 &b)
Definition: simd.h:6204
bool set_flush_zero_mode(bool on)
Definition: simd.h:3192
friend vint8 operator^(const vint8 &a, const vint8 &b)
Definition: simd.h:5491
void clear()
Set all components to 0.0.
Definition: simd.h:8719
SYS_API float logf(float x)
float length2() const
Square of the length of the vector.
Definition: simd.h:8299
friend vbool4 operator!=(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7241
void set_w(value_t val)
Definition: simd.h:7426
friend vbool16 operator<(const vint16 &a, const vint16 &b)
Definition: simd.h:6374
vbool4(int a, int b, int c, int d)
Construct from 4 int values.
Definition: simd.h:547
vfloat3(float a, float b, float c)
Construct from 3 values.
Definition: simd.h:2265
signed char int8
Definition: SYS_Types.h:35
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
void store(bool *values) const
Helper: store the values into memory as bools.
Definition: simd.h:3381
friend std::ostream & operator<<(std::ostream &cout, const matrix44 &M)
Stream output.
Definition: simd.h:8599
matrix44 transposed() const
Return the transposed matrix.
Definition: simd.h:8363
OIIO_FORCEINLINE vint4 operator>>(const vint4 &a, const unsigned int bits)
Definition: simd.h:4675
vfloat3(const V &v)
Definition: simd.h:2274
vfloat4 round(const vfloat4 &a)
Definition: simd.h:7647
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
Definition: simd.h:3393
static const vfloat4 One()
Return a vfloat4 with all components set to 1.0.
Definition: simd.h:6712
GLint GLenum GLint x
Definition: glcorearb.h:409
void store_mask(int mask, value_t *values) const
Definition: simd.h:7031
#define SIMD_RETURN(T, x)
Definition: simd.h:496
friend vbool16 operator>(const vint16 &a, const vint16 &b)
Definition: simd.h:6365
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1017
const vint16 & operator=(int a)
Assign one value to all components.
Definition: simd.h:6036
vfloat8(const char *vals)
Construct from a pointer to char values.
Definition: simd.h:2604
vint8 m_8[2]
Definition: simd.h:1773
bool get_denorms_zero_mode()
Definition: simd.h:3219
friend vbool8 operator~(const vbool8 &a)
Definition: simd.h:3832
friend vbool4 operator!=(const vint4 &a, const vint4 &b)
Definition: simd.h:4713
vfloat3 transformvT(const matrix44 &M, const vfloat3 &V)
Definition: simd.h:8617
const vfloat3 & operator/=(const vfloat3 &a)
Definition: simd.h:8196
value_t * data()
Definition: simd.h:1917
friend vbool4 operator>(const vint4 &a, const vint4 &b)
Definition: simd.h:4718
friend const vint4 & operator-=(vint4 &a, const vint4 &b)
Definition: simd.h:4539
bool get_flush_zero_mode()
Definition: simd.h:3211
void set_x(value_t val)
Definition: simd.h:10060
static constexpr size_t size() noexcept
Definition: simd.h:530
float operator[](int i) const
Component access (get)
Definition: simd.h:8648
GLdouble t
Definition: glad.h:2397
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:9626
static const vfloat8 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:8713
void set_w(value_t val)
Definition: simd.h:6516
value_t w() const
Definition: simd.h:10059
friend const vint4 & operator&=(vint4 &a, const vint4 &b)
Definition: simd.h:4619
value_t w() const
Definition: simd.h:9186
void set_y(value_t val)
Definition: simd.h:6514
simd_raw_t< float, 4 >::type simd_t
the native SIMD type used
Definition: simd.h:1867
vbool8()
Default constructor (contents undefined)
Definition: simd.h:675
friend vbool16 operator|(const vbool16 &a, const vbool16 &b)
Definition: simd.h:4091
vfloat16()
Default constructor (contents undefined)
Definition: simd.h:2882
GLfloat v0
Definition: glcorearb.h:816
vint8 lo() const
Extract the lower precision vint8.
Definition: simd.h:6154
vfloat4 vdot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b in every component.
Definition: simd.h:7496
static const vfloat8 Zero()
Return a vfloat8 with all components set to 0.0.
Definition: simd.h:8701
friend vint16 operator|(const vint16 &a, const vint16 &b)
Definition: simd.h:6278
vfloat4 rcp_fast(const vfloat4 &a)
Fast, approximate 1/a.
Definition: simd.h:7675
friend vint4 operator+(const vint4 &a, const vint4 &b)
Definition: simd.h:4502
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1311
friend vint16 operator>>(const vint16 &a, unsigned int bits)
Definition: simd.h:6323
Integer 16-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1548
vfloat16(const unsigned char *vals)
Construct from a pointer to unsigned char values.
Definition: simd.h:2927
static const char * type_name()
Definition: simd.h:666
void set_w(value_t val)
Definition: simd.h:9190
friend vbool4 operator^(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3423
GLint j
Definition: glad.h:2733
const vfloat3 & operator*=(const vfloat3 &a)
Definition: simd.h:8184
static const char * type_name()
Definition: simd.h:1862
OIIO_FORCEINLINE vint4 operator/(const vint4 &a, const vint4 &b)
Definition: simd.h:4581
vfloat16(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:2909
OIIO_FORCEINLINE vbool4 operator^(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3423
static const vint16 One()
Return an vint16 with all components set to 1.
Definition: simd.h:6135
vfloat16(const char *vals)
Construct from a pointer to char values.
Definition: simd.h:2930
vbool16 vbool_t
SIMD bool type.
Definition: simd.h:2874
vint4 hi() const
Extract the higher precision vint4.
Definition: simd.h:5376
vfloat3 transformp(const vfloat3 &V) const
Transform 3-point V by 4x4 matrix M.
Definition: simd.h:8377
OIIO_FORCEINLINE matrix44(const float *a, const float *b, const float *c, const float *d)
Construct from 4 float[4] rows.
Definition: simd.h:2456
friend vbool8 operator>(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9100
OIIO_FORCEINLINE vbool4 operator|(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3413
Vec3< float > V3f
Vec3 of float.
Definition: ImathVec.h:849
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
GLsizeiptr size
Definition: glcorearb.h:664
GLfloat GLfloat GLfloat GLfloat h
Definition: glcorearb.h:2002
void store_mask(const vbool_t &mask, value_t *values) const
Definition: simd.h:6049
static const vfloat3 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:8102
vint4 safe_mod(const vint4 &a, const vint4 &b)
Definition: simd.h:5057
friend const vint8 & operator/=(vint8 &a, const vint8 &b)
Definition: simd.h:5453
static const vfloat4 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:6716
friend const vfloat8 & operator-=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9036
IMATH_NAMESPACE::V2f IMATH_NAMESPACE::Box2i std::string this attribute is obsolete as of OpenEXR v3 float
void load(int a)
Helper: load a single int into all components.
Definition: simd.h:5918
void store(bool *values) const
Helper: store the values into memory as bools.
Definition: simd.h:3746
vint4 vreduce_add(const vint4 &v)
The sum of all components, returned in all components.
Definition: simd.h:4887
OIIO_FORCEINLINE const vint4 & operator<<=(vint4 &a, const unsigned int bits)
Definition: simd.h:4670
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
Definition: simd.h:3052
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:2867
OIIO_FORCEINLINE vbool4 operator<(const vint4 &a, const vint4 &b)
Definition: simd.h:4728
friend vbool8 operator|(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3802
friend const vint16 & operator*=(vint16 &a, const vint16 &b)
Definition: simd.h:6241
#define SIMD_DO(x)
Definition: simd.h:492
OIIO_FORCEINLINE vbool4 operator!=(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3467
const T * data() const noexcept
Return a pointer to the contiguous values comprising the matrix.
Definition: vecparam.h:376
vfloat4 ceil(const vfloat4 &a)
Definition: simd.h:7629
friend vbool16 operator==(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9936
vbool4 vbool_t
SIMD bool type.
Definition: simd.h:1870
void store(int *values) const
Store the values into memory.
Definition: simd.h:4348
void clear()
Sset all components to 0.
Definition: simd.h:6118
vfloat4 vfloat_t
float type of the same length
Definition: simd.h:967
GLenum GLsizei GLsizei GLint * values
Definition: glcorearb.h:1602
friend vint16 operator%(const vint16 &a, const vint16 &b)
Definition: simd.h:6253
value_t x() const
Definition: simd.h:5699
OIIO_FORCEINLINE vbool4 operator&(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3403
#define SIMD_CONSTRUCT(x)
Definition: simd.h:493
vfloat8 m_8[2]
Definition: simd.h:3088
vfloat4(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1896
OIIO_FORCEINLINE matrix44()
Definition: simd.h:2426
value_t * data()
Definition: simd.h:1312
friend const vint16 & operator-=(vint16 &a, const vint16 &b)
Definition: simd.h:6227
vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
Construct from 8 values.
Definition: simd.h:2567
static const vint4 Iota(int start=0, int step=1)
Definition: simd.h:4492
void load_mask(int mask, const value_t *values)
Definition: simd.h:8917
friend vfloat4 operator+(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7101
void clear()
Set all components to false.
Definition: simd.h:3714
int m_val[paddedelements]
Definition: simd.h:626
friend vbool8 operator&(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3794
static const vint16 NegOne()
Return an vint16 with all components set to -1 (aka 0xffffffff)
Definition: simd.h:6137
vfloat4 bitcast_to_float4(const vint4 &x)
Definition: simd.h:7454
simd_t m_simd
Definition: simd.h:775
GLuint index
Definition: glcorearb.h:786
int bitmask() const
Definition: simd.h:4023
vint4 lo() const
Extract the lower precision vint4.
Definition: simd.h:5368
friend vbool8 operator^(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3810
friend vint4 operator-(const vint4 &a)
Definition: simd.h:4517
vfloat4 nmadd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7810
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
vfloat3(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
Definition: simd.h:2303
void load_mask(int mask, const value_t *values)
Definition: simd.h:4362
friend vbool16 operator~(const vbool16 &a)
Definition: simd.h:4121
friend const vint16 & operator/=(vint16 &a, const vint16 &b)
Definition: simd.h:6250
vint4(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1009
friend vbool8 operator>=(const vint8 &a, const vint8 &b)
Definition: simd.h:5593
vfloat8 vfloat_t
SIMD int type.
Definition: simd.h:2551
void set_z(value_t val)
Definition: simd.h:7425
friend vint16 operator^(const vint16 &a, const vint16 &b)
Definition: simd.h:6288
friend vint8 operator>>(const vint8 &a, unsigned int bits)
Definition: simd.h:5526
GLfloat GLfloat v1
Definition: glcorearb.h:817
GLuint GLfloat * val
Definition: glcorearb.h:1608
int m_val[paddedelements]
Definition: simd.h:776
value_t m_val[paddedelements]
Definition: simd.h:2119
vint4()
Default constructor (contents undefined)
Definition: simd.h:976
const vbool4 & operator=(bool a)
Assign one value to all components.
Definition: simd.h:578
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
Definition: simd.h:4873
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vint4 operator<<(const vint4 &a, unsigned int bits)
Definition: simd.h:4660
vfloat4(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
Definition: simd.h:1934
GA_API const UT_StringHolder N
friend vint16 operator&(const vint16 &a, const vint16 &b)
Definition: simd.h:6268
simd_t m_simd
Definition: simd.h:625
simd_t simd() const
Definition: simd.h:858
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
const vint8 & operator=(int a)
Assign one value to all components.
Definition: simd.h:5227
void store_mask(int mask, value_t *values) const
Definition: simd.h:5267
static const vbool8 False()
Return a vbool8 the is 'false' for all values.
Definition: simd.h:3722
friend vint16 operator-(const vint16 &a)
Definition: simd.h:6209
value_t w() const
Definition: simd.h:5702
friend vbool4 operator&(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3403
friend vbool16 operator&(const vbool16 &a, const vbool16 &b)
Definition: simd.h:4083
vbool4 vbool_t
bool type of the same length
Definition: simd.h:966
friend vfloat8 operator%(const vfloat8 &a, const vfloat8 &b)
vbool8 hi() const
Extract the higher precision vbool8.
Definition: simd.h:4066
friend vint8 operator+(const vint8 &a, const vint8 &b)
Definition: simd.h:5398
friend const vbool8 & operator^=(vbool8 &a, const vbool8 &b)
Definition: simd.h:3827
static const char * name()
Definition: simd.h:407
value_t z() const
Definition: simd.h:4865
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
friend const vfloat16 & operator-=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9894
void set_z(value_t val)
Definition: simd.h:6515
friend vbool16 operator!=(const vbool16 &a, const vbool16 &b)
Definition: simd.h:4134
const vfloat16 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:2938
vint4 rotl32(const vint4 &x, const unsigned int k)
Definition: simd.h:5036
friend const vbool8 & operator&=(vbool8 &a, const vbool8 &b)
Definition: simd.h:3819
vint4 floori(const vfloat4 &a)
Definition: simd.h:2197
OIIO_FORCEINLINE const vint4 & operator-=(vint4 &a, const vint4 &b)
Definition: simd.h:4539
friend const vint4 & operator|=(vint4 &a, const vint4 &b)
Definition: simd.h:4633
vint4(const vint4 &other)
Copy construct from another vint4.
Definition: simd.h:1003
friend vbool16 operator>=(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9972
friend const vint16 & operator>>=(vint16 &a, unsigned int bits)
Definition: simd.h:6332
void setcomp(int i, int value)
Component access (set).
Definition: simd.h:5087
static const vint8 Iota(int start=0, int step=1)
Definition: simd.h:5357
friend vint4 operator>>(const vint4 &a, unsigned int bits)
Definition: simd.h:4675
OIIO_FORCEINLINE vint4 operator*(const vint4 &a, const vint4 &b)
Definition: simd.h:4566
OIIO_FORCEINLINE const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3434
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:2917
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
GLubyte GLubyte GLubyte GLubyte w
Definition: glcorearb.h:857
friend vbool8 operator==(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9076
static const vint8 Giota()
Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
Definition: simd.h:5363
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER IMATH_HOSTDEVICE constexpr T abs(T a) IMATH_NOEXCEPT
Definition: ImathFun.h:26
friend const vfloat8 & operator*=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9060
friend vfloat4 operator*(const vfloat4 &V, const matrix44 &M)
Definition: simd.h:8422
friend const vint8 & operator^=(vint8 &a, const vint8 &b)
Definition: simd.h:5499
vfloat3(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
Definition: simd.h:2309
const vfloat4 & operator+=(const vfloat4 &a)
Definition: simd.h:7111
friend vint4 operator*(const vint4 &a, const vint4 &b)
Definition: simd.h:4566
void load(bool a)
Helper: load a single value into all components.
Definition: simd.h:3953
friend const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3438
simd_t m_simd
Definition: simd.h:930
static const char * name()
Definition: simd.h:408
friend vbool4 operator~(const vbool4 &a)
Definition: simd.h:3446
void set_z(value_t val)
Definition: simd.h:4869
const vfloat3 & operator+=(const vfloat3 &a)
Definition: simd.h:8156
void store_mask(const vbool_t &mask, value_t *values) const
Definition: simd.h:9804
GLboolean r
Definition: glcorearb.h:1222
friend vbool4 operator<=(const vint4 &a, const vint4 &b)
Definition: simd.h:4742
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:127
friend const vint16 & operator|=(vint16 &a, const vint16 &b)
Definition: simd.h:6286
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:6741
vint4 min(const vint4 &a, const vint4 &b)
Definition: simd.h:5001
friend const vint8 & operator&=(vint8 &a, const vint8 &b)
Definition: simd.h:5479
friend std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
Stream output.
Definition: simd.h:3277
static const char * name()
Definition: simd.h:409
bool value_t
Underlying equivalent scalar value type.
Definition: simd.h:816
bool value_t
Underlying equivalent scalar value type.
Definition: simd.h:667
vfloat4 hi() const
Extract the higher precision vfloat4.
Definition: simd.h:8670
void load_mask(const vbool_t &mask, const value_t *values)
Definition: simd.h:9794
OIIO_FORCEINLINE T log(const T &v)
Definition: simd.h:7905
friend vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7300
friend vfloat8 operator*(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:9052
friend vbool8 operator<=(const vint8 &a, const vint8 &b)
Definition: simd.h:5599
simd_t & simd()
Definition: simd.h:703
OIIO_FORCEINLINE vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7292
vfloat4 rsqrt(const vfloat4 &a)
Fully accurate 1/sqrt.
Definition: simd.h:7706
vbool16(bool a)
Construct from a single value (store it in all slots)
Definition: simd.h:827
value_t y() const
Definition: simd.h:10057
void set_z(value_t val)
Definition: simd.h:9189
friend vbool4 operator<=(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7282
static const vint8 One()
Return an vint8 with all components set to 1.
Definition: simd.h:5352
simd_t simd() const
Definition: simd.h:1013
void load(int a)
Helper: load a single int into all components.
Definition: simd.h:4198
vint4 vint_t
int type of the same length
Definition: simd.h:968
vfloat16(const short *vals)
Construct from a pointer to short values.
Definition: simd.h:2924
OIIO_FORCEINLINE matrix44(float f00, float f01, float f02, float f03, float f10, float f11, float f12, float f13, float f20, float f21, float f22, float f23, float f30, float f31, float f32, float f33)
Construct from 16 floats.
Definition: simd.h:2465
vint4 rint(const vfloat4 &a)
Definition: simd.h:7669
void setcomp(int i, float value)
Component access (set).
simd_raw_t< float, 16 >::type simd_t
the native SIMD type used
Definition: simd.h:2871
vfloat4 sign(const vfloat4 &a)
1.0 when value >= 0, -1 when negative
Definition: simd.h:7622
simd_t & simd()
Definition: simd.h:2914
static const vint16 Zero()
Return an vint16 with all components set to 0.
Definition: simd.h:6127
value_t * data()
Definition: simd.h:1612
value_t m_val[elements]
Definition: simd.h:1772
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vbool16 operator>(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9963
OIIO_FORCEINLINE const vint4 & operator*=(vint4 &a, const vint4 &b)
Definition: simd.h:4577
bool all(const vbool4 &v)
Definition: simd.h:3599
void set_z(value_t val)
Definition: simd.h:10062
void store(float *values) const
Definition: simd.h:8867
static constexpr int elements
Definition: simd.h:2423
friend const vint4 & operator/=(vint4 &a, const vint4 &b)
Definition: simd.h:4587
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
vfloat8(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:2583
vint4 andnot(const vint4 &a, const vint4 &b)
andnot(a,b) returns ((~a) & b)
Definition: simd.h:5041
vfloat3(const char *vals)
Construct from a pointer to 4 char values.
Definition: simd.h:2312
void store(int *values) const
Store the values into memory.
Definition: simd.h:6105
OIIO_FORCEINLINE matrix44(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d)
Construct from 4 vfloat4 rows.
Definition: simd.h:2448
const float * data() const
Definition: simd.h:2517
void normalize()
Normalize in place.
Definition: simd.h:2399
static const vint16 Iota(int start=0, int step=1)
Definition: simd.h:6140
friend vbool4 operator>=(const vint4 &a, const vint4 &b)
Definition: simd.h:4738
bool operator==(const matrix44 &m) const
Definition: simd.h:8460
simd_t & simd()
Definition: simd.h:1913
vfloat4 xyz1() const
Return xyz components, plus 1 for w.
Definition: simd.h:7312
static const char * type_name()
Definition: simd.h:2420
const vfloat3 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:2320
Is a type T one of our SIMD-based types?
Definition: simd.h:413
static const vfloat16 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:9609
OIIO_FORCEINLINE vint4 operator-(const vint4 &a)
Definition: simd.h:4517
friend vbool4 operator==(const vint4 &a, const vint4 &b)
Definition: simd.h:4703
friend std::ostream & operator<<(std::ostream &cout, const vfloat16 &val)
Stream output.
Definition: simd.h:9532
friend std::ostream & operator<<(std::ostream &cout, const vbool8 &a)
Stream output.
Definition: simd.h:3629
friend std::ostream & operator<<(std::ostream &cout, const vbool16 &a)
Stream output.
Definition: simd.h:3945
vfloat8(const unsigned char *vals)
Construct from a pointer to unsigned char values.
Definition: simd.h:2601
Definition: format.h:1821
friend vbool16 operator==(const vint16 &a, const vint16 &b)
Definition: simd.h:6347
friend vint16 operator~(const vint16 &a)
Definition: simd.h:6299
int operator[](int i) const
Component access (get)
Definition: simd.h:3256
vint4 ifloor(const vfloat4 &a)
(int)floor
Definition: simd.h:7658
friend vbool4 operator==(const vbool4 &a, const vbool4 &b)
Comparison operators, component by component.
Definition: simd.h:3457
float dot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b.
Definition: simd.h:7508
Definition: format.h:4365
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vbool8 operator==(const vint8 &a, const vint8 &b)
Definition: simd.h:5550
OIIO_FORCEINLINE vint4 operator+(const vint4 &a, const vint4 &b)
Definition: simd.h:4502
OIIO_FORCEINLINE const vint4 & operator%=(vint4 &a, const vint4 &b)
Definition: simd.h:4596
value_t x() const
Definition: simd.h:4863
friend vbool4 operator>(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7262
vint4 AxBxCxDx(const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d)
Definition: simd.h:8065
vint8 hi() const
Extract the higher precision vint8.
Definition: simd.h:6162
OIIO_FORCEINLINE matrix44(const matrix44 &M)
Copy constructor.
Definition: simd.h:2429
friend vint8 operator%(const vint8 &a, const vint8 &b)
Definition: simd.h:5456
void setcomp(int i, bool value)
Component access (set).
Definition: simd.h:3271
vfloat4 xyz0() const
Return xyz components, plus 0 for w.
Definition: simd.h:7308
simd_bool_t< 4 >::type simd_t
the native SIMD type used
Definition: simd.h:529
simd_t m_simd
Definition: simd.h:3086
void setcomp(int i, int value)
Component access (set).
Definition: simd.h:5912
int reduce_add(const vint4 &v)
Definition: simd.h:4915
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:126
GLenum src
Definition: glcorearb.h:1793
friend std::ostream & operator<<(std::ostream &cout, const vfloat3 &val)
Stream output.
Definition: simd.h:8205
void store(float *values) const
Definition: simd.h:8137
friend const vint8 & operator<<=(vint8 &a, unsigned int bits)
Definition: simd.h:5522
static const char * type_name()
Definition: simd.h:960
friend vbool4 operator>=(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7272
friend const vint8 & operator|=(vint8 &a, const vint8 &b)
Definition: simd.h:5489