HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
simd.h
Go to the documentation of this file.
1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio
4 
5 /// @file simd.h
6 ///
7 /// @brief Classes for SIMD processing.
8 ///
9 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.):
10 /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
11 ///
12 /// Similar guide for ARM intrinsics:
13 /// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
14 ///
15 /// It helped me a lot to peruse the source of these packages:
16 /// Syrah: https://github.com/boulos/syrah
17 /// Embree: https://github.com/embree
18 /// Vectorial: https://github.com/scoopr/vectorial
19 ///
20 /// To find out which CPU features you have:
21 /// Linux: cat /proc/cpuinfo
22 /// OSX: sysctl machdep.cpu.features
23 ///
24 /// Additional web resources:
25 /// http://www.codersnotes.com/notes/maths-lib-2016/
26 
27 // clang-format off
28 
29 #pragma once
30 
31 #include <algorithm>
32 #include <cstring>
33 
34 #include <OpenImageIO/Imath.h>
35 #include <OpenImageIO/dassert.h>
36 #include <OpenImageIO/platform.h>
37 
38 
39 //////////////////////////////////////////////////////////////////////////
40 // Sort out which SIMD capabilities we have and set definitions
41 // appropriately. This is mostly for internal (within this file) use,
42 // but client applications using this header may find a few of the macros
43 // we define to be useful:
44 //
45 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD
46 // hardware is available, this will hold the width in number of
47 // float SIMD "lanes" of widest SIMD registers available. For
48 // example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are
49 // hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated,
50 // etc. Using SIMD classes wider than this should work (will be
51 // emulated with narrower SIMD or scalar operations), but is not
52 // expected to have high performance.
53 // OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
54 // specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
55 // higher (including AVX).
56 // OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
57 // specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
58 // OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
59 // OIIO_SIMD_MAX_SIZE_BYTES : holds the width in bytes of the widest SIMD
60 // available (generally will be OIIO_SIMD*4).
61 // OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
62 // OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
63 // OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
64 // OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
65 // OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
66 // OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined
67 
68 #if defined(__CUDA_ARCH__)
69  // Cuda -- don't include any of these headers
70 #elif defined(_WIN32)
71 # include <intrin.h>
72 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
73 # include <x86intrin.h>
74 #elif defined(__GNUC__) && defined(__ARM_NEON__)
75 # include <arm_neon.h>
76 #endif
77 
78 // Disable SSE for 32 bit Windows platforms, it's unreliable and hard for us
79 // to test thoroughly. We presume that anybody needing high performance
80 // badly enough to want SIMD also is on a 64 bit CPU.
81 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
82 #define OIIO_NO_SSE 1
83 #endif
84 
85 // Make sure to disable SSE intrinsics when compiling for Cuda.
86 #if defined(__CUDA_ARCH__) && !defined(OIIO_NO_SSE)
87 #define OIIO_NO_SSE 1
88 #endif
89 
90 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
91 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
92 # define OIIO_SIMD_SSE 4
93  /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
94  * instructions specific to 4.2, but they are all related to string
95  * comparisons and CRCs, which don't currently seem relevant to OIIO,
96  * so for simplicity, we sweep this difference under the rug.
97  */
98 # elif defined(__SSSE3__)
99 # define OIIO_SIMD_SSE 3
100  /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
101  * there are a few older architectures that are SSE3 but not SSSE3,
102  * and this simplification means that these particular old platforms
103  * will only get SSE2 goodness out of our code. So be it. Anybody who
104  * cares about performance is probably using a 64 bit machine that's
105  * SSE 4.x or AVX by now.
106  */
107 # else
108 # define OIIO_SIMD_SSE 2
109 # endif
110 # define OIIO_SIMD 4
111 # define OIIO_SIMD_MAX_SIZE_BYTES 16
112 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
113 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
114 #else
115 # define OIIO_SIMD_SSE 0
116 #endif
117 
118 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
119  // N.B. Any machine with AVX will also have SSE
120 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
121 # define OIIO_SIMD_AVX 2
122 # else
123 # define OIIO_SIMD_AVX 1
124 # endif
125 # undef OIIO_SIMD
126 # define OIIO_SIMD 8
127 # undef OIIO_SIMD_MAX_SIZE_BYTES
128 # define OIIO_SIMD_MAX_SIZE_BYTES 32
129 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
130 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
131 # if defined(__AVX512F__)
132 # undef OIIO_SIMD_AVX
133 # define OIIO_SIMD_AVX 512
134 # undef OIIO_SIMD_MAX_SIZE_BYTES
135 # define OIIO_SIMD_MAX_SIZE_BYTES 64
136 # undef OIIO_SIMD
137 # define OIIO_SIMD 16
138 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
139 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
140 # define OIIO_AVX512F_ENABLED 1
141 # endif
142 # if defined(__AVX512DQ__)
143 # define OIIO_AVX512DQ_ENABLED 1 /* Doubleword and quadword */
144 # else
145 # define OIIO_AVX512DQ_ENABLED 0
146 # endif
147 # if defined(__AVX512PF__)
148 # define OIIO_AVX512PF_ENABLED 1 /* Prefetch */
149 # else
150 # define OIIO_AVX512PF_ENABLED 0
151 # endif
152 # if defined(__AVX512ER__)
153 # define OIIO_AVX512ER_ENABLED 1 /* Exponential & reciprocal */
154 # else
155 # define OIIO_AVX512ER_ENABLED 0
156 # endif
157 # if defined(__AVX512CD__)
158 # define OIIO_AVX512CD_ENABLED 1 /* Conflict detection */
159 # else
160 # define OIIO_AVX512CD_ENABLED 0
161 # endif
162 # if defined(__AVX512BW__)
163 # define OIIO_AVX512BW_ENABLED 1 /* Byte and word */
164 # else
165 # define OIIO_AVX512BW_ENABLED 0
166 # endif
167 # if defined(__AVX512VL__)
168 # define OIIO_AVX512VL_ENABLED 1 /* Vector length extensions */
169 # else
170 # define OIIO_AVX512VL_ENABLED 0
171 # endif
172 #else
173 # define OIIO_SIMD_AVX 0
174 # define OIIO_AVX512VL_ENABLED 0
175 # define OIIO_AVX512DQ_ENABLED 0
176 # define OIIO_AVX512PF_ENABLED 0
177 # define OIIO_AVX512ER_ENABLED 0
178 # define OIIO_AVX512CD_ENABLED 0
179 # define OIIO_AVX512BW_ENABLED 0
180 #endif
181 
182 #if defined(__FMA__)
183 # define OIIO_FMA_ENABLED 1
184 #else
185 # define OIIO_FMA_ENABLED 0
186 #endif
187 #if defined(__AVX512IFMA__)
188 # define OIIO_AVX512IFMA_ENABLED 1
189 #else
190 # define OIIO_AVX512IFMA_ENABLED 0
191 #endif
192 
193 #if defined(__F16C__)
194 # define OIIO_F16C_ENABLED 1
195 #else
196 # define OIIO_F16C_ENABLED 0
197 #endif
198 
199 // ARM NEON support (enabled unless OIIO_NO_NEON is defined).
200 // FIXME: still needs verification by somebody with NEON hardware.
201 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
202 # define OIIO_SIMD 4
203 # define OIIO_SIMD_NEON 1
204 # define OIIO_SIMD_MAX_SIZE_BYTES 16
205 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
206 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
207 #else
208 # define OIIO_SIMD_NEON 0
209 #endif
210 
211 #ifndef OIIO_SIMD
212  // No SIMD available
213 # define OIIO_SIMD 0
214 # define OIIO_SIMD4_ALIGN
215 # define OIIO_SIMD_MAX_SIZE_BYTES 16
216 #endif
217 
218 #ifndef OIIO_SIMD8_ALIGN
219 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
220 #endif
221 #ifndef OIIO_SIMD16_ALIGN
222 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
223 #endif
224 
225 
226 // General features that client apps may want to test for, for conditional
227 // compilation. Will add to this over time as needed. Note that just
228 // because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
229 // the vfloat8 class (and friends) are in this version of simd.h, but that's
230 // different from OIIO_SIMD >= 8, which means it's supported in hardware.
231 #define OIIO_SIMD_HAS_MATRIX4 1 /* matrix44 defined */
232 #define OIIO_SIMD_HAS_FLOAT8 1 /* DEPRECATED(1.8) */
233 #define OIIO_SIMD_HAS_SIMD8 1 /* vfloat8, vint8, vbool8 defined */
234 #define OIIO_SIMD_HAS_SIMD16 1 /* vfloat16, vint16, vbool16 defined */
235 
236 
237 // Embarrassing hack: Xlib.h #define's True and False!
238 #ifdef True
239 # undef True
240 #endif
241 #ifdef False
242 # undef False
243 #endif
244 
245 
246 
248 
249 namespace simd {
250 
251 //////////////////////////////////////////////////////////////////////////
252 // Forward declarations of our main SIMD classes
253 
254 class vbool4;
255 class vint4;
256 class vfloat4;
257 class vfloat3;
258 class matrix44;
259 class vbool8;
260 class vint8;
261 class vfloat8;
262 class vbool16;
263 class vint16;
264 class vfloat16;
265 
266 // Deprecated names -- remove these in 1.9
267 typedef vbool4 mask4; // old name
268 typedef vbool4 bool4;
269 typedef vbool8 bool8;
270 typedef vint4 int4;
271 typedef vint8 int8;
272 typedef vfloat3 float3;
273 typedef vfloat4 float4;
274 typedef vfloat8 float8;
275 
276 
277 
278 //////////////////////////////////////////////////////////////////////////
279 // Template magic to determine the raw SIMD types involved, and other
280 // things helpful for metaprogramming.
281 
282 template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
283 template <int N> struct simd_bool_t { struct type { int val[N]; }; };
284 
285 #if OIIO_SIMD_SSE
286 template<> struct simd_raw_t<int,4> { typedef __m128i type; };
287 template<> struct simd_raw_t<float,4> { typedef __m128 type; };
288 template<> struct simd_bool_t<4> { typedef __m128 type; };
289 #endif
290 
291 #if OIIO_SIMD_AVX
292 template<> struct simd_raw_t<int,8> { typedef __m256i type; };
293 template<> struct simd_raw_t<float,8> { typedef __m256 type; };
294 template<> struct simd_bool_t<8> { typedef __m256 type; };
295 #endif
296 
297 #if OIIO_SIMD_AVX >= 512
298 template<> struct simd_raw_t<int,16> { typedef __m512i type; };
299 template<> struct simd_raw_t<float,16> { typedef __m512 type; };
300 template<> struct simd_bool_t<16> { typedef __mmask16 type; };
301 #else
302 // Note: change in strategy for 16-wide SIMD: instead of int[16] for
303 // vbool16, it's just a plain old bitmask, and __mmask16 for actual HW.
304 template<> struct simd_bool_t<16> { typedef uint16_t type; };
305 #endif
306 
307 #if OIIO_SIMD_NEON
308 template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
309 template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
310 template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
311 #endif
312 
313 
314 /// Template to retrieve the vector type from the scalar. For example,
315 /// simd::VecType<int,4>::type will be vint4.
316 template<typename T,int elements> struct VecType {};
317 template<> struct VecType<int,1> { typedef int type; };
318 template<> struct VecType<float,1> { typedef float type; };
319 template<> struct VecType<int,4> { typedef vint4 type; };
320 template<> struct VecType<float,4> { typedef vfloat4 type; };
321 template<> struct VecType<float,3> { typedef vfloat3 type; };
322 template<> struct VecType<bool,4> { typedef vbool4 type; };
323 template<> struct VecType<int,8> { typedef vint8 type; };
324 template<> struct VecType<float,8> { typedef vfloat8 type; };
325 template<> struct VecType<bool,8> { typedef vbool8 type; };
326 template<> struct VecType<int,16> { typedef vint16 type; };
327 template<> struct VecType<float,16> { typedef vfloat16 type; };
328 template<> struct VecType<bool,16> { typedef vbool16 type; };
329 
330 /// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
331 /// anything but our SIMD types.
332 template<typename T> struct SimdSize { static const int size = 1; };
333 template<> struct SimdSize<vint4> { static const int size = 4; };
334 template<> struct SimdSize<vfloat4> { static const int size = 4; };
335 template<> struct SimdSize<vfloat3> { static const int size = 4; };
336 template<> struct SimdSize<vbool4> { static const int size = 4; };
337 template<> struct SimdSize<vint8> { static const int size = 8; };
338 template<> struct SimdSize<vfloat8> { static const int size = 8; };
339 template<> struct SimdSize<vbool8> { static const int size = 8; };
340 template<> struct SimdSize<vint16> { static const int size = 16; };
341 template<> struct SimdSize<vfloat16> { static const int size = 16; };
342 template<> struct SimdSize<vbool16> { static const int size = 16; };
343 
344 /// Template to retrieve the number of elements of a SIMD type (e.g. 3 for
345 /// vfloat3, whose SimdSize is 4). Rigged to be 1 for anything but our SIMD types.
346 template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
347 template<> struct SimdElements<vfloat3> { static const int size = 3; };
348 
349 /// Template giving a printable name for each type
350 template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
351 template<> struct SimdTypeName<vfloat4> { static const char *name() { return "vfloat4"; } };
352 template<> struct SimdTypeName<vint4> { static const char *name() { return "vint4"; } };
353 template<> struct SimdTypeName<vbool4> { static const char *name() { return "vbool4"; } };
354 template<> struct SimdTypeName<vfloat8> { static const char *name() { return "vfloat8"; } };
355 template<> struct SimdTypeName<vint8> { static const char *name() { return "vint8"; } };
356 template<> struct SimdTypeName<vbool8> { static const char *name() { return "vbool8"; } };
357 template<> struct SimdTypeName<vfloat16> { static const char *name() { return "vfloat16"; } };
358 template<> struct SimdTypeName<vint16> { static const char *name() { return "vint16"; } };
359 template<> struct SimdTypeName<vbool16> { static const char *name() { return "vbool16"; } };
360 
361 
362 //////////////////////////////////////////////////////////////////////////
363 // Macros helpful for making static constants in code.
364 
365 # define OIIO_SIMD_FLOAT4_CONST(name,val) \
366  static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
367 # define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
368  static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
369 # define OIIO_SIMD_INT4_CONST(name,val) \
370  static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
371 # define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
372  static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
373 # define OIIO_SIMD_UINT4_CONST(name,val) \
374  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
375 # define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
376  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
377 
378 # define OIIO_SIMD_FLOAT8_CONST(name,val) \
379  static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
380  (val), (val), (val), (val) }
381 # define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
382  static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
383  (v4), (v5), (v6), (v7) }
384 # define OIIO_SIMD_INT8_CONST(name,val) \
385  static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
386  (val), (val), (val), (val) }
387 # define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
388  static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
389  (v4), (v5), (v6), (v7) }
390 # define OIIO_SIMD_UINT8_CONST(name,val) \
391  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
392  (val), (val), (val), (val) }
393 # define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
394  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
395  (v4), (v5), (v6), (v7) }
396 
397 # define OIIO_SIMD_VFLOAT16_CONST(name,val) \
398  static const OIIO_SIMD16_ALIGN float name[16] = { \
399  (val), (val), (val), (val), (val), (val), (val), (val), \
400  (val), (val), (val), (val), (val), (val), (val), (val) }
401 # define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
402  static const OIIO_SIMD16_ALIGN float name[16] = { \
403  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
404  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
405 # define OIIO_SIMD_INT16_CONST(name,val) \
406  static const OIIO_SIMD16_ALIGN int name[16] = { \
407  (val), (val), (val), (val), (val), (val), (val), (val), \
408  (val), (val), (val), (val), (val), (val), (val), (val) }
409 # define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
410  static const OIIO_SIMD16_ALIGN int name[16] = { \
411  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
412  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
413 # define OIIO_SIMD_UINT16_CONST(name,val) \
414  static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
415  (val), (val), (val), (val), (val), (val), (val), (val), \
416  (val), (val), (val), (val), (val), (val), (val), (val) }
417 # define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
418  static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
419  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
420  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) } /* one initializer per lane, v0..v15 in order (previously expanded an undeclared `val`) */
421 
422 
423 //////////////////////////////////////////////////////////////////////////
424 // Some macros just for use in this file (#undef-ed at the end) making
425 // it more succinct to express per-element operations.
426 
427 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
428 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
429 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
430  for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
431 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
432 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
433 
434 
435 
436 //////////////////////////////////////////////////////////////////////////
437 //////////////////////////////////////////////////////////////////////////
438 // The public declarations of the main SIMD classes follow: vboolN, vintN,
439 // vfloatN, matrix44.
440 //
441 // These class declarations are intended to be brief and self-documenting,
442 // and give all the information that users or client applications need to
443 // know to use these classes.
444 //
445 // No implementations are given inline except for the briefest, completely
446 // generic methods that don't have any architecture-specific overloads.
447 // After the class definitions, there will be an immense pile of full
448 // implementation definitions, which casual users are not expected to
449 // understand.
450 //////////////////////////////////////////////////////////////////////////
451 //////////////////////////////////////////////////////////////////////////
452 
453 
454 /// vbool4: A 4-vector whose elements act mostly like bools, accelerated by
455 /// SIMD instructions when available. This is what is naturally produced by
456 /// SIMD comparison operators on the vfloat4 and vint4 types.
457 class vbool4 {
458 public:
459  static const char* type_name() { return "vbool4"; } ///< Printable name of this type
460  typedef bool value_t; ///< Underlying equivalent scalar value type
461  enum { elements = 4 }; ///< Number of scalar elements
462  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
463  enum { bits = elements*32 }; ///< Total number of bits
464  typedef simd_bool_t<4>::type simd_t; ///< the native SIMD type used
465 
466  /// Default constructor (contents undefined)
467  vbool4 () { }
468 
469  /// Construct from a single value (store it in all slots)
470  vbool4 (bool a) { load(a); }
471 
472  explicit vbool4 (const bool *a); ///< Construct from a pointer to bools (presumably 4 values -- TODO confirm)
473 
474  /// Construct from 4 bool values
475  vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
476 
477  /// Copy construct from another vbool4
478  vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
479 
480  /// Construct from 4 int values (each is converted to bool before loading)
481  vbool4 (int a, int b, int c, int d) {
482  load (bool(a), bool(b), bool(c), bool(d));
483  }
484 
485  /// Construct from a SIMD int (is each element nonzero?)
486  vbool4 (const vint4 &i);
487 
488  /// Construct from the underlying SIMD type
489  vbool4 (const simd_t& m) : m_simd(m) { }
490 
491  /// Return the raw SIMD type
492  operator simd_t () const { return m_simd; }
493  simd_t simd () const { return m_simd; }
494  simd_t& simd () { return m_simd; }
495 
496  /// Extract the bitmask
497  int bitmask () const;
498 
499  /// Convert from integer bitmask to a true vbool4
500  static vbool4 from_bitmask (int bitmask);
501 
502  /// Set all components to false
503  void clear ();
504 
505  /// Return a vbool4 that is 'false' for all values
506  static const vbool4 False ();
507 
508  /// Return a vbool4 that is 'true' for all values
509  static const vbool4 True ();
510 
511  /// Assign one value to all components
512  const vbool4 & operator= (bool a) { load(a); return *this; }
513 
514  /// Assignment of another vbool4
515  const vbool4 & operator= (const vbool4 & other);
516 
517  /// Component access (get)
518  int operator[] (int i) const;
519 
520  /// Component access (set).
521  void setcomp (int i, bool value);
522 
523  /// Component access (set).
524  /// NOTE: avoid this unsafe construct. It will go away some day.
525  int& operator[] (int i);
526 
527  /// Helper: load a single value into all components.
528  void load (bool a);
529 
530  /// Helper: load separate values into each component.
531  void load (bool a, bool b, bool c, bool d);
532 
533  /// Helper: store the values into memory as bools.
534  void store (bool *values) const;
535 
536  /// Store the first n values into memory.
537  void store (bool *values, int n) const;
538 
539  /// Logical/bitwise operators, component-by-component
540  friend vbool4 operator! (const vbool4& a);
541  friend vbool4 operator& (const vbool4& a, const vbool4& b);
542  friend vbool4 operator| (const vbool4& a, const vbool4& b);
543  friend vbool4 operator^ (const vbool4& a, const vbool4& b);
544  friend vbool4 operator~ (const vbool4& a);
545  friend const vbool4& operator&= (vbool4& a, const vbool4& b);
546  friend const vbool4& operator|= (vbool4& a, const vbool4& b);
547  friend const vbool4& operator^= (vbool4& a, const vbool4& b);
548 
549  /// Comparison operators, component by component
550  friend vbool4 operator== (const vbool4& a, const vbool4& b);
551  friend vbool4 operator!= (const vbool4& a, const vbool4& b);
552 
553  /// Stream output
554  friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);
555 
556 private:
557  // The actual data representation
558  union { // NOTE(review): union members are not visible in this listing (m_simd is used above) -- restore from the original header
561  };
562 };
563 
563 
564 
565 
566 /// Helper: shuffle/swizzle with constant (templated) indices.
567 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
568 template<int i0, int i1, int i2, int i3>
569 OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
570 
571 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
572 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
573 
574 /// Helper: as rapid as possible extraction of one component, when the
575 /// index is fixed.
576 template<int i> OIIO_FORCEINLINE bool extract (const vbool4& a);
577 
578 /// Helper: substitute val for a[i]
579 template<int i> OIIO_FORCEINLINE vbool4 insert (const vbool4& a, bool val);
580 
581 /// Logical reduction across all components.
582 bool reduce_and (const vbool4& v);
583 bool reduce_or (const vbool4& v);
584 
585 // Are all/any/no components true?
586 bool all (const vbool4& v);
587 bool any (const vbool4& v);
588 bool none (const vbool4& v);
589 
590 // It's handy to have this defined for regular bool as well
591 inline bool all (bool v) { return v; }
592 
593 
594 
595 /// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
596 /// SIMD instructions when available. This is what is naturally produced by
597 /// SIMD comparison operators on the vfloat8 and vint8 types.
598 class vbool8 {
599 public:
600  static const char* type_name() { return "vbool8"; } ///< Printable name of this type
601  typedef bool value_t; ///< Underlying equivalent scalar value type
602  enum { elements = 8 }; ///< Number of scalar elements
603  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
604  enum { bits = elements*32 }; ///< Total number of bits
605  typedef simd_bool_t<8>::type simd_t; ///< the native SIMD type used
606 
607  /// Default constructor (contents undefined)
608  vbool8 () { }
609 
610  /// Construct from a single value (store it in all slots)
611  vbool8 (bool a) { load (a); }
612 
613  explicit vbool8 (const bool *values); ///< Construct from a pointer to bools (presumably 8 values -- TODO confirm)
614 
615  /// Construct from 8 bool values
616  vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
617 
618  /// Copy construct from another vbool8
619  vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
620 
621  /// Construct from 8 int values
622  vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);
623 
624  /// Construct from a SIMD int (is each element nonzero?)
625  vbool8 (const vint8 &i);
626 
627  /// Construct from two vbool4's
628  vbool8 (const vbool4 &lo, const vbool4 &hi);
629 
630  /// Construct from the underlying SIMD type
631  vbool8 (const simd_t& m) : m_simd(m) { }
632 
633  /// Return the raw SIMD type
634  operator simd_t () const { return m_simd; }
635  simd_t simd () const { return m_simd; }
636  simd_t& simd () { return m_simd; }
637 
638  /// Extract the bitmask
639  int bitmask () const;
640 
641  /// Convert from integer bitmask to a true vbool8
642  static vbool8 from_bitmask (int bitmask);
643 
644  /// Set all components to false
645  void clear ();
646 
647  /// Return a vbool8 that is 'false' for all values
648  static const vbool8 False ();
649 
650  /// Return a vbool8 that is 'true' for all values
651  static const vbool8 True ();
652 
653  /// Assign one value to all components
654  const vbool8 & operator= (bool a);
655 
656  /// Assignment of another vbool8
657  const vbool8 & operator= (const vbool8 & other);
658 
659  /// Component access (get)
660  int operator[] (int i) const;
661 
662  /// Component access (set).
663  void setcomp (int i, bool value);
664 
665  /// Component access (set).
666  /// NOTE: avoid this unsafe construct. It will go away some day.
667  int& operator[] (int i);
668 
669  /// Extract the low half as a vbool4 (presumably elements 0..3 -- TODO confirm)
670  vbool4 lo () const;
671 
672  /// Extract the high half as a vbool4 (presumably elements 4..7 -- TODO confirm)
673  vbool4 hi () const;
674 
675  /// Helper: load a single value into all components.
676  void load (bool a);
677 
678  /// Helper: load separate values into each component.
679  void load (bool a, bool b, bool c, bool d,
680  bool e, bool f, bool g, bool h);
681 
682  /// Helper: store the values into memory as bools.
683  void store (bool *values) const;
684 
685  /// Store the first n values into memory.
686  void store (bool *values, int n) const;
687 
688  /// Logical/bitwise operators, component-by-component
689  friend vbool8 operator! (const vbool8& a);
690  friend vbool8 operator& (const vbool8& a, const vbool8& b);
691  friend vbool8 operator| (const vbool8& a, const vbool8& b);
692  friend vbool8 operator^ (const vbool8& a, const vbool8& b);
693  friend vbool8 operator~ (const vbool8& a);
694  friend const vbool8& operator&= (vbool8& a, const vbool8& b);
695  friend const vbool8& operator|= (vbool8& a, const vbool8& b);
696  friend const vbool8& operator^= (vbool8& a, const vbool8& b);
697 
698  /// Comparison operators, component by component
699  friend vbool8 operator== (const vbool8& a, const vbool8& b);
700  friend vbool8 operator!= (const vbool8& a, const vbool8& b);
701 
702  /// Stream output
703  friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);
704 
705 private:
706  // The actual data representation
707  union { // NOTE(review): union members are not visible in this listing (m_simd is used above) -- restore from the original header
711  };
712 };
713 
714 
715 
716 /// Helper: shuffle/swizzle with constant (templated) indices.
717 /// Example: shuffle<1,1,2,2,5,5,6,6>(vbool8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
718 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
719 OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
720 
721 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
722 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
723 
724 /// Helper: as rapid as possible extraction of one component, when the
725 /// index is fixed.
726 template<int i> OIIO_FORCEINLINE bool extract (const vbool8& a);
727 
728 /// Helper: substitute val for a[i]
729 template<int i> OIIO_FORCEINLINE vbool8 insert (const vbool8& a, bool val);
730 
731 /// Logical reduction across all components.
732 bool reduce_and (const vbool8& v);
733 bool reduce_or (const vbool8& v);
734 
735 // Are all/any/no components true?
736 bool all (const vbool8& v);
737 bool any (const vbool8& v);
738 bool none (const vbool8& v);
739 
740 
741 
742 
743 /// vbool16: A 16-vector whose elements act mostly like bools, accelerated
744 /// by SIMD instructions when available. This is what is naturally produced
745 /// by SIMD comparison operators on the vfloat16 and vint16 types.
746 class vbool16 {
747 public:
748  static const char* type_name() { return "vbool16"; }
749  typedef bool value_t; ///< Underlying equivalent scalar value type
750  enum { elements = 16 }; ///< Number of scalar elements
751  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
752  enum { bits = 16 }; ///< Total number of bits
753  typedef simd_bool_t<16>::type simd_t; ///< the native SIMD type used
754 
755  /// Default constructor (contents undefined)
756  vbool16 () { }
757 
758  /// Construct from a single value (store it in all slots)
759  vbool16 (bool a) { load (a); }
760 
761  explicit vbool16 (int bitmask) { load_bitmask (bitmask); }
762 
763  explicit vbool16 (const bool *values);
764 
765  /// Construct from 16 bool values
766  vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
767  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
768 
769  /// Copy construct from another vbool16
770  vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
771 
772  /// Construct from 16 int values
773  vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
774  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
775 
776  /// Construct from a SIMD int (is each element nonzero?)
777  vbool16 (const vint16 &i);
778 
779  /// Construct from two vbool8's
780  vbool16 (const vbool8 &lo, const vbool8 &hi);
781 
782  /// Construct from four vbool4's
783  vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);
784 
785  /// Construct from the underlying SIMD type
786  vbool16 (const simd_t& m) : m_simd(m) { }
787 
788  /// Return the raw SIMD type
789  operator simd_t () const { return m_simd; }
790  simd_t simd () const { return m_simd; }
791  simd_t& simd () { return m_simd; }
792 
793  int bitmask () const;
794 
795  /// Convert from integer bitmask to a true vbool16
796  static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }
797 
798  /// Set all components to false
799  void clear ();
800 
801  /// Return a vbool16 that is 'false' for all values
802  static const vbool16 False ();
803 
804  /// Return a vbool16 that is 'true' for all values
805  static const vbool16 True ();
806 
807  /// Assign one value to all components
808  const vbool16 & operator= (bool a);
809 
810  /// Assignment of another vbool16
811  const vbool16 & operator= (const vbool16 & other);
812 
813  /// Component access (get)
814  int operator[] (int i) const;
815 
816  /// Component access (set).
817  void setcomp (int i, bool value);
818 
819  /// Extract the lower precision vbool8
820  vbool8 lo () const;
821 
822  /// Extract the higher precision vbool8
823  vbool8 hi () const;
824 
825  /// Helper: load a single value into all components.
826  void load (bool a);
827 
828  /// Helper: load separate values into each component.
829  void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
830  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
831 
832  /// Helper: load all components from a bitmask in an int.
833  void load_bitmask (int a);
834 
835  /// Helper: store the values into memory as bools.
836  void store (bool *values) const;
837 
838  /// Store the first n values into memory.
839  void store (bool *values, int n) const;
840 
841  /// Logical/bitwise operators, component-by-component. Only vbool16 overloads are befriended here.
843  friend vbool16 operator! (const vbool16& a);
844  friend vbool16 operator& (const vbool16& a, const vbool16& b);
845  friend vbool16 operator| (const vbool16& a, const vbool16& b);
846  friend vbool16 operator^ (const vbool16& a, const vbool16& b);
847  friend vbool16 operator~ (const vbool16& a);
848  friend const vbool16& operator&= (vbool16& a, const vbool16& b);
849  friend const vbool16& operator|= (vbool16& a, const vbool16& b);
850  friend const vbool16& operator^= (vbool16& a, const vbool16& b);
851 
852  /// Comparison operators, component by component
853  friend vbool16 operator== (const vbool16& a, const vbool16& b);
854  friend vbool16 operator!= (const vbool16& a, const vbool16& b);
855 
856  /// Stream output
857  friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);
858 
859 private:
860  // The actual data representation: the native mask type and a 16-bit view share storage.
861  union {
862  simd_t m_simd; // restored: the constructors and simd() accessors of this class read/write m_simd, and listing line 862 was lost
863  uint16_t m_bits;
864  };
865 };
866 
867 
868 
869 /// Helper: as rapid as possible extraction of one component, when the
870 /// index is fixed.
871 template<int i> OIIO_FORCEINLINE bool extract (const vbool16& a);
872 
873 /// Helper: substitute val for a[i]
874 template<int i> OIIO_FORCEINLINE vbool16 insert (const vbool16& a, bool val);
875 
876 /// Logical reduction across all components.
877 bool reduce_and (const vbool16& v);
878 bool reduce_or (const vbool16& v);
879 
880 // Are all/any/no components true?
881 bool all (const vbool16& v);
882 bool any (const vbool16& v);
883 bool none (const vbool16& v);
884 
885 
886 
887 
888 
889 /// Integer 4-vector, accelerated by SIMD instructions when available.
890 class vint4 {
891 public:
892  static const char* type_name() { return "vint4"; }
893  typedef int value_t; ///< Underlying equivalent scalar value type
894  enum { elements = 4 }; ///< Number of scalar elements
895  enum { paddedelements =4 }; ///< Number of scalar elements for full pad
896  enum { bits = 128 }; ///< Total number of bits
897  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
898  typedef vbool4 vbool_t; ///< bool type of the same length
899  typedef vfloat4 vfloat_t; ///< float type of the same length
900  typedef vint4 vint_t; ///< int type of the same length
901  OIIO_DEPRECATED("use vbool_t (1.8)")
902  typedef vbool4 bool_t; // old name (deprecated 1.8)
903  OIIO_DEPRECATED("use vfloat_t (1.8)")
904  typedef vfloat4 float_t; // old name (deprecated 1.8)
905 
906  /// Default constructor (contents undefined)
907  vint4 () { }
908 
909  /// Construct from a single value (store it in all slots)
910  vint4 (int a);
911 
912  /// Construct from 2 values -- (a,a,b,b)
913  vint4 (int a, int b);
914 
915  /// Construct from 4 values
916  vint4 (int a, int b, int c, int d);
917 
918  /// Construct from a pointer to values
919  vint4 (const int *vals);
920 
921  /// Construct from a pointer to unsigned short values
922  explicit vint4 (const unsigned short *vals);
923 
924  /// Construct from a pointer to signed short values
925  explicit vint4 (const short *vals);
926 
927  /// Construct from a pointer to unsigned char values (0 - 255)
928  explicit vint4 (const unsigned char *vals);
929 
930  /// Construct from a pointer to signed char values (-128 - 127)
931  explicit vint4 (const char *vals);
932 
933  /// Copy construct from another vint4
934  vint4 (const vint4 & other) { m_simd = other.m_simd; }
935 
936  /// Convert a vfloat to an vint. Equivalent to i = (int)f;
937  explicit vint4 (const vfloat4& f); // implementation below
938 
939  /// Construct from the underlying SIMD type
940  vint4 (const simd_t& m) : m_simd(m) { }
941 
942  /// Return the raw SIMD type
943  operator simd_t () const { return m_simd; }
944  simd_t simd () const { return m_simd; }
945  simd_t& simd () { return m_simd; }
946 
947  /// Return a pointer to the underlying scalar type
948  const value_t* data () const { return (const value_t*)this; }
949  value_t* data () { return (value_t*)this; }
950 
951  /// Sset all components to 0
952  void clear () ;
953 
954  /// Return an vint4 with all components set to 0
955  static const vint4 Zero ();
956 
957  /// Return an vint4 with all components set to 1
958  static const vint4 One ();
959 
960  /// Return an vint4 with all components set to -1 (aka 0xffffffff)
961  static const vint4 NegOne ();
962 
963  /// Return an vint4 with incremented components (e.g., 0,1,2,3).
964  /// Optional arguments can give a non-zero starting point and step size.
965  static const vint4 Iota (int start=0, int step=1);
966 
967  /// Return an vint4 with "geometric" iota: (1, 2, 4, 8).
968  static const vint4 Giota ();
969 
970  /// Assign one value to all components.
971  const vint4 & operator= (int a);
972 
973  /// Assignment from another vint4
974  const vint4 & operator= (const vint4& other) ;
975 
976  /// Component access (get)
977  int operator[] (int i) const;
978 
979  /// Component access (set)
980  int& operator[] (int i);
981 
982  /// Component access (set).
983  void setcomp (int i, int value);
984 
985  value_t x () const;
986  value_t y () const;
987  value_t z () const;
988  value_t w () const;
989  void set_x (value_t val);
990  void set_y (value_t val);
991  void set_z (value_t val);
992  void set_w (value_t val);
993 
994  /// Helper: load a single int into all components
995  void load (int a);
996 
997  /// Helper: load separate values into each component.
998  void load (int a, int b, int c, int d);
999 
1000  /// Load from an array of 4 values
1001  void load (const int *values);
1002 
1003  void load (const int *values, int n) ;
1004 
1005  /// Load from an array of 4 unsigned short values, convert to vint4
1006  void load (const unsigned short *values) ;
1007 
1008  /// Load from an array of 4 unsigned short values, convert to vint4
1009  void load (const short *values);
1010 
1011  /// Load from an array of 4 unsigned char values, convert to vint4
1012  void load (const unsigned char *values);
1013 
1014  /// Load from an array of 4 unsigned char values, convert to vint4
1015  void load (const char *values);
1016 
1017  /// Store the values into memory
1018  void store (int *values) const;
1019 
1020  /// Store the first n values into memory
1021  void store (int *values, int n) const;
1022 
1023  /// Store the least significant 16 bits of each element into adjacent
1024  /// unsigned shorts.
1025  void store (unsigned short *values) const;
1026 
1027  /// Store the least significant 8 bits of each element into adjacent
1028  /// unsigned chars.
1029  void store (unsigned char *values) const;
1030 
1031  /// Masked load -- read from values[] where mask is 1, load zero where
1032  /// mask is 0.
1033  void load_mask (int mask, const value_t *values);
1034  void load_mask (const vbool_t& mask, const value_t *values);
1035 
1036  /// Masked store -- write to values[] where mask is enabled, don't
1037  /// touch values[] where it's not.
1038  void store_mask (int mask, value_t *values) const;
1039  void store_mask (const vbool_t& mask, value_t *values) const;
1040 
1041  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1042  template<int scale=4>
1043  void gather (const value_t *baseptr, const vint_t& vindex);
1044  /// Gather elements defined by the mask, leave others unchanged.
1045  template<int scale=4>
1046  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1047  template<int scale=4>
1048  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1049 
1050  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1051  template<int scale=4>
1052  void scatter (value_t *baseptr, const vint_t& vindex) const;
1053  /// Scatter elements defined by the mask
1054  template<int scale=4>
1055  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1056  template<int scale=4>
1057  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1058 
1059  // Arithmetic operators (component-by-component)
1060  friend vint4 operator+ (const vint4& a, const vint4& b);
1061  friend vint4 operator- (const vint4& a);
1062  friend vint4 operator- (const vint4& a, const vint4& b);
1063  friend vint4 operator* (const vint4& a, const vint4& b);
1064  friend vint4 operator/ (const vint4& a, const vint4& b);
1065  friend vint4 operator% (const vint4& a, const vint4& b);
1066  friend const vint4 & operator+= (vint4& a, const vint4& b);
1067  friend const vint4 & operator-= (vint4& a, const vint4& b);
1068  friend const vint4 & operator*= (vint4& a, const vint4& b);
1069  friend const vint4 & operator/= (vint4& a, const vint4& b);
1070  friend const vint4 & operator%= (vint4& a, const vint4& b);
1071  // Bitwise operators (component-by-component)
1072  friend vint4 operator& (const vint4& a, const vint4& b);
1073  friend vint4 operator| (const vint4& a, const vint4& b);
1074  friend vint4 operator^ (const vint4& a, const vint4& b);
1075  friend const vint4& operator&= (vint4& a, const vint4& b);
1076  friend const vint4& operator|= (vint4& a, const vint4& b);
1077  friend const vint4& operator^= (vint4& a, const vint4& b);
1078  friend vint4 operator~ (const vint4& a);
1079  friend vint4 operator<< (const vint4& a, unsigned int bits);
1080  friend vint4 operator>> (const vint4& a, unsigned int bits);
1081  friend const vint4& operator<<= (vint4& a, unsigned int bits);
1082  friend const vint4& operator>>= (vint4& a, unsigned int bits);
1083  // Comparison operators (component-by-component)
1084  friend vbool4 operator== (const vint4& a, const vint4& b);
1085  friend vbool4 operator!= (const vint4& a, const vint4& b);
1086  friend vbool4 operator< (const vint4& a, const vint4& b);
1087  friend vbool4 operator> (const vint4& a, const vint4& b);
1088  friend vbool4 operator>= (const vint4& a, const vint4& b);
1089  friend vbool4 operator<= (const vint4& a, const vint4& b);
1090 
1091  /// Stream output
1092  friend std::ostream& operator<< (std::ostream& cout, const vint4 & a);
1093 
1094 private:
1095  // The actual data representation
1096  union {
1099  };
1100 };
1101 
1102 
1103 
1104 // Shift right logical -- unsigned shift. This differs from operator>>
1105 // in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1106 // srl((1<<31),1) == 1<<30.
1107 vint4 srl (const vint4& val, const unsigned int bits);
1108 
1109 /// Helper: shuffle/swizzle with constant (templated) indices.
1110 /// Example: shuffle<1,1,2,2>(vint4(a,b,c,d)) returns (b,b,c,c)
1111 template<int i0, int i1, int i2, int i3>
1112 OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
1113 
1114 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1115 template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a);
1116 
1117 /// Helper: as rapid as possible extraction of one component, when the
1118 /// index is fixed.
1119 template<int i> OIIO_FORCEINLINE int extract (const vint4& v);
1120 
1121 /// The sum of all components, returned in all components.
1122 vint4 vreduce_add (const vint4& v);
1123 
1124 // Reduction across all components
1125 int reduce_add (const vint4& v);
1126 int reduce_and (const vint4& v);
1127 int reduce_or (const vint4& v);
1128 
1129 /// Use a bool mask to select between components of a (if mask[i] is false)
1130 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1131 vint4 blend (const vint4& a, const vint4& b, const vbool4& mask);
1132 
1133 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1134 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1135 /// blend(0,a,mask).
1136 vint4 blend0 (const vint4& a, const vbool4& mask);
1137 
1138 /// Use a bool mask to select between components of a (if mask[i] is false)
1139 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1140 /// blend(0,a,!mask), or blend(a,0,mask).
1141 vint4 blend0not (const vint4& a, const vbool4& mask);
1142 
1143 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1144 /// synonym for blend with arguments rearranged, but this is more clear
1145 /// because the arguments are symmetric to scalar (cond ? a : b).
1146 vint4 select (const vbool4& mask, const vint4& a, const vint4& b);
1147 
1148 // Per-element math
1149 vint4 abs (const vint4& a);
1150 vint4 min (const vint4& a, const vint4& b);
1151 vint4 max (const vint4& a, const vint4& b);
1152 
1153 /// Circular bit rotate by s bits, for N values at once.
1154 vint4 rotl (const vint4& x, const int s);
1155 // DEPRECATED(2.1)
1156 vint4 rotl32 (const vint4& x, const unsigned int k);
1157 
1158 /// andnot(a,b) returns ((~a) & b)
1159 vint4 andnot (const vint4& a, const vint4& b);
1160 
1161 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1162 vint4 bitcast_to_int (const vbool4& x);
1163 vint4 bitcast_to_int (const vfloat4& x);
1164 vfloat4 bitcast_to_float (const vint4& x);
1165 
1166 void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d);
1167 void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
1168  vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1169 
1170 vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d);
1171 
1172 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1173 vint4 safe_mod (const vint4& a, const vint4& b);
1174 vint4 safe_mod (const vint4& a, int b);
1175 
1176 
1177 
1178 
1179 /// Integer 8-vector, accelerated by SIMD instructions when available.
1180 class vint8 {
1181 public:
1182  static const char* type_name() { return "vint8"; }
1183  typedef int value_t; ///< Underlying equivalent scalar value type
1184  enum { elements = 8 }; ///< Number of scalar elements
1185  enum { paddedelements =8 }; ///< Number of scalar elements for full pad
1186  enum { bits = elements*32 }; ///< Total number of bits
1187  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1188  typedef vbool8 vbool_t; ///< bool type of the same length
1189  typedef vfloat8 vfloat_t; ///< float type of the same length
1190  typedef vint8 vint_t; ///< int type of the same length
1191  OIIO_DEPRECATED("use vbool_t (1.8)")
1192  typedef vbool8 bool_t; // old name (deprecated 1.8)
1193  OIIO_DEPRECATED("use vfloat_t (1.8)")
1194  typedef vfloat8 float_t; // old name (deprecated 1.8)
1195 
1196  /// Default constructor (contents undefined)
1197  vint8 () { }
1198 
1199  /// Construct from a single value (store it in all slots)
1200  vint8 (int a);
1201 
1202  /// Construct from 2 values -- (a,a,b,b)
1203  vint8 (int a, int b);
1204 
1205  /// Construct from 8 values (won't work for vint8)
1206  vint8 (int a, int b, int c, int d, int e, int f, int g, int h);
1207 
1208  /// Construct from a pointer to values
1209  vint8 (const int *vals);
1210 
1211  /// Construct from a pointer to unsigned short values
1212  explicit vint8 (const unsigned short *vals);
1213 
1214  /// Construct from a pointer to signed short values
1215  explicit vint8 (const short *vals);
1216 
1217  /// Construct from a pointer to unsigned char values (0 - 255)
1218  explicit vint8 (const unsigned char *vals);
1219 
1220  /// Construct from a pointer to signed char values (-128 - 127)
1221  explicit vint8 (const char *vals);
1222 
1223  /// Copy construct from another vint8
1224  vint8 (const vint8 & other) { m_simd = other.m_simd; }
1225 
1226  /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
1227  explicit vint8 (const vfloat8& f); // implementation below
1228 
1229  /// Construct from two vint4's
1230  vint8 (const vint4 &lo, const vint4 &hi);
1231 
1232  /// Construct from the underlying SIMD type
1233  vint8 (const simd_t& m) : m_simd(m) { }
1234 
1235  /// Return the raw SIMD type
1236  operator simd_t () const { return m_simd; }
1237  simd_t simd () const { return m_simd; }
1238  simd_t& simd () { return m_simd; }
1239 
1240  /// Return a pointer to the underlying scalar type
1241  const value_t* data () const { return (const value_t*)this; }
1242  value_t* data () { return (value_t*)this; }
1243 
1244  /// Sset all components to 0
1245  void clear () ;
1246 
1247  /// Return an vint8 with all components set to 0
1248  static const vint8 Zero ();
1249 
1250  /// Return an vint8 with all components set to 1
1251  static const vint8 One ();
1252 
1253  /// Return an vint8 with all components set to -1 (aka 0xffffffff)
1254  static const vint8 NegOne ();
1255 
1256  /// Return an vint8 with incremented components (e.g., 0,1,2,3).
1257  /// Optional arguments can give a non-zero starting point and step size.
1258  static const vint8 Iota (int start=0, int step=1);
1259 
1260  /// Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
1261  static const vint8 Giota ();
1262 
1263  /// Assign one value to all components.
1264  const vint8 & operator= (int a);
1265 
1266  /// Assignment from another vint8
1267  const vint8 & operator= (const vint8& other) ;
1268 
1269  /// Component access (get)
1270  int operator[] (int i) const;
1271 
1272  /// Component access (set)
1273  int& operator[] (int i);
1274 
1275  /// Component access (set).
1276  void setcomp (int i, int value);
1277 
1278  value_t x () const;
1279  value_t y () const;
1280  value_t z () const;
1281  value_t w () const;
1282  void set_x (value_t val);
1283  void set_y (value_t val);
1284  void set_z (value_t val);
1285  void set_w (value_t val);
1286 
1287  /// Extract the lower precision vint4
1288  vint4 lo () const;
1289 
1290  /// Extract the higher precision vint4
1291  vint4 hi () const;
1292 
1293  /// Helper: load a single int into all components
1294  void load (int a);
1295 
1296  /// Load separate values into each component.
1297  void load (int a, int b, int c, int d, int e, int f, int g, int h);
1298 
1299  /// Load from an array of 8 values
1300  void load (const int *values);
1301 
1302  void load (const int *values, int n) ;
1303 
1304  /// Load from an array of 8 unsigned short values, convert to vint8
1305  void load (const unsigned short *values) ;
1306 
1307  /// Load from an array of 8 unsigned short values, convert to vint8
1308  void load (const short *values);
1309 
1310  /// Load from an array of 8 unsigned char values, convert to vint8
1311  void load (const unsigned char *values);
1312 
1313  /// Load from an array of 8 unsigned char values, convert to vint8
1314  void load (const char *values);
1315 
1316  /// Store the values into memory
1317  void store (int *values) const;
1318 
1319  /// Store the first n values into memory
1320  void store (int *values, int n) const;
1321 
1322  /// Store the least significant 16 bits of each element into adjacent
1323  /// unsigned shorts.
1324  void store (unsigned short *values) const;
1325 
1326  /// Store the least significant 8 bits of each element into adjacent
1327  /// unsigned chars.
1328  void store (unsigned char *values) const;
1329 
1330  /// Masked load -- read from values[] where mask is 1, load zero where
1331  /// mask is 0.
1332  void load_mask (int mask, const value_t *values);
1333  void load_mask (const vbool_t& mask, const value_t *values);
1334 
1335  /// Masked store -- write to values[] where mask is enabled, don't
1336  /// touch values[] where it's not.
1337  void store_mask (int mask, value_t *values) const;
1338  void store_mask (const vbool_t& mask, value_t *values) const;
1339 
1340  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1341  template<int scale=4>
1342  void gather (const value_t *baseptr, const vint_t& vindex);
1343  /// Gather elements defined by the mask, leave others unchanged.
1344  template<int scale=4>
1345  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1346  template<int scale=4>
1347  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1348 
1349  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1350  template<int scale=4>
1351  void scatter (value_t *baseptr, const vint_t& vindex) const;
1352  /// Scatter elements defined by the mask
1353  template<int scale=4>
1354  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1355  template<int scale=4>
1356  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1357 
1358  // Arithmetic operators (component-by-component)
1359  friend vint8 operator+ (const vint8& a, const vint8& b);
1360  friend vint8 operator- (const vint8& a);
1361  friend vint8 operator- (const vint8& a, const vint8& b);
1362  friend vint8 operator* (const vint8& a, const vint8& b);
1363  friend vint8 operator/ (const vint8& a, const vint8& b);
1364  friend vint8 operator% (const vint8& a, const vint8& b);
1365  friend const vint8 & operator+= (vint8& a, const vint8& b);
1366  friend const vint8 & operator-= (vint8& a, const vint8& b);
1367  friend const vint8 & operator*= (vint8& a, const vint8& b);
1368  friend const vint8 & operator/= (vint8& a, const vint8& b);
1369  friend const vint8 & operator%= (vint8& a, const vint8& b);
1370  // Bitwise operators (component-by-component)
1371  friend vint8 operator& (const vint8& a, const vint8& b);
1372  friend vint8 operator| (const vint8& a, const vint8& b);
1373  friend vint8 operator^ (const vint8& a, const vint8& b);
1374  friend const vint8& operator&= (vint8& a, const vint8& b);
1375  friend const vint8& operator|= (vint8& a, const vint8& b);
1376  friend const vint8& operator^= (vint8& a, const vint8& b);
1377  friend vint8 operator~ (const vint8& a);
1378  friend vint8 operator<< (const vint8& a, unsigned int bits);
1379  friend vint8 operator>> (const vint8& a, unsigned int bits);
1380  friend const vint8& operator<<= (vint8& a, unsigned int bits);
1381  friend const vint8& operator>>= (vint8& a, unsigned int bits);
1382  // Comparison operators (component-by-component)
1383  friend vbool8 operator== (const vint8& a, const vint8& b);
1384  friend vbool8 operator!= (const vint8& a, const vint8& b);
1385  friend vbool8 operator< (const vint8& a, const vint8& b);
1386  friend vbool8 operator> (const vint8& a, const vint8& b);
1387  friend vbool8 operator>= (const vint8& a, const vint8& b);
1388  friend vbool8 operator<= (const vint8& a, const vint8& b);
1389 
1390  /// Stream output
1391  friend std::ostream& operator<< (std::ostream& cout, const vint8& a);
1392 
1393 private:
1394  // The actual data representation
1395  union {
1399  };
1400 };
1401 
1402 
1403 
1404 // Shift right logical -- unsigned shift. This differs from operator>>
1405 // in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1406 // srl((1<<31),1) == 1<<30.
1407 vint8 srl (const vint8& val, const unsigned int bits);
1408 
1409 /// Helper: shuffle/swizzle with constant (templated) indices.
1410 /// Example: shuffle<1,1,2,2,5,5,6,6>(vint8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
1411 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
1412 OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
1413 
1414 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
1415 template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a);
1416 
1417 /// Helper: as rapid as possible extraction of one component, when the
1418 /// index is fixed.
1419 template<int i> OIIO_FORCEINLINE int extract (const vint8& v);
1420 
1421 /// Helper: substitute val for a[i]
1422 template<int i> OIIO_FORCEINLINE vint8 insert (const vint8& a, int val);
1423 
1424 /// The sum of all components, returned in all components.
1425 vint8 vreduce_add (const vint8& v);
1426 
1427 // Reduction across all components
1428 int reduce_add (const vint8& v);
1429 int reduce_and (const vint8& v);
1430 int reduce_or (const vint8& v);
1431 
1432 /// Use a bool mask to select between components of a (if mask[i] is false)
1433 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1434 vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);
1435 
1436 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1437 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1438 /// blend(0,a,mask).
1439 vint8 blend0 (const vint8& a, const vbool8& mask);
1440 
1441 /// Use a bool mask to select between components of a (if mask[i] is false)
1442 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1443 /// blend(0,a,!mask), or blend(a,0,mask).
1444 vint8 blend0not (const vint8& a, const vbool8& mask);
1445 
1446 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1447 /// synonym for blend with arguments rearranged, but this is more clear
1448 /// because the arguments are symmetric to scalar (cond ? a : b).
1449 vint8 select (const vbool8& mask, const vint8& a, const vint8& b);
1450 
1451 // Per-element math
1452 vint8 abs (const vint8& a);
1453 vint8 min (const vint8& a, const vint8& b);
1454 vint8 max (const vint8& a, const vint8& b);
1455 
1456 /// Circular bit rotate by s bits, for N values at once.
1457 vint8 rotl (const vint8& x, const int s);
1458 // DEPRECATED(2.1)
1459 vint8 rotl32 (const vint8& x, const unsigned int k);
1460 
1461 /// andnot(a,b) returns ((~a) & b)
1462 vint8 andnot (const vint8& a, const vint8& b);
1463 
1464 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1465 vint8 bitcast_to_int (const vbool8& x);
1466 vint8 bitcast_to_int (const vfloat8& x);
1467 vfloat8 bitcast_to_float (const vint8& x);
1468 
1469 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1470 vint8 safe_mod (const vint8& a, const vint8& b);
1471 vint8 safe_mod (const vint8& a, int b);
1472 
1473 
1474 
1475 
1476 
1477 /// Integer 16-vector, accelerated by SIMD instructions when available.
1478 class vint16 {
1479 public:
1480  static const char* type_name() { return "vint16"; }
1481  typedef int value_t; ///< Underlying equivalent scalar value type
1482  enum { elements = 16 }; ///< Number of scalar elements
1483  enum { paddedelements =16 }; ///< Number of scalar elements for full pad
1484  enum { bits = elements*32 }; ///< Total number of bits (16 lanes x 32 bits = 512)
1485  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1486  typedef vbool16 vbool_t; ///< bool type of the same length
1487  typedef vfloat16 vfloat_t; ///< float type of the same length
1488  typedef vint16 vint_t; ///< int type of the same length
1489  OIIO_DEPRECATED("use vbool_t (1.8)")
1490  typedef vbool16 bool_t; // old name (deprecated 1.8)
1491  OIIO_DEPRECATED("use vfloat_t (1.8)")
1492  typedef vfloat16 float_t; // old name (deprecated 1.8)
1493 
1494  /// Default constructor (contents undefined)
1495  vint16 () { }
1496 
1497  /// Construct from a single value (store it in all slots)
1498  vint16 (int a);
1499 
1500  /// Construct from 16 values
1501  vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1502  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1503 
1504  /// Construct from a pointer to values
1505  vint16 (const int *vals);
1506 
1507  /// Construct from a pointer to unsigned short values
1508  explicit vint16 (const unsigned short *vals);
1509 
1510  /// Construct from a pointer to signed short values
1511  explicit vint16 (const short *vals);
1512 
1513  /// Construct from a pointer to unsigned char values (0 - 255)
1514  explicit vint16 (const unsigned char *vals);
1515 
1516  /// Construct from a pointer to signed char values (-128 - 127)
1517  explicit vint16 (const char *vals);
1518 
1519  /// Copy construct from another vint16
1520  vint16 (const vint16 & other) { m_simd = other.m_simd; }
1521 
1522  /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
1523  explicit vint16 (const vfloat16& f); // implementation below
1524 
1525  /// Construct from two vint8's
1526  vint16 (const vint8 &lo, const vint8 &hi);
1527 
1528  /// Construct from four vint4's
1529  vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d);
1530 
1531  /// Construct from the underlying SIMD type
1532  vint16 (const simd_t& m) : m_simd(m) { }
1533 
1534  /// Return the raw SIMD type
1535  operator simd_t () const { return m_simd; }
1536  simd_t simd () const { return m_simd; }
1537  simd_t& simd () { return m_simd; }
1538 
1539  /// Return a pointer to the underlying scalar type
1540  const value_t* data () const { return (const value_t*)this; }
1541  value_t* data () { return (value_t*)this; }
1542 
1543  /// Set all components to 0
1544  void clear () ;
1545 
1546  /// Return an vint16 with all components set to 0
1547  static const vint16 Zero ();
1548 
1549  /// Return an vint16 with all components set to 1
1550  static const vint16 One ();
1551 
1552  /// Return an vint16 with all components set to -1 (aka 0xffffffff)
1553  static const vint16 NegOne ();
1554 
1555  /// Return an vint16 with incremented components (e.g., 0,1,2,3).
1556  /// Optional arguments can give a non-zero starting point and step size.
1557  static const vint16 Iota (int start=0, int step=1);
1558 
1559  /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
1560  static const vint16 Giota ();
1561 
1562  /// Assign one value to all components.
1563  const vint16 & operator= (int a);
1564 
1565  /// Assignment from another vint16
1566  const vint16 & operator= (const vint16& other) ;
1567 
1568  /// Component access (get)
1569  int operator[] (int i) const;
1570 
1571  /// Component access (set)
1572  int& operator[] (int i);
1573 
1574  /// Component access (set).
1575  void setcomp (int i, int value);
1576 
1577  value_t x () const;
1578  value_t y () const;
1579  value_t z () const;
1580  value_t w () const;
1581  void set_x (value_t val);
1582  void set_y (value_t val);
1583  void set_z (value_t val);
1584  void set_w (value_t val);
1585 
1586  /// Extract the lower precision vint8
1587  vint8 lo () const;
1588 
1589  /// Extract the higher precision vint8
1590  vint8 hi () const;
1591 
1592  /// Helper: load a single int into all components
1593  void load (int a);
1594 
1595  /// Load separate values into each component.
1596  void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1597  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1598 
1599  /// Load from an array of 16 values
1600  void load (const int *values);
1601 
1602  void load (const int *values, int n) ;
1603 
1604  /// Load from an array of 16 unsigned short values, convert to vint16
1605  void load (const unsigned short *values) ;
1606 
1607  /// Load from an array of 16 signed short values, convert to vint16
1608  void load (const short *values);
1609 
1610  /// Load from an array of 16 unsigned char values, convert to vint16
1611  void load (const unsigned char *values);
1612 
1613  /// Load from an array of 16 char values, convert to vint16
1614  void load (const char *values);
1615 
1616  /// Store the values into memory
1617  void store (int *values) const;
1618 
1619  /// Store the first n values into memory
1620  void store (int *values, int n) const;
1621 
1622  /// Store the least significant 16 bits of each element into adjacent
1623  /// unsigned shorts.
1624  void store (unsigned short *values) const;
1625 
1626  /// Store the least significant 8 bits of each element into adjacent
1627  /// unsigned chars.
1628  void store (unsigned char *values) const;
1629 
1630  /// Masked load -- read from values[] where mask is 1, load zero where
1631  /// mask is 0.
1632  void load_mask (const vbool_t &mask, const value_t *values);
1633  void load_mask (int mask, const value_t *values) { const vbool_t lanes(mask); load_mask(lanes, values); } // int-mask overload: builds a vbool_t from mask and forwards
1634 
1635  /// Masked store -- write to values[] where mask is enabled, don't
1636  /// touch values[] where it's not.
1637  void store_mask (const vbool_t &mask, value_t *values) const;
1638  void store_mask (int mask, value_t *values) const { const vbool_t lanes(mask); store_mask(lanes, values); } // int-mask overload: builds a vbool_t from mask and forwards
1639 
1640  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1641  template<int scale=4>
1642  void gather (const value_t *baseptr, const vint_t& vindex);
1643  /// Gather elements defined by the mask, leave others unchanged.
1644  template<int scale=4>
1645  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1646  template<int scale=4>
1647  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
1648  const vbool_t lanes(mask); gather_mask<scale> (lanes, baseptr, vindex); // int mask -> vbool_t, then the vbool_t overload does the work
1649  }
1650 
1651  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1652  template<int scale=4>
1653  void scatter (value_t *baseptr, const vint_t& vindex) const;
1654  /// Scatter elements defined by the mask
1655  template<int scale=4>
1656  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1657  template<int scale=4>
1658  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
1659  const vbool_t lanes(mask); scatter_mask<scale> (lanes, baseptr, vindex); // int mask -> vbool_t, then the vbool_t overload does the work
1660  }
1661 
1662  // Arithmetic operators (component-by-component)
1663  friend vint16 operator+ (const vint16& a, const vint16& b);
1664  friend vint16 operator- (const vint16& a);
1665  friend vint16 operator- (const vint16& a, const vint16& b);
1666  friend vint16 operator* (const vint16& a, const vint16& b);
1667  friend vint16 operator/ (const vint16& a, const vint16& b);
1668  friend vint16 operator% (const vint16& a, const vint16& b);
1669  friend const vint16 & operator+= (vint16& a, const vint16& b);
1670  friend const vint16 & operator-= (vint16& a, const vint16& b);
1671  friend const vint16 & operator*= (vint16& a, const vint16& b);
1672  friend const vint16 & operator/= (vint16& a, const vint16& b);
1673  friend const vint16 & operator%= (vint16& a, const vint16& b);
1674  // Bitwise operators (component-by-component)
1675  friend vint16 operator& (const vint16& a, const vint16& b);
1676  friend vint16 operator| (const vint16& a, const vint16& b);
1677  friend vint16 operator^ (const vint16& a, const vint16& b);
1678  friend const vint16& operator&= (vint16& a, const vint16& b);
1679  friend const vint16& operator|= (vint16& a, const vint16& b);
1680  friend const vint16& operator^= (vint16& a, const vint16& b);
1681  friend vint16 operator~ (const vint16& a);
1682  friend vint16 operator<< (const vint16& a, unsigned int bits);
1683  friend vint16 operator>> (const vint16& a, unsigned int bits);
1684  friend const vint16& operator<<= (vint16& a, unsigned int bits);
1685  friend const vint16& operator>>= (vint16& a, unsigned int bits);
1686  // Comparison operators (component-by-component)
1687  friend vbool16 operator== (const vint16& a, const vint16& b);
1688  friend vbool16 operator!= (const vint16& a, const vint16& b);
1689  friend vbool16 operator< (const vint16& a, const vint16& b);
1690  friend vbool16 operator> (const vint16& a, const vint16& b);
1691  friend vbool16 operator>= (const vint16& a, const vint16& b);
1692  friend vbool16 operator<= (const vint16& a, const vint16& b);
1693 
1694  /// Stream output
1695  friend std::ostream& operator<< (std::ostream& cout, const vint16& a);
1696 
1697 private:
1698  // The actual data representation
1699  union {
1703  };
1704 };
1705 
1706 
1707 
1708 /// Shift right logical -- unsigned shift. This differs from operator>>
1709 /// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1710 /// srl((1<<31),1) == 1<<30.
1711 vint16 srl (const vint16& val, const unsigned int bits);
1712 
1713 /// Shuffle groups of 4
1714 template<int i0, int i1, int i2, int i3>
1715 vint16 shuffle4 (const vint16& a);
1716 
1717 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
1718 template<int i> vint16 shuffle4 (const vint16& a);
1719 
1720 /// Shuffle within each group of 4
1721 template<int i0, int i1, int i2, int i3>
1722 vint16 shuffle (const vint16& a);
1723 
1724 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1725 template<int i> vint16 shuffle (const vint16& a);
1726 
1727 /// Helper: as rapid as possible extraction of one component, when the
1728 /// index is fixed.
1729 template<int i> OIIO_FORCEINLINE int extract (const vint16& v);
1730 
1731 /// Helper: substitute val for a[i]
1732 template<int i> OIIO_FORCEINLINE vint16 insert (const vint16& a, int val);
1733 
1734 /// The sum of all components, returned in all components.
1735 vint16 vreduce_add (const vint16& v);
1736 
1737 // Reduction across all components
1738 int reduce_add (const vint16& v);
1739 int reduce_and (const vint16& v);
1740 int reduce_or (const vint16& v);
1741 
1742 /// Use a bool mask to select between components of a (if mask[i] is false)
1743 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1744 vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);
1745 
1746 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1747 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1748 /// blend(0,a,mask).
1749 vint16 blend0 (const vint16& a, const vbool16& mask);
1750 
1751 /// Use a bool mask to select between components of a (if mask[i] is false)
1752 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1753 /// blend(0,a,!mask), or blend(a,0,mask).
1754 vint16 blend0not (const vint16& a, const vbool16& mask);
1755 
1756 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1757 /// synonym for blend with arguments rearranged, but this is more clear
1758 /// because the arguments are symmetric to scalar (cond ? a : b).
1759 vint16 select (const vbool16& mask, const vint16& a, const vint16& b);
1760 
1761 // Per-element math
1762 vint16 abs (const vint16& a);
1763 vint16 min (const vint16& a, const vint16& b);
1764 vint16 max (const vint16& a, const vint16& b);
1765 
1766 /// Circular bit rotate by s bits, for N values at once.
1767 vint16 rotl (const vint16& x, const int s);
1768 // DEPRECATED(2.1)
1769 vint16 rotl32 (const vint16& x, const unsigned int k);
1770 
1771 /// andnot(a,b) returns ((~a) & b)
1772 vint16 andnot (const vint16& a, const vint16& b);
1773 
1774 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1775 vint16 bitcast_to_int (const vbool16& x);
1776 vint16 bitcast_to_int (const vfloat16& x);
1777 vfloat16 bitcast_to_float (const vint16& x);
1778 
1779 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1780 vint16 safe_mod (const vint16& a, const vint16& b);
1781 vint16 safe_mod (const vint16& a, int b);
1782 
1783 
1784 
1785 
1786 
1787 /// Floating point 4-vector, accelerated by SIMD instructions when
1788 /// available.
1789 class vfloat4 {
1790 public:
1791  static const char* type_name() { return "vfloat4"; }
1792  typedef float value_t; ///< Underlying equivalent scalar value type
1793  enum { elements = 4 }; ///< Number of scalar elements
1794  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
1795  enum { bits = elements*32 }; ///< Total number of bits
1796  typedef simd_raw_t<float,4>::type simd_t; ///< the native SIMD type used
1797  typedef vfloat4 vfloat_t; ///< SIMD int type
1798  typedef vint4 vint_t; ///< SIMD int type
1799  typedef vbool4 vbool_t; ///< SIMD bool type
1800  OIIO_DEPRECATED("use vbool_t (1.8)")
1801  typedef vint4 int_t; // old name (deprecated 1.8)
1802  OIIO_DEPRECATED("use vfloat_t (1.8)")
1803  typedef vbool4 bool_t; // old name (deprecated 1.8)
1804 
1805  /// Default constructor (contents undefined)
1806  vfloat4 () { }
1807 
1808  /// Construct from a single value (store it in all slots)
1809  vfloat4 (float a) { load(a); }
1810 
1811  /// Construct from 3 or 4 values
1812  vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }
1813 
1814  /// Construct from a pointer to 4 values
1815  vfloat4 (const float *f) { load (f); }
1816 
1817  /// Copy construct from another vfloat4
1818  vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
1819 
1820  /// Construct from an vint4 (promoting all components to float)
1821  explicit vfloat4 (const vint4& ival);
1822 
1823  /// Construct from the underlying SIMD type
1824  vfloat4 (const simd_t& m) : m_simd(m) { }
1825 
1826  /// Return the raw SIMD type
1827  operator simd_t () const { return m_simd; }
1828  simd_t simd () const { return m_simd; }
1829  simd_t& simd () { return m_simd; }
1830 
1831  /// Return a pointer to the underlying scalar type
1832  const value_t* data () const { return (const value_t*)this; }
1833  value_t* data () { return (value_t*)this; }
1834 
1835  /// Construct from a Imath::V3f
1836  explicit vfloat4 (const Imath::V3f &v) { load (v[0], v[1], v[2]); }
1837 
1838  /// Cast to a Imath::V3f
1839  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
1840 
1841  /// Construct from a Imath::V4f
1842  explicit vfloat4 (const Imath::V4f &v) { load ((const float *)&v); }
1843 
1844  /// Cast to a Imath::V4f
1845  const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }
1846 
1847  /// Construct from a pointer to 4 unsigned short values
1848  explicit vfloat4 (const unsigned short *vals) { load(vals); }
1849 
1850  /// Construct from a pointer to 4 short values
1851  explicit vfloat4 (const short *vals) { load(vals); }
1852 
1853  /// Construct from a pointer to 4 unsigned char values
1854  explicit vfloat4 (const unsigned char *vals) { load(vals); }
1855 
1856  /// Construct from a pointer to 4 char values
1857  explicit vfloat4 (const char *vals) { load(vals); }
1858 
1859 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1860  /// Construct from a pointer to 4 half (16 bit float) values
1861  explicit vfloat4 (const half *vals) { load(vals); }
1862 #endif
1863 
1864  /// Assign a single value to all components
1865  const vfloat4 & operator= (float a) { load(a); return *this; }
1866 
1867  /// Assign a vfloat4
1868  const vfloat4 & operator= (vfloat4 other) {
1869  m_simd = other.m_simd;
1870  return *this;
1871  }
1872 
1873  /// Return a vfloat4 with all components set to 0.0
1874  static const vfloat4 Zero ();
1875 
1876  /// Return a vfloat4 with all components set to 1.0
1877  static const vfloat4 One ();
1878 
1879  /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
1880  /// Optional argument can give a non-zero starting point and non-1 step.
1881  static const vfloat4 Iota (float start=0.0f, float step=1.0f);
1882 
1883  /// Set all components to 0.0
1884  void clear ();
1885 
1886  /// Assign from a Imath::V4f
1887  const vfloat4 & operator= (const Imath::V4f &v);
1888 
1889  /// Assign from a Imath::V3f
1890  const vfloat4 & operator= (const Imath::V3f &v);
1891 
1892  /// Component access (get)
1893  float operator[] (int i) const;
1894  /// Component access (set)
1895  float& operator[] (int i);
1896 
1897  /// Component access (set).
1898  void setcomp (int i, float value);
1899 
1900  value_t x () const;
1901  value_t y () const;
1902  value_t z () const;
1903  value_t w () const;
1904  void set_x (value_t val);
1905  void set_y (value_t val);
1906  void set_z (value_t val);
1907  void set_w (value_t val);
1908 
1909  /// Helper: load a single value into all components
1910  void load (float val);
1911 
1912  /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
1913  void load (float a, float b, float c, float d=0.0f);
1914 
1915  /// Load from an array of 4 values
1916  void load (const float *values);
1917 
1918  /// Load from a partial array of <=4 values. Unassigned values are
1919  /// undefined.
1920  void load (const float *values, int n);
1921 
1922  /// Load from an array of 4 unsigned short values, convert to float
1923  void load (const unsigned short *values);
1924 
1925  /// Load from an array of 4 short values, convert to float
1926  void load (const short *values);
1927 
1928  /// Load from an array of 4 unsigned char values, convert to float
1929  void load (const unsigned char *values);
1930 
1931  /// Load from an array of 4 char values, convert to float
1932  void load (const char *values);
1933 
1934 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1935  /// Load from an array of 4 half values, convert to float
1936  void load (const half *values);
1937 #endif /* _HALF_H_ or _IMATH_H_ */
1938 
1939  void store (float *values) const;
1940 
1941  /// Store the first n values into memory
1942  void store (float *values, int n) const;
1943 
1944 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1945  void store (half *values) const;
1946 #endif
1947 
1948  /// Masked load -- read from values[] where mask is 1, load zero where
1949  /// mask is 0.
1950  void load_mask (int mask, const value_t *values);
1951  void load_mask (const vbool_t& mask, const value_t *values);
1952 
1953  /// Masked store -- write to values[] where mask is enabled, don't
1954  /// touch values[] where it's not.
1955  void store_mask (int mask, value_t *values) const;
1956  void store_mask (const vbool_t& mask, value_t *values) const;
1957 
1958  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1959  template<int scale=4>
1960  void gather (const value_t *baseptr, const vint_t& vindex);
1961  /// Gather elements defined by the mask, leave others unchanged.
1962  template<int scale=4>
1963  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1964  template<int scale=4>
1965  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1966 
1967  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1968  template<int scale=4>
1969  void scatter (value_t *baseptr, const vint_t& vindex) const;
1970  /// Scatter elements defined by the mask
1971  template<int scale=4>
1972  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1973  template<int scale=4>
1974  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1975 
1976  // Arithmetic operators
1977  friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
1978  const vfloat4 & operator+= (const vfloat4& a);
1979  vfloat4 operator- () const;
1980  friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
1981  const vfloat4 & operator-= (const vfloat4& a);
1982  friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
1983  friend vfloat4 operator* (const vfloat4& a, float b);
1984  friend vfloat4 operator* (float a, const vfloat4& b);
1985  const vfloat4 & operator*= (const vfloat4& a);
1986  const vfloat4 & operator*= (float val);
1987  friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
1988  const vfloat4 & operator/= (const vfloat4& a);
1989  const vfloat4 & operator/= (float val);
1990 
1991  // Comparison operations
1992  friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
1993  friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
1994  friend vbool4 operator< (const vfloat4& a, const vfloat4& b);
1995  friend vbool4 operator> (const vfloat4& a, const vfloat4& b);
1996  friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
1997  friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);
1998 
1999  // Some oddball items that are handy
2000 
2001  /// Combine the first two components of A with the first two components
2002  /// of B.
2003  friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);
2004 
2005  /// Combine the first two components of A with the first two components
2006  /// of B, but interleaved.
2007  friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);
2008 
2009  /// Return xyz components, plus 0 for w
2010  vfloat4 xyz0 () const;
2011 
2012  /// Return xyz components, plus 1 for w
2013  vfloat4 xyz1 () const;
2014 
2015  /// Stream output
2016  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);
2017 
2018 protected:
2019  // The actual data representation
2020  union {
2023  };
2024 };
2025 
2026 
2027 /// Helper: shuffle/swizzle with constant (templated) indices.
2028 /// Example: shuffle<1,1,2,2>(vfloat4(a,b,c,d)) returns (b,b,c,c)
2029 template<int i0, int i1, int i2, int i3>
2030 OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2031 
2032 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2033 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2034 
2035 /// Helper: as rapid as possible extraction of one component, when the
2036 /// index is fixed.
2037 template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
2038 
2039 /// Helper: substitute val for a[i]
2040 template<int i> OIIO_FORCEINLINE vfloat4 insert (const vfloat4& a, float val);
2041 
2042 /// The sum of all components, returned in all components.
2043 vfloat4 vreduce_add (const vfloat4& v);
2044 
2045 /// The sum of all components, returned as a scalar.
2046 float reduce_add (const vfloat4& v);
2047 
2048 /// Return the float dot (inner) product of a and b in every component.
2049 vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);
2050 
2051 /// Return the float dot (inner) product of a and b.
2052 float dot (const vfloat4 &a, const vfloat4 &b);
2053 
2054 /// Return the float 3-component dot (inner) product of a and b in
2055 /// all components.
2056 vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);
2057 
2058 /// Return the float 3-component dot (inner) product of a and b.
2059 float dot3 (const vfloat4 &a, const vfloat4 &b);
2060 
2061 /// Use a bool mask to select between components of a (if mask[i] is false)
2062 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2063 vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);
2064 
2065 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
2066 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2067 /// blend(0,a,mask).
2068 vfloat4 blend0 (const vfloat4& a, const vbool4& mask);
2069 
2070 /// Use a bool mask to select between components of a (if mask[i] is false)
2071 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2072 /// blend(0,a,!mask), or blend(a,0,mask).
2073 vfloat4 blend0not (const vfloat4& a, const vbool4& mask);
2074 
2075 /// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
2076 /// that is 0, return 0 rather than Inf.
2077 vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);
2078 
2079 /// Homogeneous divide to turn a vfloat4 into a vfloat3.
2080 vfloat3 hdiv (const vfloat4 &a);
2081 
2082 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2083 /// synonym for blend with arguments rearranged, but this is more clear
2084 /// because the arguments are symmetric to scalar (cond ? a : b).
2085 vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);
2086 
2087 // Per-element math
2088 vfloat4 abs (const vfloat4& a); ///< absolute value (float)
2089 vfloat4 sign (const vfloat4& a); ///< 1.0 when value >= 0, -1 when negative
2090 vfloat4 ceil (const vfloat4& a);
2091 vfloat4 floor (const vfloat4& a);
2092 vint4 ifloor (const vfloat4& a); ///< (int)floor
2093 OIIO_DEPRECATED("use ifloor (1.8)")
2094 inline vint4 floori (const vfloat4& a) { vint4 floored = ifloor(a); return floored; } // DEPRECATED(1.8): old spelling, forwards to ifloor
2095 
2096 /// Per-element round to nearest integer.
2097 /// CAVEAT: the rounding when mid-way between integers may differ depending
2098 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
2099 /// integer) but std::round() says to round away from 0 regardless of
2100 /// current rounding mode (but that is multiple instructions on x64).
2101 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2102 /// match std::round().
2103 vfloat4 round (const vfloat4& a);
2104 
2105 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2106 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2107 /// C++ std::rint() which says to use the current rounding mode.
2108 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2109 /// match std::rint().
2110 vint4 rint (const vfloat4& a);
2111 
2112 vfloat4 rcp_fast (const vfloat4 &a); ///< Fast, approximate 1/a
2113 vfloat4 sqrt (const vfloat4 &a);
2114 vfloat4 rsqrt (const vfloat4 &a); ///< Fully accurate 1/sqrt
2115 vfloat4 rsqrt_fast (const vfloat4 &a); ///< Fast, approximate 1/sqrt
2116 vfloat4 min (const vfloat4& a, const vfloat4& b); ///< Per-element min
2117 vfloat4 max (const vfloat4& a, const vfloat4& b); ///< Per-element max
2118 template <typename T> OIIO_FORCEINLINE T exp (const T& v); // template for all SIMD variants
2119 template <typename T> OIIO_FORCEINLINE T log (const T& v);
2120 
2121 /// andnot(a,b) returns ((~a) & b)
2122 vfloat4 andnot (const vfloat4& a, const vfloat4& b);
2123 
2124 // Fused multiply and add (or subtract):
2125 vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
2126 vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
2127 vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
2128 vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c
2129 
2130 /// Transpose the rows and columns of the 4x4 matrix [a b c d].
2131 /// In the end, a will have the original (a[0], b[0], c[0], d[0]),
2132 /// b will have the original (a[1], b[1], c[1], d[1]), and so on.
2133 void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
2134 void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
2135  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2136 
2137 /// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
2138 vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
2139  const vfloat4& c, const vfloat4& d);
2140 
2141 
2142 
2143 /// Floating point 3-vector, aligned to be internally identical to a vfloat4.
2144 /// The way it differs from vfloat4 is that all of the load functions only
2145 /// load three values, and all the stores only store 3 values. The vast
2146 /// majority of ops just fall back to the vfloat4 version, and so will
2147 /// operate on the 4th component, but we won't care about that result.
2148 class vfloat3 : public vfloat4 {
2149 public:
2150  static const char* type_name() { return "vfloat3"; }
2151  enum { elements = 3 }; ///< Number of scalar elements
2152  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
2153 
2154  /// Default constructor (contents undefined)
2155  vfloat3 () { }
2156 
2157  /// Construct from a single value (store it in all slots)
2158  vfloat3 (float a) { load(a); }
2159 
2160  /// Construct from 3 values (forwards to vfloat4::load(a,b,c), whose
2160  /// omitted 4th argument defaults to 0.0f)
2161  vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }
2162 
2163  /// Construct from a pointer to 3 values
2164  vfloat3 (const float *f) { load (f); }
2165 
2166  /// Copy construct from another vfloat3
2167  vfloat3 (const vfloat3 &other);
2168 
2169  /// Construct from a vfloat4. Note: it will not zero out the internal
2170  /// 4th component, but rather accept on faith that the vfloat4 you are
2171  /// giving it is a valid vfloat3. Be careful!
2172  explicit vfloat3 (const vfloat4 &other);
2173 
2174 #if OIIO_SIMD
2175  /// Construct from the underlying SIMD type. Note: it will not zero out
2176  /// the internal 4th component, but rather accept on faith that the
2177  /// vfloat4 you are giving it is a valid vfloat3. Be careful!
2178  explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
2179 #endif
2180 
2181  /// Construct from a Imath::V3f
2182  vfloat3 (const Imath::V3f &v) : vfloat4(v) { }
2183 
2184  /// Cast to a Imath::V3f (reinterprets this object's storage; no copy is made)
2185  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
2186 
2187  /// Construct from a pointer to 3 unsigned short values
2188  explicit vfloat3 (const unsigned short *vals) { load(vals); }
2189 
2190  /// Construct from a pointer to 3 short values
2191  explicit vfloat3 (const short *vals) { load(vals); }
2192 
2193  /// Construct from a pointer to 3 unsigned char values
2194  explicit vfloat3 (const unsigned char *vals) { load(vals); }
2195 
2196  /// Construct from a pointer to 3 char values
2197  explicit vfloat3 (const char *vals) { load(vals); }
2198 
2199 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2200  /// Construct from a pointer to 3 half (16 bit float) values
2201  explicit vfloat3 (const half *vals) { load(vals); }
2202 #endif
2203 
2204  /// Assign a single value to all components
2205  const vfloat3 & operator= (float a) { load(a); return *this; }
2206 
2207  /// Return a vfloat3 with all components set to 0.0
2208  static const vfloat3 Zero ();
2209 
2210  /// Return a vfloat3 with all components set to 1.0
2211  static const vfloat3 One ();
2212 
2213  /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
2214  /// Optional argument can give a non-zero starting point and non-1 step.
2215  static const vfloat3 Iota (float start=0.0f, float step=1.0f);
2216 
2217  /// Helper: load a single value into all components
2218  void load (float val);
2219 
2220  /// Load from an array of 3 values
2221  void load (const float *values);
2222 
2223  /// Load from a partial array of n values (presumably n <= 3 -- confirm
2223  /// against the definition)
2224  void load (const float *values, int n);
2225 
2226  /// Load from an array of 3 unsigned short values, convert to float
2227  void load (const unsigned short *values);
2228 
2229  /// Load from an array of 3 short values, convert to float
2230  void load (const short *values);
2231 
2232  /// Load from an array of 3 unsigned char values, convert to float
2233  void load (const unsigned char *values);
2234 
2235  /// Load from an array of 3 char values, convert to float
2236  void load (const char *values);
2237 
2238 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2239  /// Load from an array of 3 half values, convert to float
2240  void load (const half *values);
2241 #endif /* _HALF_H_ or _IMATH_H_ */
2242 
2243  void store (float *values) const; ///< Store into a float array (per the class note, stores write only 3 values)
2244 
2245  void store (float *values, int n) const; ///< Store the first n values into memory
2246 
2247 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2248  void store (half *values) const; ///< Store into an array of half values
2249 #endif
2250 
2251  /// Store into an Imath::V3f reference.
2252  void store (Imath::V3f &vec) const;
2253 
2254  // Math operators -- define in terms of vfloat3.
2255  friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
2256  const vfloat3 & operator+= (const vfloat3& a);
2257  vfloat3 operator- () const;
2258  friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
2259  const vfloat3 & operator-= (const vfloat3& a);
2260  friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
2261  friend vfloat3 operator* (const vfloat3& a, float b);
2262  friend vfloat3 operator* (float a, const vfloat3& b);
2263  const vfloat3 & operator*= (const vfloat3& a);
2264  const vfloat3 & operator*= (float a);
2265  friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
2266  const vfloat3 & operator/= (const vfloat3& a);
2267  const vfloat3 & operator/= (float a);
2268 
2269  /// Square of the length of the vector
2270  float length2() const;
2271  /// Length of the vector
2272  float length() const;
2273 
2274  /// Return a normalized version of the vector.
2275  vfloat3 normalized () const;
2276  /// Return a fast, approximate normalized version of the vector.
2277  vfloat3 normalized_fast () const;
2278  /// Normalize in place (assigns the result of normalized() to *this).
2279  void normalize() { *this = normalized(); }
2280 
2281  /// Stream output
2282  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
2283 };
2284 
2285 
2286 
2287 // Per-element math on float3
2288 vfloat3 abs (const vfloat3& a);
2289 vfloat3 sign (const vfloat3& a);
2290 vfloat3 ceil (const vfloat3& a);
2291 vfloat3 floor (const vfloat3& a);
2292 vfloat3 round (const vfloat3& a);
2293 
2294 
2295 
2296 /// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
2297 /// not in registers) isomorphic to Imath::M44f.
2298 class matrix44 {
2299 public:
2300  // Uninitialized
2302 #ifndef OIIO_SIMD_SSE
2303  : m_mat(Imath::UNINITIALIZED)
2304 #endif
2305  { }
2306 
2307  /// Construct from a reference to an Imath::M44f
2308  OIIO_FORCEINLINE explicit matrix44 (const Imath::M44f &M) {
2309 #if OIIO_SIMD_SSE
2310  m_row[0].load (M[0]);
2311  m_row[1].load (M[1]);
2312  m_row[2].load (M[2]);
2313  m_row[3].load (M[3]);
2314 #else
2315  m_mat = M;
2316 #endif
2317  }
2318 
2319  /// Construct from a float array
2320  OIIO_FORCEINLINE explicit matrix44 (const float *f) {
2321 #if OIIO_SIMD_SSE
2322  m_row[0].load (f+0);
2323  m_row[1].load (f+4);
2324  m_row[2].load (f+8);
2325  m_row[3].load (f+12);
2326 #else
2327  m_mat = *(const Imath::M44f*)f;
2328 #endif
2329  }
2330 
2331  /// Construct from 4 vfloat4 rows
2332  OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
2333  const vfloat4& c, const vfloat4& d) {
2334 #if OIIO_SIMD_SSE
2335  m_row[0] = a; m_row[1] = b; m_row[2] = c; m_row[3] = d;
2336 #else
2337  a.store (m_mat[0]);
2338  b.store (m_mat[1]);
2339  c.store (m_mat[2]);
2340  d.store (m_mat[3]);
2341 #endif
2342  }
2343  /// Construct from 4 float[4] rows
2344  OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
2345  const float *c, const float *d) {
2346 #if OIIO_SIMD_SSE
2347  m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2348 #else
2349  memcpy (m_mat[0], a, 4*sizeof(float));
2350  memcpy (m_mat[1], b, 4*sizeof(float));
2351  memcpy (m_mat[2], c, 4*sizeof(float));
2352  memcpy (m_mat[3], d, 4*sizeof(float));
2353 #endif
2354  }
2355 
2356  /// Construct from 16 floats
2357  OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
2358  float f10, float f11, float f12, float f13,
2359  float f20, float f21, float f22, float f23,
2360  float f30, float f31, float f32, float f33)
2361  {
2362 #if OIIO_SIMD_SSE
2363  m_row[0].load (f00, f01, f02, f03);
2364  m_row[1].load (f10, f11, f12, f13);
2365  m_row[2].load (f20, f21, f22, f23);
2366  m_row[3].load (f30, f31, f32, f33);
2367 #else
2368  m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2369  m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2370  m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2371  m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2372 #endif
2373  }
2374 
2375  /// Present as an Imath::M44f
2376  const Imath::M44f& M44f() const;
2377 
2378  /// Return one row
2379  vfloat4 operator[] (int i) const;
2380 
2381  /// Return the transposed matrix
2382  matrix44 transposed () const;
2383 
2384  /// Transform 3-point V by 4x4 matrix M.
2385  vfloat3 transformp (const vfloat3 &V) const;
2386 
2387  /// Transform 3-vector V by 4x4 matrix M.
2388  vfloat3 transformv (const vfloat3 &V) const;
2389 
2390  /// Transform 3-vector V by the transpose of 4x4 matrix M.
2391  vfloat3 transformvT (const vfloat3 &V) const;
2392 
2393  friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
2394  friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);
2395 
2396  bool operator== (const matrix44& m) const;
2397 
2398  bool operator== (const Imath::M44f& m) const ;
2399  friend bool operator== (const Imath::M44f& a, const matrix44 &b);
2400 
2401  bool operator!= (const matrix44& m) const;
2402 
2403  bool operator!= (const Imath::M44f& m) const;
2404  friend bool operator!= (const Imath::M44f& a, const matrix44 &b);
2405 
2406  /// Return the inverse of the matrix.
2407  matrix44 inverse() const;
2408 
2409  /// Stream output
2410  friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);
2411 
2412 private:
2413 #if OIIO_SIMD_SSE
2414  vfloat4 m_row[4];
2415 #else
2416  Imath::M44f m_mat;
2417 #endif
2418 };
2419 
2420 /// Transform 3-point V by 4x4 matrix M.
2421 vfloat3 transformp (const matrix44 &M, const vfloat3 &V);
2422 vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V);
2423 
2424 /// Transform 3-vector V by 4x4 matrix M.
2425 vfloat3 transformv (const matrix44 &M, const vfloat3 &V);
2426 vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V);
2427 
2428 /// Transform 3-vector V by the transpose of 4x4 matrix M.
2429 vfloat3 transformvT (const matrix44 &M, const vfloat3 &V);
2430 vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V);
2431 
2432 
2433 
2434 
2435 /// Floating point 8-vector, accelerated by SIMD instructions when
2436 /// available.
2437 class vfloat8 {
2438 public:
2439  static const char* type_name() { return "vfloat8"; }
2440  typedef float value_t; ///< Underlying equivalent scalar value type
2441  enum { elements = 8 }; ///< Number of scalar elements
2442  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
2443  enum { bits = elements*32 }; ///< Total number of bits
2444  typedef simd_raw_t<float,8>::type simd_t; ///< the native SIMD type used
2445  typedef vfloat8 vfloat_t; ///< SIMD float type
2446  typedef vint8 vint_t; ///< SIMD int type
2447  typedef vbool8 vbool_t; ///< SIMD bool type
2448  OIIO_DEPRECATED("use vint_t (1.8)")
2449  typedef vint8 int_t; // old name (deprecated 1.8)
2450  OIIO_DEPRECATED("use vbool_t (1.8)")
2451  typedef vbool8 bool_t; // old name (deprecated 1.8)
2452 
2453  /// Default constructor (contents undefined)
2454  vfloat8 () { }
2455 
2456  /// Construct from a single value (store it in all slots)
2457  vfloat8 (float a) { load(a); }
2458 
2459  /// Construct from 8 values
2460  vfloat8 (float a, float b, float c, float d,
2461  float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); }
2462 
2463  /// Construct from a pointer to 8 values
2464  vfloat8 (const float *f) { load (f); }
2465 
2466  /// Copy construct from another vfloat8
2467  vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }
2468 
2469  /// Construct from an int vector (promoting all components to float)
2470  explicit vfloat8 (const vint8& ival);
2471 
2472  /// Construct from two vfloat4's
2473  vfloat8 (const vfloat4 &lo, const vfloat4 &hi);
2474 
2475  /// Construct from the underlying SIMD type
2476  vfloat8 (const simd_t& m) : m_simd(m) { }
2477 
2478  /// Return the raw SIMD type
2479  operator simd_t () const { return m_simd; }
2480  simd_t simd () const { return m_simd; }
2481  simd_t& simd () { return m_simd; }
2482 
2483  /// Return a pointer to the underlying scalar type
2484  const value_t* data () const { return (const value_t*)this; }
2485  value_t* data () { return (value_t*)this; }
2486 
2487  /// Construct from a pointer to unsigned short values
2488  explicit vfloat8 (const unsigned short *vals) { load(vals); }
2489 
2490  /// Construct from a pointer to short values
2491  explicit vfloat8 (const short *vals) { load(vals); }
2492 
2493  /// Construct from a pointer to unsigned char values
2494  explicit vfloat8 (const unsigned char *vals) { load(vals); }
2495 
2496  /// Construct from a pointer to char values
2497  explicit vfloat8 (const char *vals) { load(vals); }
2498 
2499 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2500  /// Construct from a pointer to half (16 bit float) values
2501  explicit vfloat8 (const half *vals) { load(vals); }
2502 #endif
2503 
2504  /// Assign a single value to all components
2505  const vfloat8& operator= (float a) { load(a); return *this; }
2506 
2507  /// Assign a vfloat8
2508  const vfloat8& operator= (vfloat8 other) {
2509  m_simd = other.m_simd;
2510  return *this;
2511  }
2512 
2513  /// Return a vfloat8 with all components set to 0.0
2514  static const vfloat8 Zero ();
2515 
2516  /// Return a vfloat8 with all components set to 1.0
2517  static const vfloat8 One ();
2518 
2519  /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...)
2520  /// Optional argument can give a non-zero starting point and non-1 step.
2521  static const vfloat8 Iota (float start=0.0f, float step=1.0f);
2522 
2523  /// Set all components to 0.0
2524  void clear ();
2525 
2526  /// Component access (get)
2527  float operator[] (int i) const;
2528  /// Component access (set)
2529  float& operator[] (int i);
2530 
2531  /// Component access (set).
2532  void setcomp (int i, float value);
2533 
2534  value_t x () const;
2535  value_t y () const;
2536  value_t z () const;
2537  value_t w () const;
2538  void set_x (value_t val);
2539  void set_y (value_t val);
2540  void set_z (value_t val);
2541  void set_w (value_t val);
2542 
2543  /// Extract the lower half as a vfloat4
2544  vfloat4 lo () const;
2545 
2546  /// Extract the upper half as a vfloat4
2547  vfloat4 hi () const;
2548 
2549  /// Helper: load a single value into all components
2550  void load (float val);
2551 
2552  /// Helper: load 8 values
2553  void load (float a, float b, float c, float d,
2554  float e, float f, float g, float h);
2555 
2556  /// Load from an array of values
2557  void load (const float *values);
2558 
2559  /// Load from a partial array of <=8 values. Unassigned values are
2560  /// undefined.
2561  void load (const float *values, int n);
2562 
2563  /// Load from an array of 8 unsigned short values, convert to float
2564  void load (const unsigned short *values);
2565 
2566  /// Load from an array of 8 short values, convert to float
2567  void load (const short *values);
2568 
2569  /// Load from an array of 8 unsigned char values, convert to float
2570  void load (const unsigned char *values);
2571 
2572  /// Load from an array of 8 char values, convert to float
2573  void load (const char *values);
2574 
2575 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2576  /// Load from an array of 8 half values, convert to float
2577  void load (const half *values);
2578 #endif /* _HALF_H_ or _IMATH_H_ */
2579 
2580  void store (float *values) const;
2581 
2582  /// Store the first n values into memory
2583  void store (float *values, int n) const;
2584 
2585 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2586  void store (half *values) const;
2587 #endif
2588 
2589  /// Masked load -- read from values[] where mask is 1, load zero where
2590  /// mask is 0.
2591  void load_mask (int mask, const value_t *values);
2592  void load_mask (const vbool_t& mask, const value_t *values);
2593 
2594  /// Masked store -- write to values[] where mask is enabled, don't
2595  /// touch values[] where it's not.
2596  void store_mask (int mask, value_t *values) const;
2597  void store_mask (const vbool_t& mask, value_t *values) const;
2598 
2599  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2600  template<int scale=4>
2601  void gather (const value_t *baseptr, const vint_t& vindex);
2602  template<int scale=4>
2603  /// Gather elements defined by the mask, leave others unchanged.
2604  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
2605  template<int scale=4>
2606  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
2607 
2608  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2609  template<int scale=4>
2610  void scatter (value_t *baseptr, const vint_t& vindex) const;
2611  /// Scatter elements defined by the mask
2612  template<int scale=4>
2613  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2614  template<int scale=4>
2615  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
2616 
2617  // Arithmetic operators (component-by-component)
2618  friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
2619  friend vfloat8 operator- (const vfloat8& a);
2620  friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
2621  friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
2622  friend vfloat8 operator* (const vfloat8& a, float b);
2623  friend vfloat8 operator* (float a, const vfloat8& b);
2624  friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
2625  friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
2626  friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
2627  friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
2628  friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
2629  friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);
2630 
2631  // Comparison operations
2632  friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
2633  friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
2634  friend vbool8 operator< (const vfloat8& a, const vfloat8& b);
2635  friend vbool8 operator> (const vfloat8& a, const vfloat8& b);
2636  friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
2637  friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);
2638 
2639  // Some oddball items that are handy
2640 
2641  /// Stream output
2642  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);
2643 
2644 protected:
2645  // The actual data representation
2646  union {
2650  };
2651 };
2652 
2653 
2654 /// Helper: shuffle/swizzle with constant (templated) indices.
2655 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
2656 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
2657 OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
2658 
2659 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2660 template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);
2661 
2662 /// Helper: as rapid as possible extraction of one component, when the
2663 /// index is fixed.
2664 template<int i> OIIO_FORCEINLINE float extract (const vfloat8& a);
2665 
2666 /// Helper: substitute val for a[i]
2667 template<int i> OIIO_FORCEINLINE vfloat8 insert (const vfloat8& a, float val);
2668 
2669 /// The sum of all components, returned in all components.
2670 vfloat8 vreduce_add (const vfloat8& v);
2671 
2672 /// The sum of all components, returned as a scalar.
2673 float reduce_add (const vfloat8& v);
2674 
2675 /// Return the float dot (inner) product of a and b in every component.
2676 vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);
2677 
2678 /// Return the float dot (inner) product of a and b.
2679 float dot (const vfloat8 &a, const vfloat8 &b);
2680 
2681 /// Return the float 3-component dot (inner) product of a and b in
2682 /// all components.
2683 vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);
2684 
2685 /// Return the float 3-component dot (inner) product of a and b.
2686 float dot3 (const vfloat8 &a, const vfloat8 &b);
2687 
2688 /// Use a bool mask to select between components of a (if mask[i] is false)
2689 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2690 vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);
2691 
2692 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2693 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2694 /// blend(0,a,mask).
2695 vfloat8 blend0 (const vfloat8& a, const vbool8& mask);
2696 
2697 /// Use a bool mask to select between components of a (if mask[i] is false)
2698 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2699 /// blend(0,a,!mask), or blend(a,0,mask).
2700 vfloat8 blend0not (const vfloat8& a, const vbool8& mask);
2701 
2702 /// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
2703 /// that is 0, return 0 rather than Inf.
2704 vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);
2705 
2706 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2707 /// synonym for blend with arguments rearranged, but this is more clear
2708 /// because the arguments are symmetric to scalar (cond ? a : b).
2709 vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);
2710 
2711 // Per-element math
2712 vfloat8 abs (const vfloat8& a); ///< absolute value (float)
2713 vfloat8 sign (const vfloat8& a); ///< 1.0 when value >= 0, -1 when negative
2714 vfloat8 ceil (const vfloat8& a);
2715 vfloat8 floor (const vfloat8& a);
2716 vint8 ifloor (const vfloat8& a); ///< (int)floor
2717 inline vint8 floori (const vfloat8& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2718 
2719 /// Per-element round to nearest integer.
2720 /// CAVEAT: the rounding when mid-way between integers may differ depending
2721 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
2722 /// integer) but std::round() says to round away from 0 regardless of
2723 /// current rounding mode (but that is multiple instructions on x64).
2724 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2725 /// match std::round().
2726 vfloat8 round (const vfloat8& a);
2727 
2728 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2729 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2730 /// C++ std::rint() which says to use the current rounding mode.
2731 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2732 /// match std::rint().
2733 vint8 rint (const vfloat8& a);
2734 
2735 vfloat8 rcp_fast (const vfloat8 &a); ///< Fast, approximate 1/a
2736 vfloat8 sqrt (const vfloat8 &a);
2737 vfloat8 rsqrt (const vfloat8 &a); ///< Fully accurate 1/sqrt
2738 vfloat8 rsqrt_fast (const vfloat8 &a); ///< Fast, approximate 1/sqrt
2739 vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min
2740 vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max
2741 // vfloat8 exp (const vfloat8& v); // See template with vfloat4
2742 // vfloat8 log (const vfloat8& v); // See template with vfloat4
2743 
2744 /// andnot(a,b) returns ((~a) & b)
2745 vfloat8 andnot (const vfloat8& a, const vfloat8& b);
2746 
2747 // Fused multiply and add (or subtract):
2748 vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b + c
2749 vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b - c
2750 vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c
2751 vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c
2752 
2753 
2754 
2755 /// Floating point 16-vector, accelerated by SIMD instructions when
2756 /// available.
2757 class vfloat16 {
2758 public:
2759  static const char* type_name() { return "vfloat16"; }
2760  typedef float value_t; ///< Underlying equivalent scalar value type
2761  enum { elements = 16 }; ///< Number of scalar elements
2762  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
2763  enum { bits = elements*32 }; ///< Total number of bits
2764  typedef simd_raw_t<float,16>::type simd_t; ///< the native SIMD type used
2765  typedef vfloat16 vfloat_t; ///< SIMD float type
2766  typedef vint16 vint_t; ///< SIMD int type
2767  typedef vbool16 vbool_t; ///< SIMD bool type
2768  OIIO_DEPRECATED("use vint_t (1.8)")
2769  typedef vint16 int_t; // old name (deprecated 1.8)
2770  OIIO_DEPRECATED("use vbool_t (1.8)")
2771  typedef vbool16 bool_t; // old name (deprecated 1.8)
2772 
2773  /// Default constructor (contents undefined)
2774  vfloat16 () { }
2775 
2776  /// Construct from a single value (store it in all slots)
2777  vfloat16 (float a) { load(a); }
2778 
2779  /// Construct from 16 values
2780  vfloat16 (float v0, float v1, float v2, float v3,
2781  float v4, float v5, float v6, float v7,
2782  float v8, float v9, float v10, float v11,
2783  float v12, float v13, float v14, float v15);
2784 
2785  /// Construct from a pointer to 16 values
2786  vfloat16 (const float *f) { load (f); }
2787 
2788  /// Copy construct from another vfloat16
2789  vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }
2790 
2791  /// Construct from an int vector (promoting all components to float)
2792  explicit vfloat16 (const vint16& ival);
2793 
2794  /// Construct from two vfloat8's
2795  vfloat16 (const vfloat8 &lo, const vfloat8 &hi);
2796 
2797  /// Construct from four vfloat4's
2798  vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d);
2799 
2800  /// Construct from the underlying SIMD type
2801  vfloat16 (const simd_t& m) : m_simd(m) { }
2802 
2803  /// Return the raw SIMD type
2804  operator simd_t () const { return m_simd; }
2805  simd_t simd () const { return m_simd; }
2806  simd_t& simd () { return m_simd; }
2807 
2808  /// Return a pointer to the underlying scalar type
2809  const value_t* data () const { return (const value_t*)this; }
2810  value_t* data () { return (value_t*)this; }
2811 
2812  /// Construct from a pointer to unsigned short values
2813  explicit vfloat16 (const unsigned short *vals) { load(vals); }
2814 
2815  /// Construct from a pointer to short values
2816  explicit vfloat16 (const short *vals) { load(vals); }
2817 
2818  /// Construct from a pointer to unsigned char values
2819  explicit vfloat16 (const unsigned char *vals) { load(vals); }
2820 
2821  /// Construct from a pointer to char values
2822  explicit vfloat16 (const char *vals) { load(vals); }
2823 
2824 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2825  /// Construct from a pointer to half (16 bit float) values
2826  explicit vfloat16 (const half *vals) { load(vals); }
2827 #endif
2828 
2829  /// Assign a single value to all components
2830  const vfloat16& operator= (float a) { load(a); return *this; }
2831 
2832  /// Assign a vfloat16
2833  const vfloat16& operator= (vfloat16 other) {
2834  m_simd = other.m_simd;
2835  return *this;
2836  }
2837 
2838  /// Return a vfloat16 with all components set to 0.0
2839  static const vfloat16 Zero ();
2840 
2841  /// Return a vfloat16 with all components set to 1.0
2842  static const vfloat16 One ();
2843 
2844  /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...)
2845  /// Optional argument can give a non-zero starting point and non-1 step.
2846  static const vfloat16 Iota (float start=0.0f, float step=1.0f);
2847 
2848  /// Set all components to 0.0
2849  void clear ();
2850 
2851  /// Component access (get)
2852  float operator[] (int i) const;
2853  /// Component access (set)
2854  float& operator[] (int i);
2855 
2856  /// Component access (set).
2857  void setcomp (int i, float value);
2858 
2859  value_t x () const;
2860  value_t y () const;
2861  value_t z () const;
2862  value_t w () const;
2863  void set_x (value_t val);
2864  void set_y (value_t val);
2865  void set_z (value_t val);
2866  void set_w (value_t val);
2867 
2868  /// Extract the lower half as a vfloat8
2869  vfloat8 lo () const;
2870 
2871  /// Extract the upper half as a vfloat8
2872  vfloat8 hi () const;
2873 
2874  /// Helper: load a single value into all components
2875  void load (float val);
2876 
2877  /// Load separate values into each component.
2878  void load (float v0, float v1, float v2, float v3,
2879  float v4, float v5, float v6, float v7,
2880  float v8, float v9, float v10, float v11,
2881  float v12, float v13, float v14, float v15);
2882 
2883  /// Load from an array of values
2884  void load (const float *values);
2885 
2886  /// Load from a partial array of <=16 values. Unassigned values are
2887  /// undefined.
2888  void load (const float *values, int n);
2889 
2890  /// Load from an array of 16 unsigned short values, convert to float
2891  void load (const unsigned short *values);
2892 
2893  /// Load from an array of 16 short values, convert to float
2894  void load (const short *values);
2895 
2896  /// Load from an array of 16 unsigned char values, convert to float
2897  void load (const unsigned char *values);
2898 
2899  /// Load from an array of 16 char values, convert to float
2900  void load (const char *values);
2901 
2902 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2903  /// Load from an array of 16 half values, convert to float
2904  void load (const half *values);
2905 #endif /* _HALF_H_ or _IMATH_H_ */
2906 
2907  void store (float *values) const;
2908 
2909  /// Store the first n values into memory
2910  void store (float *values, int n) const;
2911 
2912 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2913  void store (half *values) const;
2914 #endif
2915 
2916  /// Masked load -- read from values[] where mask is 1, load zero where
2917  /// mask is 0.
2918  void load_mask (const vbool_t &mask, const value_t *values);
2919  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
2920 
2921  /// Masked store -- write to values[] where mask is enabled, don't
2922  /// touch values[] where it's not.
2923  void store_mask (const vbool_t &mask, value_t *values) const;
2924  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
2925 
2926  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2927  template<int scale=4>
2928  void gather (const value_t *baseptr, const vint_t& vindex);
2929  /// Gather elements defined by the mask, leave others unchanged.
2930  template<int scale=4>
2931  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
2932  template<int scale=4>
2933  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
2934  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
2935  }
2936 
2937  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2938  template<int scale=4>
2939  void scatter (value_t *baseptr, const vint_t& vindex) const;
2940  /// Scatter elements defined by the mask
2941  template<int scale=4>
2942  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2943  template<int scale=4>
2944  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
2945  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
2946  }
2947 
2948  // Arithmetic operators (component-by-component)
2949  friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b);
2950  friend vfloat16 operator- (const vfloat16& a);
2951  friend vfloat16 operator- (const vfloat16& a, const vfloat16& b);
2952  friend vfloat16 operator* (const vfloat16& a, const vfloat16& b);
2953  friend vfloat16 operator* (const vfloat16& a, float b);
2954  friend vfloat16 operator* (float a, const vfloat16& b);
2955  friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b);
2956  friend vfloat16 operator% (const vfloat16& a, const vfloat16& b);
2957  friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b);
2958  friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b);
2959  friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b);
2960  friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b);
2961 
2962  // Comparison operations
2963  friend vbool16 operator== (const vfloat16& a, const vfloat16& b);
2964  friend vbool16 operator!= (const vfloat16& a, const vfloat16& b);
2965  friend vbool16 operator< (const vfloat16& a, const vfloat16& b);
2966  friend vbool16 operator> (const vfloat16& a, const vfloat16& b);
2967  friend vbool16 operator>= (const vfloat16& a, const vfloat16& b);
2968  friend vbool16 operator<= (const vfloat16& a, const vfloat16& b);
2969 
2970  // Some oddball items that are handy
2971 
2972  /// Stream output
2973  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val);
2974 
2975 protected:
2976  // The actual data representation
2977  union {
2981  };
2982 };
2983 
2984 
2985 /// Shuffle groups of 4
2986 template<int i0, int i1, int i2, int i3>
2987 OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);
2988 
2989 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
2990 template<int i> OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);
2991 
2992 /// Shuffle within each group of 4
2993 template<int i0, int i1, int i2, int i3>
2994 OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a);
2995 
2996 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2997 template<int i> vfloat16 shuffle (const vfloat16& a);
2998 
2999 /// Helper: as rapid as possible extraction of one component, when the
3000 /// index is fixed.
3001 template<int i> OIIO_FORCEINLINE float extract (const vfloat16& a);
3002 
3003 /// Helper: substitute val for a[i]
3004 template<int i> OIIO_FORCEINLINE vfloat16 insert (const vfloat16& a, float val);
3005 
3006 /// The sum of all components, returned in all components.
3007 vfloat16 vreduce_add (const vfloat16& v);
3008 
3009 /// The sum of all components, returned as a scalar.
3010 float reduce_add (const vfloat16& v);
3011 
3012 /// Use a bool mask to select between components of a (if mask[i] is false)
3013 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
3014 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool4& mask);
3015 
3016 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
3017 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
3018 /// blend(0,a,mask).
3019 vfloat16 blend0 (const vfloat16& a, const vbool4& mask);
3020 
3021 /// Use a bool mask to select between components of a (if mask[i] is false)
3022 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
3023 /// blend(0,a,!mask), or blend(a,0,mask).
3024 vfloat16 blend0not (const vfloat16& a, const vbool4& mask);
3025 
3026 /// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor
3027 /// that is 0, return 0 rather than Inf.
3028 vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b);
3029 
3030 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
3031 /// synonym for blend with arguments rearranged, but this is more clear
3032 /// because the arguments are symmetric to scalar (cond ? a : b).
3033 vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);
3034 
3035 // Per-element math
3036 vfloat16 abs (const vfloat16& a); ///< absolute value (float)
3037 vfloat16 sign (const vfloat16& a); ///< 1.0 when value >= 0, -1 when negative
3038 vfloat16 ceil (const vfloat16& a);
3039 vfloat16 floor (const vfloat16& a);
3040 vint16 ifloor (const vfloat16& a); ///< (int)floor
3041 OIIO_DEPRECATED("use ifloor (1.8)")
3042 inline vint16 floori (const vfloat16& a) { return ifloor(a); } // DEPRECATED(1.8) alias
3043 
3044 /// Per-element round to nearest integer.
3045 /// CAVEAT: the rounding when mid-way between integers may differ depending
3046 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
3047 /// integer) but std::round() says to round away from 0 regardless of
3048 /// current rounding mode (but that is multiple instructions on x64).
3049 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3050 /// match std::round().
3051 vfloat16 round (const vfloat16& a);
3052 
3053 /// Per-element round to nearest integer (equivalent to vint(round(a))).
3054 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
3055 /// C++ std::rint() which says to use the current rounding mode.
3056 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3057 /// match std::rint().
3058 vint16 rint (const vfloat16& a);
3059 
3060 vfloat16 rcp_fast (const vfloat16 &a); ///< Fast, approximate 1/a
3061 vfloat16 sqrt (const vfloat16 &a);
3062 vfloat16 rsqrt (const vfloat16 &a); ///< Fully accurate 1/sqrt
3063 vfloat16 rsqrt_fast (const vfloat16 &a); ///< Fast, approximate 1/sqrt
3064 vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
3065 vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
3066 // vfloat16 exp (const vfloat16& v); // See template with vfloat4
3067 // vfloat16 log (const vfloat16& v); // See template with vfloat4
3068 
3069 /// andnot(a,b) returns ((~a) & b)
3070 vfloat16 andnot (const vfloat16& a, const vfloat16& b);
3071 
3072 // Fused multiply and add (or subtract):
3073 vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b + c
3074 vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b - c
3075 vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
3076 vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c
3077 
3078 
3079 
3080 // Odds and ends, other CPU hardware tricks
3081 
// Try to set the flush_zero_mode CPU flag on x86. Return true if we are
// able, otherwise false (because it's not available on that platform).
//   on: true requests _MM_FLUSH_ZERO_ON, false requests _MM_FLUSH_ZERO_OFF.
inline bool set_flush_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
    return true;
#else
    // Exactly one return is compiled per platform, so no unreachable
    // statement is left behind on x86.
    (void)on;  // nothing to set on this platform
    return false;
#endif
}
3091 
// Try to set the denorms_zero_mode CPU flag on x86. Return true if we are
// able, otherwise false (because it's not available on that platform).
//   on: true requests _MM_DENORMALS_ZERO_ON, false requests
//       _MM_DENORMALS_ZERO_OFF.
inline bool set_denorms_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
    return true;
#else
    // Exactly one return is compiled per platform, so no unreachable
    // statement is left behind on x86.
    (void)on;  // nothing to set on this platform
    return false;
#endif
}
3101 
// Get the flush_zero_mode CPU flag on x86. Returns true when the flag reads
// back as _MM_FLUSH_ZERO_ON; always returns false on platforms where the
// flag is not available.
inline bool get_flush_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
#else
    // Exactly one return is compiled per platform, so no unreachable
    // statement is left behind on x86.
    return false;
#endif
}
3109 
// Get the denorms_zero_mode CPU flag on x86. Returns true when the flag
// reads back as _MM_DENORMALS_ZERO_ON; always returns false on platforms
// where the flag is not available.
inline bool get_denorms_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
#else
    // Exactly one return is compiled per platform, so no unreachable
    // statement is left behind on x86.
    return false;
#endif
}
3117 
3118 
3119 
3120 
3121 
3122 
3123 //////////////////////////////////////////////////////////////////////////
3124 //////////////////////////////////////////////////////////////////////////
3125 //
3126 // Gory implementation details follow.
3127 //
3128 // ^^^ All declarations and documentation are above ^^^
3129 //
3130 // vvv Below is the implementation, often considerably cluttered with
3131 // #if's for each architecture, and unapologetic use of intrinsics and
3132 // every manner of dirty trick we can think of to make things fast.
3133 // Some of this isn't pretty. We won't recapitulate comments or
3134 // documentation of what the functions are supposed to do, please
3135 // consult the declarations above for that.
3136 //
3137 // Here be dragons.
3138 //
3139 //////////////////////////////////////////////////////////////////////////
3140 //////////////////////////////////////////////////////////////////////////
3141 
3142 
3143 
3144 //////////////////////////////////////////////////////////////////////
3145 // vbool4 implementation
3146 
3147 
3149  OIIO_DASSERT(i >= 0 && i < elements);
3150 #if OIIO_SIMD_SSE
3151  return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3152 #else
3153  return m_val[i];
3154 #endif
3155 }
3156 
3158  OIIO_DASSERT(i >= 0 && i < elements);
3159  return m_val[i];
3160 }
3161 
3162 
3164  OIIO_DASSERT(i >= 0 && i < elements);
3165  m_val[i] = value ? -1 : 0;
3166 }
3167 
3168 
// Stream the lanes of a vbool4 in index order, separated by single spaces,
// with no trailing separator or newline. Each lane is written as whatever
// a[i] yields (presumably the integer lane value -1 or 0 -- confirm against
// vbool4::operator[]). Returns the stream so insertions can be chained.
// Note that `cout` here is the function parameter, i.e. whichever stream
// the caller passed in.
3169 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) {
3170  cout << a[0];
3171  for (int i = 1; i < a.elements; ++i)
3172  cout << ' ' << a[i];
3173  return cout;
3174 }
3175 
3176 
3178 #if OIIO_SIMD_SSE
3179  m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a)));
3180 #elif OIIO_SIMD_NEON
3181  m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3182 #else
3183  int val = -int(a);
3184  SIMD_CONSTRUCT (val);
3185 #endif
3186 }
3187 
3188 
// Load four bools into the four lanes: lane 0 = a, lane 1 = b, lane 2 = c,
// lane 3 = d. Each bool is converted with -int(x), so true is stored as -1
// and false as 0.
3189 OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
3190 #if OIIO_SIMD_SSE
3191  // N.B. -- _mm_set_epi32 takes its arguments highest element first, so we
3192  // pass d,c,b,a to end up with a,b,c,d in ascending lane (memory) order.
3193  m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
3194 // #elif OIIO_SIMD_NEON
3195 // FIXME: no NEON-specific path; with the #elif commented out, every non-SSE build takes the #else branch
3196 #else
3197  m_val[0] = -int(a);
3198  m_val[1] = -int(b);
3199  m_val[2] = -int(c);
3200  m_val[3] = -int(d);
3201 #endif
3202 }
3203 
3205  load (a[0], a[1], a[2], a[3]);
3206 }
3207 
3209  m_simd = other.m_simd;
3210  return *this;
3211 }
3212 
3213 
3215 #if OIIO_SIMD_SSE
3216  return _mm_movemask_ps(m_simd);
3217 #else
3218  int r = 0;
3219  for (int i = 0; i < elements; ++i)
3220  if (m_val[i])
3221  r |= 1<<i;
3222  return r;
3223 #endif
3224 }
3225 
3226 
3228 vbool4::from_bitmask (int bitmask) {
3229  // I think this is a fast conversion from int bitmask to vbool4
3230  return (vint4::Giota() & vint4(bitmask)) != vint4::Zero();
3231 }
3232 
3233 
3235 #if OIIO_SIMD_SSE
3236  m_simd = _mm_setzero_ps();
3237 #else
3238  *this = false;
3239 #endif
3240 }
3241 
3242 
3244 #if OIIO_SIMD_SSE
3245  return _mm_setzero_ps();
3246 #else
3247  return false;
3248 #endif
3249 }
3250 
3252  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3253 #if OIIO_SIMD_SSE
3254 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3255  __m128i anyval = _mm_undefined_si128();
3256 # else
3257  __m128i anyval = _mm_setzero_si128();
3258 # endif
3259  return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3260 #else
3261  return true;
3262 #endif
3263 }
3264 
3266  SIMD_DO (values[i] = m_val[i] ? true : false);
3267 }
3268 
// Partial store: write the first n lanes to values[0..n-1] as bools. A
// nonzero lane value becomes true, zero becomes false. Entries at index n
// and beyond are not written. n is expected to be in [0, elements]; the
// only check is the OIIO_DASSERT below (presumably a debug-only assertion
// -- confirm in dassert.h).
3269 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const {
3270  OIIO_DASSERT (n >= 0 && n <= elements);
3271  for (int i = 0; i < n; ++i)
3272  values[i] = m_val[i] ? true : false;
3273 }
3274 
3275 
3276 
3278 #if OIIO_SIMD_SSE
3279  return _mm_xor_ps (a.simd(), vbool4::True());
3280 #elif OIIO_SIMD_NEON
3281  return vmvnq_u32(a.simd());
3282 #else
3283  SIMD_RETURN (vbool4, a[i] ^ (-1));
3284 #endif
3285 }
3286 
3288 #if OIIO_SIMD_SSE
3289  return _mm_and_ps (a.simd(), b.simd());
3290 #elif OIIO_SIMD_NEON
3291  return vandq_u32(a.simd(), b.simd());
3292 #else
3293  SIMD_RETURN (vbool4, a[i] & b[i]);
3294 #endif
3295 }
3296 
3298 #if OIIO_SIMD_SSE
3299  return _mm_or_ps (a.simd(), b.simd());
3300 #elif OIIO_SIMD_NEON
3301  return vorrq_u32(a.simd(), b.simd());
3302 #else
3303  SIMD_RETURN (vbool4, a[i] | b[i]);
3304 #endif
3305 }
3306 
3308 #if OIIO_SIMD_SSE
3309  return _mm_xor_ps (a.simd(), b.simd());
3310 #elif OIIO_SIMD_NEON
3311  return veorq_u32(a.simd(), b.simd());
3312 #else
3313  SIMD_RETURN (vbool4, a[i] ^ b[i]);
3314 #endif
3315 }
3316 
3317 
3319  return a = a & b;
3320 }
3321 
3323  return a = a | b;
3324 }
3325 
3327  return a = a ^ b;
3328 }
3329 
3331 #if OIIO_SIMD_SSE
3332  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3333  return _mm_xor_ps (a.simd(), vbool4::True());
3334 #elif OIIO_SIMD_NEON
3335  return vmvnq_u32(a.m_simd);
3336 #else
3337  SIMD_RETURN (vbool4, ~a[i]);
3338 #endif
3339 }
3340 
3342 #if OIIO_SIMD_SSE
3343  return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3344 #elif OIIO_SIMD_NEON
3345  return vceqq_u32 (a.m_simd, b.m_simd);
3346 #else
3347  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
3348 #endif
3349 }
3350 
3352 #if OIIO_SIMD_SSE
3353  return _mm_xor_ps (a, b);
3354 #elif OIIO_SIMD_NEON
3355  return !(a == b);
3356 #else
3357  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
3358 #endif
3359 }
3360 
3361 
3362 
3363 
3364 #if OIIO_SIMD_SSE
3365 // Shuffling. Use like this: x = shuffle<3,2,1,0>(b)
// Generic permutation of the four 32-bit lanes of an __m128i with
// compile-time indices: result lane 0 takes input lane i0, lane 1 takes
// i1, and so on. _MM_SHUFFLE lists its selectors highest lane first, hence
// the reversed (i3, i2, i1, i0) argument order below.
3366 template<int i0, int i1, int i2, int i3>
3367 OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) {
3368  return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
3369 }
3370 #endif
3371 
3372 #if OIIO_SIMD_SSE >= 3
3373 // SSE3 has intrinsics for a few special cases
// These full specializations of the __m128i shuffle_sse template replace
// the generic _mm_shuffle_epi32 form for three index patterns. The integer
// vector is reinterpreted (cast only) as float or double to reach the
// SSE3 duplicate instructions, then cast back.
// <0,0,2,2>: duplicate the even-indexed 32-bit lanes.
3374 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) {
3375  return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a)));
3376 }
// <1,1,3,3>: duplicate the odd-indexed 32-bit lanes.
3377 template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) {
3378  return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a)));
3379 }
// <0,1,0,1>: repeat the low 64 bits (lanes 0 and 1) in the high half.
3380 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) {
3381  return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(a)));
3382 }
3383 #endif
3384 
3385 #if OIIO_SIMD_SSE
// Generic permutation of the four 32-bit lanes of an __m128 (float vector)
// with compile-time indices: result lane k takes input lane i_k. The
// vector is reinterpreted as integers so that _mm_shuffle_epi32 can be
// used, then cast back; the casts do not change any bits. _MM_SHUFFLE
// lists its selectors highest lane first, hence (i3, i2, i1, i0).
3386 template<int i0, int i1, int i2, int i3>
3387 OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) {
3388  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
3389 }
3390 #endif
3391 
3392 #if OIIO_SIMD_SSE >= 3
3393 // SSE3 has intrinsics for a few special cases
// These full specializations of the __m128 shuffle_sse template replace
// the generic form for three index patterns.
// <0,0,2,2>: duplicate the even-indexed lanes.
3394 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 0, 2, 2> (__m128 a) {
3395  return _mm_moveldup_ps(a);
3396 }
// <1,1,3,3>: duplicate the odd-indexed lanes.
3397 template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) {
3398  return _mm_movehdup_ps(a);
3399 }
// <0,1,0,1>: repeat the low 64 bits (lanes 0 and 1) in the high half; the
// vector is viewed as two doubles only to reach _mm_movedup_pd.
3400 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) {
3401  return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3402 }
3403 #endif
3404 
3405 
3406 /// Helper: shuffle/swizzle with constant (templated) indices.
3407 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
3408 template<int i0, int i1, int i2, int i3>
3410 #if OIIO_SIMD_SSE
3411  return shuffle_sse<i0,i1,i2,i3> (a.simd());
3412 #else
3413  return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3414 #endif
3415 }
3416 
3417 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
/// i.e. a broadcast: every lane of the result is a copy of lane i of `a`.
/// The index i is a compile-time template argument.
3418 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3419  return shuffle<i,i,i,i>(a);
3420 }
3421 
3422 
3423 /// Helper: as rapid as possible extraction of one component, when the
3424 /// index is fixed.
3425 template<int i>
3427 #if OIIO_SIMD_SSE >= 4
3428  return _mm_extract_epi32(_mm_castps_si128(a.simd()), i); // SSE4.1 only
3429 #else
3430  return a[i];
3431 #endif
3432 }
3433 
3434 /// Helper: substitute val for a[i]
3435 template<int i>
3437 #if OIIO_SIMD_SSE >= 4
3438  int ival = -int(val);
3439  return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3440 #else
3441  vbool4 tmp = a;
3442  tmp[i] = -int(val);
3443  return tmp;
3444 #endif
3445 }
3446 
3448 #if OIIO_SIMD_AVX
3449  return _mm_testc_ps (v, vbool4(true)) != 0;
3450 #elif OIIO_SIMD_SSE
3451  return _mm_movemask_ps(v.simd()) == 0xf;
3452 #else
3453  SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0));
3454 #endif
3455 }
3456 
3458 #if OIIO_SIMD_AVX
3459  return ! _mm_testz_ps (v, v);
3460 #elif OIIO_SIMD_SSE
3461  return _mm_movemask_ps(v) != 0;
3462 #else
3463  SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0));
3464 #endif
3465 }
3466 
// Convenience predicates over the lanes of a vbool4, expressed through the
// reduce_and / reduce_or reductions:
//   all(v)  -- true when reduce_and(v) is true
//   any(v)  -- true when reduce_or(v) is true
//   none(v) -- true when reduce_or(v) is false (the negation of any)
3467 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; }
3468 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; }
3469 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; }
3470 
3471 
3472 
3473 //////////////////////////////////////////////////////////////////////
3474 // vbool8 implementation
3475 
3476 
3478  OIIO_DASSERT(i >= 0 && i < elements);
3479 #if OIIO_SIMD_AVX
3480  return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3481 #else
3482  return m_val[i];
3483 #endif
3484 }
3485 
3487  OIIO_DASSERT(i >= 0 && i < elements);
3488  m_val[i] = value ? -1 : 0;
3489 }
3490 
3492  OIIO_DASSERT(i >= 0 && i < elements);
3493  return m_val[i];
3494 }
3495 
3496 
// Stream the lanes of a vbool8 in index order, separated by single spaces,
// with no trailing separator or newline. Each lane is written as whatever
// a[i] yields (presumably the integer lane value -1 or 0 -- confirm against
// vbool8::operator[]). Returns the stream so insertions can be chained.
// Note that `cout` here is the function parameter, i.e. whichever stream
// the caller passed in.
3497 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) {
3498  cout << a[0];
3499  for (int i = 1; i < a.elements; ++i)
3500  cout << ' ' << a[i];
3501  return cout;
3502 }
3503 
3504 
3506 #if OIIO_SIMD_AVX
3507  m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a)));
3508 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3509  m_4[0].load(a);
3510  m_4[1].load(a);
3511 #else
3512  int val = -int(a);
3513  SIMD_CONSTRUCT (val);
3514 #endif
3515 }
3516 
3517 
// Load eight bools into the eight lanes: lane 0 = a, lane 1 = b, ...,
// lane 7 = h. In the AVX and scalar branches each bool is converted with
// -int(x), so true is stored as -1 and false as 0. The SSE/NEON branch
// delegates to the two 4-wide halves: m_4[0] receives a..d and m_4[1]
// receives e..h.
3518 OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d,
3519  bool e, bool f, bool g, bool h) {
3520 #if OIIO_SIMD_AVX
3521  // N.B. -- _mm256_set_epi32 takes its arguments highest element first, so
3522  // we pass h..a to end up with a..h in ascending lane (memory) order.
3523  m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e),
3524  -int(d), -int(c), -int(b), -int(a)));
3525 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3526  m_4[0].load(a, b, c, d);
3527  m_4[1].load(e, f, g, h);
3528 #else
3529  m_val[0] = -int(a);
3530  m_val[1] = -int(b);
3531  m_val[2] = -int(c);
3532  m_val[3] = -int(d);
3533  m_val[4] = -int(e);
3534  m_val[5] = -int(f);
3535  m_val[6] = -int(g);
3536  m_val[7] = -int(h);
3537 #endif
3538 }
3539 
// Construct from eight bools, one per lane in ascending lane order
// (a -> lane 0, ..., h -> lane 7). Simply forwards to load().
3540 OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d,
3541  bool e, bool f, bool g, bool h) {
3542  load (a, b, c, d, e, f, g, h);
3543 }
3544 
// Construct from eight ints, one per lane in ascending lane order. Each
// int is first converted with bool(x), so any nonzero value counts as true
// and zero as false, and the result is forwarded to the bool load().
3545 OIIO_FORCEINLINE vbool8::vbool8 (int a, int b, int c, int d,
3546  int e, int f, int g, int h) {
3547  load (bool(a), bool(b), bool(c), bool(d),
3548  bool(e), bool(f), bool(g), bool(h));
3549 }
3550 
3552  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3553 }
3554 
3555 
3557  load(a);
3558  return *this;
3559 }
3560 
3562  m_simd = other.m_simd;
3563  return *this;
3564 }
3565 
3567 #if OIIO_SIMD_AVX
3568  return _mm256_movemask_ps(m_simd);
3569 #else
3570  return lo().bitmask() | (hi().bitmask() << 4);
3571 #endif
3572 }
3573 
3574 
3576 vbool8::from_bitmask (int bitmask) {
3577  // I think this is a fast conversion from int bitmask to vbool8
3578  return (vint8::Giota() & vint8(bitmask)) != vint8::Zero();
3579 }
3580 
3581 
3583 #if OIIO_SIMD_AVX
3584  m_simd = _mm256_setzero_ps();
3585 #else
3586  *this = false;
3587 #endif
3588 }
3589 
3591 #if OIIO_SIMD_AVX
3592  return _mm256_setzero_ps();
3593 #else
3594  return false;
3595 #endif
3596 }
3597 
3598 
3600 #if OIIO_SIMD_AVX
3601 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3602  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3603  __m256i anyval = _mm256_undefined_si256();
3604  return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3605 # else
3606  return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3607 # endif
3608 #else
3609  return true;
3610 #endif
3611 }
3612 
3613 
3615  SIMD_DO (values[i] = m_val[i] ? true : false);
3616 }
3617 
// Partial store: write the first n lanes to values[0..n-1] as bools. A
// nonzero lane value becomes true, zero becomes false. Entries at index n
// and beyond are not written. n is expected to be in [0, elements]; the
// only check is the OIIO_DASSERT below (presumably a debug-only assertion
// -- confirm in dassert.h).
3618 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const {
3619  OIIO_DASSERT (n >= 0 && n <= elements);
3620  for (int i = 0; i < n; ++i)
3621  values[i] = m_val[i] ? true : false;
3622 }
3623 
3624 
3626 #if OIIO_SIMD_AVX
3627  return _mm256_castps256_ps128 (simd());
3628 #else
3629  return m_4[0];
3630 #endif
3631 }
3632 
3634 #if OIIO_SIMD_AVX
3635  return _mm256_extractf128_ps (simd(), 1);
3636 #else
3637  return m_4[1];
3638 #endif
3639 }
3640 
3641 
3643 #if OIIO_SIMD_AVX
3644  __m256 r = _mm256_castps128_ps256 (lo);
3645  m_simd = _mm256_insertf128_ps (r, hi, 1);
3646  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
3647 #else
3648  m_4[0] = lo;
3649  m_4[1] = hi;
3650 #endif
3651 }
3652 
3653 
3655 #if OIIO_SIMD_AVX
3656  return _mm256_xor_ps (a.simd(), vbool8::True());
3657 #else
3658  SIMD_RETURN (vbool8, a[i] ^ (-1));
3659 #endif
3660 }
3661 
3663 #if OIIO_SIMD_AVX
3664  return _mm256_and_ps (a.simd(), b.simd());
3665 #else
3666  SIMD_RETURN (vbool8, a[i] & b[i]);
3667 #endif
3668 }
3669 
3671 #if OIIO_SIMD_AVX
3672  return _mm256_or_ps (a.simd(), b.simd());
3673 #else
3674  SIMD_RETURN (vbool8, a[i] | b[i]);
3675 #endif
3676 }
3677 
3679 #if OIIO_SIMD_AVX
3680  return _mm256_xor_ps (a.simd(), b.simd());
3681 #else
3682  SIMD_RETURN (vbool8, a[i] ^ b[i]);
3683 #endif
3684 }
3685 
3686 
3688  return a = a & b;
3689 }
3690 
3692  return a = a | b;
3693 }
3694 
3696  return a = a ^ b;
3697 }
3698 
3699 
3701 #if OIIO_SIMD_AVX
3702  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3703  return _mm256_xor_ps (a.simd(), vbool8::True());
3704 #else
3705  SIMD_RETURN (vbool8, ~a[i]);
3706 #endif
3707 }
3708 
3709 
3711 #if OIIO_SIMD_AVX >= 2
 // AVX2: compare the lanes as 32-bit integers, i.e. an exact bit-pattern match.
3712  return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3713 #elif OIIO_SIMD_AVX
 // NOTE(review): _CMP_EQ_UQ is an *unordered* floating-point compare, which
 // reports true whenever either operand is NaN. A "true" lane built from
 // -1 (all bits set) is a NaN when viewed as a float, so on this
 // AVX-without-AVX2 path a true lane compared against a false (0) lane
 // looks like it would report "equal". TODO confirm; a bitwise form such
 // as the complement of (a xor b) would avoid float semantics entirely.
3714  return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3715 #else
3716  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
3717 #endif
3718 }
3719 
3721 #if OIIO_SIMD_AVX
3722  return _mm256_xor_ps (a, b);
3723 #else
3724  SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0);
3725 #endif
3726 }
3727 
3728 
3729 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
3731 #if OIIO_SIMD_AVX >= 2
3732  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3733  return _mm256_permutevar8x32_ps (a.simd(), index.simd());
3734 #else
3735  return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3736 #endif
3737 }
3738 
// shuffle<i>(a) is shorthand for shuffle<i,i,i,i,i,i,i,i>(a): the same
// compile-time index i is used for all eight result lanes.
3739 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3740  return shuffle<i,i,i,i,i,i,i,i>(a);
3741 }
3742 
3743 
3744 template<int i>
3746 #if OIIO_SIMD_AVX && !_WIN32
3747  return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i); // SSE4.1 only
3748 #else
3749  return a[i];
3750 #endif
3751 }
3752 
3753 template<int i>
3755 #if OIIO_SIMD_AVX && !_WIN32
3756  int ival = -int(val);
3757  return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i));
3758 #else
3759  vbool8 tmp = a;
3760  tmp[i] = -int(val);
3761  return tmp;
3762 #endif
3763 }
3764 
3765 
3767 #if OIIO_SIMD_AVX
3768  return _mm256_testc_ps (v, vbool8(true)) != 0;
3769  // return _mm256_movemask_ps(v.simd()) == 0xff;
3770 #else
3771  SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i]));
3772 #endif
3773 }
3774 
3776 #if OIIO_SIMD_AVX
3777  return ! _mm256_testz_ps (v, v); // FIXME? Not in all immintrin.h !
3778  // return _mm256_movemask_ps(v) != 0;
3779 #else
3780  SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i]));
3781 #endif
3782 }
3783 
3784 
// Convenience predicates over the lanes of a vbool8, expressed through the
// reduce_and / reduce_or reductions:
//   all(v)  -- true when reduce_and(v) is true
//   any(v)  -- true when reduce_or(v) is true
//   none(v) -- true when reduce_or(v) is false (the negation of any)
3785 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; }
3786 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; }
3787 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; }
3788 
3789 
3790 
3791 //////////////////////////////////////////////////////////////////////
3792 // vbool16 implementation
3793 
3794 
3796  OIIO_DASSERT(i >= 0 && i < elements);
3797 #if OIIO_SIMD_AVX >= 512
3798  return (int(m_simd) >> i) & 1;
3799 #else
3800  return (m_bits >> i) & 1;
3801 #endif
3802 }
3803 
3805  OIIO_DASSERT(i >= 0 && i < elements);
3806  int bits = m_bits;
3807  bits &= (0xffff ^ (1<<i));
3808  bits |= (int(value)<<i);
3809  m_bits = bits;
3810 }
3811 
3812 
// Stream the lanes of a vbool16 in index order, separated by single spaces,
// with no trailing separator or newline. Each lane is written as whatever
// a[i] yields (presumably 0 or 1 for vbool16, unlike the -1/0 of the
// narrower types -- confirm against vbool16::operator[]). Returns the
// stream so insertions can be chained. Note that `cout` here is the
// function parameter, i.e. whichever stream the caller passed in.
3813 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) {
3814  cout << a[0];
3815  for (int i = 1; i < a.elements; ++i)
3816  cout << ' ' << a[i];
3817  return cout;
3818 }
3819 
3820 
3822  m_simd = a ? 0xffff : 0;
3823 }
3824 
3825 
3827  m_simd = simd_t(a);
3828 }
3829 
3830 
// Load sixteen bools by packing them into a bitmask: v0 goes to bit 0,
// v1 to bit 1, ..., v15 to bit 15 (a set bit means true). The combined
// integer is converted to simd_t and stored in m_simd; the same code is
// used regardless of which SIMD instruction set is enabled.
3831 OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3,
3832  bool v4, bool v5, bool v6, bool v7,
3833  bool v8, bool v9, bool v10, bool v11,
3834  bool v12, bool v13, bool v14, bool v15) {
3835  m_simd = simd_t((int(v0) << 0) |
3836  (int(v1) << 1) |
3837  (int(v2) << 2) |
3838  (int(v3) << 3) |
3839  (int(v4) << 4) |
3840  (int(v5) << 5) |
3841  (int(v6) << 6) |
3842  (int(v7) << 7) |
3843  (int(v8) << 8) |
3844  (int(v9) << 9) |
3845  (int(v10) << 10) |
3846  (int(v11) << 11) |
3847  (int(v12) << 12) |
3848  (int(v13) << 13) |
3849  (int(v14) << 14) |
3850  (int(v15) << 15));
3851 }
3852 
// Construct from sixteen bools, one per lane in ascending lane order
// (v0 -> lane 0, ..., v15 -> lane 15). Simply forwards to load().
3853 OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3,
3854  bool v4, bool v5, bool v6, bool v7,
3855  bool v8, bool v9, bool v10, bool v11,
3856  bool v12, bool v13, bool v14, bool v15) {
3857  load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3858 }
3859 
3861  int v4, int v5, int v6, int v7,
3862  int v8, int v9, int v10, int v11,
3863  int v12, int v13, int v14, int v15) {
3864  load (bool(v0), bool(v1), bool(v2), bool(v3),
3865  bool(v4), bool(v5), bool(v6), bool(v7),
3866  bool(v8), bool(v9), bool(v10), bool(v11),
3867  bool(v12), bool(v13), bool(v14), bool(v15));
3868 }
3869 
3871  load_bitmask (a.bitmask() | (b.bitmask() << 8));
3872 }
3873 
3875  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3876  a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3877 }
3878 
3879 
3881  load(a);
3882  return *this;
3883 }
3884 
3886  m_simd = other.m_simd;
3887  return *this;
3888 }
3889 
3890 
3892 #if OIIO_SIMD_AVX >= 512
3893  return int(m_simd);
3894 #else
3895  return int(m_bits);
3896 #endif
3897 }
3898 
3899 
3901  m_simd = simd_t(0);
3902 }
3903 
3905  return simd_t(0);
3906 }
3907 
3908 
3910  return simd_t(0xffff);
3911 }
3912 
3913 
3915  SIMD_DO (values[i] = m_bits & (1<<i));
3916 }
3917 
// Partial store: write the first n lanes to values[0..n-1] as bools.
// Lane i is bit i of m_bits; the masked value is converted to bool, so a
// set bit yields true and a clear bit yields false. Entries at index n and
// beyond are not written. n is expected to be in [0, elements]; the only
// check is the OIIO_DASSERT below (presumably a debug-only assertion --
// confirm in dassert.h).
3918 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const {
3919  OIIO_DASSERT (n >= 0 && n <= elements);
3920  for (int i = 0; i < n; ++i)
3921  values[i] = m_bits & (1<<i);
3922 }
3923 
3924 
3925 
3927 #if OIIO_SIMD_AVX >= 512
3928  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1));
3929 #else
3930  SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0);
3931 #endif
3932 }
3933 
3935 #if OIIO_SIMD_AVX >= 512
3936  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1));
3937 #else
3938  SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0);
3939 #endif
3940 }
3941 
3942 
3944 #if OIIO_SIMD_AVX >= 512
3945  return _mm512_knot (a.simd());
3946 #else
3947  return vbool16 (a.m_bits ^ 0xffff);
3948 #endif
3949 }
3950 
3952 #if OIIO_SIMD_AVX >= 512
3953  return _mm512_kand (a.simd(), b.simd());
3954 #else
3955  return vbool16 (a.m_bits & b.m_bits);
3956 #endif
3957 }
3958 
3960 #if OIIO_SIMD_AVX >= 512
3961  return _mm512_kor (a.simd(), b.simd());
3962 #else
3963  return vbool16 (a.m_bits | b.m_bits);
3964 #endif
3965 }
3966 
3968 #if OIIO_SIMD_AVX >= 512
3969  return _mm512_kxor (a.simd(), b.simd());
3970 #else
3971  return vbool16 (a.m_bits ^ b.m_bits);
3972 #endif
3973 }
3974 
3975 
3977  return a = a & b;
3978 }
3979 
3981  return a = a | b;
3982 }
3983 
3985  return a = a ^ b;
3986 }
3987 
3988 
3990  return a ^ vbool16::True();
3991 }
3992 
3993 
3995 #if OIIO_SIMD_AVX >= 512
3996  return _mm512_kxnor (a.simd(), b.simd());
3997 #else
3998