HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
simd.h
Go to the documentation of this file.
1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md
4 
5 /// @file simd.h
6 ///
7 /// @brief Classes for SIMD processing.
8 ///
9 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.):
10 /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
11 ///
12 /// Similar guide for ARM intrinsics:
13 /// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
14 ///
15 /// It helped me a lot to peruse the source of these packages:
16 /// Syrah: https://github.com/boulos/syrah
17 /// Embree: https://github.com/embree
18 /// Vectorial: https://github.com/scoopr/vectorial
19 ///
20 /// To find out which CPU features you have:
21 /// Linux: cat /proc/cpuinfo
22 /// OSX: sysctl machdep.cpu.features
23 ///
24 /// Additional web resources:
25 /// http://www.codersnotes.com/notes/maths-lib-2016/
26 
27 // clang-format off
28 
29 #pragma once
30 
31 #include <algorithm>
32 #include <cstring>
33 
34 #include <OpenImageIO/Imath.h>
35 #include <OpenImageIO/dassert.h>
36 #include <OpenImageIO/platform.h>
37 
38 
39 //////////////////////////////////////////////////////////////////////////
40 // Sort out which SIMD capabilities we have and set definitions
41 // appropriately. This is mostly for internal (within this file) use,
42 // but client applications using this header may find a few of the macros
43 // we define to be useful:
44 //
45 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD
46 // hardware is available, this will hold the width in number of
47 // float SIMD "lanes" of widest SIMD registers available. For
48 // example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are
49 // hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated,
50 // etc. Using SIMD classes wider than this should work (will be
51 // emulated with narrower SIMD or scalar operations), but is not
52 // expected to have high performance.
53 // OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
54 // specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
55 // higher (including AVX).
56 // OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
57 // specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
58 // OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
59 // OIIO_SIMD_MAX_SIZE_BYTES : holds the width in bytes of the widest SIMD
60 // available (generally will be OIIO_SIMD*4).
61 // OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
62 // OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
63 // OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
64 // OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
65 // OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
66 // OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined
67 
68 #if defined(_WIN32)
69 # include <intrin.h>
70 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
71 # include <x86intrin.h>
72 #elif defined(__GNUC__) && defined(__ARM_NEON__)
73 # include <arm_neon.h>
74 #endif
75 
76 // Disable SSE for 32 bit Windows platforms; it's unreliable and hard for us
77 // to test thoroughly. We presume that anybody needing high performance
78 // badly enough to want SIMD also is on a 64 bit CPU.
79 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
80 #define OIIO_NO_SSE 1
81 #endif
82 
83 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
84 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
85 # define OIIO_SIMD_SSE 4
86  /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
87  * instructions specific to 4.2, but they are all related to string
88  * comparisons and CRCs, which don't currently seem relevant to OIIO,
89  * so for simplicity, we sweep this difference under the rug.
90  */
91 # elif defined(__SSSE3__)
92 # define OIIO_SIMD_SSE 3
93  /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
94  * there are a few older architectures that are SSE3 but not SSSE3,
95  * and this simplification means that these particular old platforms
96  * will only get SSE2 goodness out of our code. So be it. Anybody who
97  * cares about performance is probably using a 64 bit machine that's
98  * SSE 4.x or AVX by now.
99  */
100 # else
101 # define OIIO_SIMD_SSE 2
102 # endif
103 # define OIIO_SIMD 4
104 # define OIIO_SIMD_MAX_SIZE_BYTES 16
105 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
106 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
107 #else
108 # define OIIO_SIMD_SSE 0
109 #endif
110 
111 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
112  // N.B. Any machine with AVX will also have SSE
113 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
114 # define OIIO_SIMD_AVX 2
115 # else
116 # define OIIO_SIMD_AVX 1
117 # endif
118 # undef OIIO_SIMD
119 # define OIIO_SIMD 8
120 # undef OIIO_SIMD_MAX_SIZE_BYTES
121 # define OIIO_SIMD_MAX_SIZE_BYTES 32
122 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
123 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
124 # if defined(__AVX512F__)
125 # undef OIIO_SIMD_AVX
126 # define OIIO_SIMD_AVX 512
127 # undef OIIO_SIMD_MAX_SIZE_BYTES
128 # define OIIO_SIMD_MAX_SIZE_BYTES 64
129 # undef OIIO_SIMD
130 # define OIIO_SIMD 16
131 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
132 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
133 # define OIIO_AVX512F_ENABLED 1
134 # endif
135 # if defined(__AVX512DQ__)
136 # define OIIO_AVX512DQ_ENABLED 1 /* Doubleword and quadword */
137 # else
138 # define OIIO_AVX512DQ_ENABLED 0
139 # endif
140 # if defined(__AVX512PF__)
141 # define OIIO_AVX512PF_ENABLED 1 /* Prefetch */
142 # else
143 # define OIIO_AVX512PF_ENABLED 0
144 # endif
145 # if defined(__AVX512ER__)
146 # define OIIO_AVX512ER_ENABLED 1 /* Exponential & reciprocal */
147 # else
148 # define OIIO_AVX512ER_ENABLED 0
149 # endif
150 # if defined(__AVX512CD__)
151 # define OIIO_AVX512CD_ENABLED 1 /* Conflict detection */
152 # else
153 # define OIIO_AVX512CD_ENABLED 0
154 # endif
155 # if defined(__AVX512BW__)
156 # define OIIO_AVX512BW_ENABLED 1 /* Byte and word */
157 # else
158 # define OIIO_AVX512BW_ENABLED 0
159 # endif
160 # if defined(__AVX512VL__)
161 # define OIIO_AVX512VL_ENABLED 1 /* Vector length extensions */
162 # else
163 # define OIIO_AVX512VL_ENABLED 0
164 # endif
165 #else
166 # define OIIO_SIMD_AVX 0
167 # define OIIO_AVX512VL_ENABLED 0
168 # define OIIO_AVX512DQ_ENABLED 0
169 # define OIIO_AVX512PF_ENABLED 0
170 # define OIIO_AVX512ER_ENABLED 0
171 # define OIIO_AVX512CD_ENABLED 0
172 # define OIIO_AVX512BW_ENABLED 0
173 #endif
174 
175 #if defined(__FMA__)
176 # define OIIO_FMA_ENABLED 1
177 #else
178 # define OIIO_FMA_ENABLED 0
179 #endif
180 #if defined(__AVX512IFMA__)
181 # define OIIO_AVX512IFMA_ENABLED 1
182 #else
183 # define OIIO_AVX512IFMA_ENABLED 0
184 #endif
185 
186 #if defined(__F16C__)
187 # define OIIO_F16C_ENABLED 1
188 #else
189 # define OIIO_F16C_ENABLED 0
190 #endif
191 
192 // ARM NEON: 4-wide SIMD, enabled when the compiler defines __ARM_NEON__
193 // and OIIO_NO_NEON is not defined. FIXME: still needs wider verification.
194 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
195 # define OIIO_SIMD 4
196 # define OIIO_SIMD_NEON 1
197 # define OIIO_SIMD_MAX_SIZE_BYTES 16
198 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
199 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
200 #else
201 # define OIIO_SIMD_NEON 0
202 #endif
203 
204 #ifndef OIIO_SIMD
205  // No SIMD available
206 # define OIIO_SIMD 0
207 # define OIIO_SIMD4_ALIGN
208 # define OIIO_SIMD_MAX_SIZE_BYTES 16
209 #endif
210 
211 #ifndef OIIO_SIMD8_ALIGN
212 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
213 #endif
214 #ifndef OIIO_SIMD16_ALIGN
215 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
216 #endif
217 
218 
219 // General features that client apps may want to test for, for conditional
220 // compilation. Will add to this over time as needed. Note that just
221 // because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
222 // the vfloat8 class (and friends) are in this version of simd.h, but that's
223 // different from OIIO_SIMD >= 8, which means it's supported in hardware.
224 #define OIIO_SIMD_HAS_MATRIX4 1 /* matrix44 defined */
225 #define OIIO_SIMD_HAS_FLOAT8 1 /* DEPRECATED(1.8) */
226 #define OIIO_SIMD_HAS_SIMD8 1 /* vfloat8, vint8, vbool8 defined */
227 #define OIIO_SIMD_HAS_SIMD16 1 /* vfloat16, vint16, vbool16 defined */
228 
229 
230 // Embarrassing hack: Xlib.h #define's True and False!
231 #ifdef True
232 # undef True
233 #endif
234 #ifdef False
235 # undef False
236 #endif
237 
238 
239 
241 
242 namespace simd {
243 
244 //////////////////////////////////////////////////////////////////////////
245 // Forward declarations of our main SIMD classes
246 
247 class vbool4;
248 class vint4;
249 class vfloat4;
250 class vfloat3;
251 class matrix44;
252 class vbool8;
253 class vint8;
254 class vfloat8;
255 class vbool16;
256 class vint16;
257 class vfloat16;
258 
259 // Deprecated names -- remove these in 1.9
260 typedef vbool4 mask4; // old name
261 typedef vbool4 bool4;
262 typedef vbool8 bool8;
263 typedef vint4 int4;
264 typedef vint8 int8;
265 typedef vfloat3 float3;
266 typedef vfloat4 float4;
267 typedef vfloat8 float8;
268 
269 
270 
271 //////////////////////////////////////////////////////////////////////////
272 // Template magic to determine the raw SIMD types involved, and other
273 // things helpful for metaprogramming.
274 
// Generic fallbacks, used for any <type, width> pair that has no
// specialization: `type` is a plain struct wrapping an N-element array
// (`T val[N]` for value vectors, `int val[N]` for boolean vectors).
// Specializations of these templates replace `type` with a hardware
// vector type.
275 template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
276 template <int N> struct simd_bool_t { struct type { int val[N]; }; };
277 
278 #if OIIO_SIMD_SSE
279 template<> struct simd_raw_t<int,4> { typedef __m128i type; };
280 template<> struct simd_raw_t<float,4> { typedef __m128 type; };
281 template<> struct simd_bool_t<4> { typedef __m128 type; };
282 #endif
283 
284 #if OIIO_SIMD_AVX
285 template<> struct simd_raw_t<int,8> { typedef __m256i type; };
286 template<> struct simd_raw_t<float,8> { typedef __m256 type; };
287 template<> struct simd_bool_t<8> { typedef __m256 type; };
288 #endif
289 
290 #if OIIO_SIMD_AVX >= 512
291 template<> struct simd_raw_t<int,16> { typedef __m512i type; };
292 template<> struct simd_raw_t<float,16> { typedef __m512 type; };
293 template<> struct simd_bool_t<16> { typedef __mmask16 type; };
294 #else
295 // Note: change in strategy for 16-wide SIMD: instead of int[16] for
296 // vbool16, it's just a plain old bitmask, and __mmask16 for actual HW.
297 template<> struct simd_bool_t<16> { typedef uint16_t type; };
298 #endif
299 
300 #if OIIO_SIMD_NEON
301 template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
302 template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
303 template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
304 #endif
305 
306 
307 /// Template to retrieve the vector type from the scalar. For example,
308 /// simd::VecType<int,4>::type will be vint4.
309 template<typename T,int elements> struct VecType {};
310 template<> struct VecType<int,1> { typedef int type; };
311 template<> struct VecType<float,1> { typedef float type; };
312 template<> struct VecType<int,4> { typedef vint4 type; };
313 template<> struct VecType<float,4> { typedef vfloat4 type; };
314 template<> struct VecType<float,3> { typedef vfloat3 type; };
315 template<> struct VecType<bool,4> { typedef vbool4 type; };
316 template<> struct VecType<int,8> { typedef vint8 type; };
317 template<> struct VecType<float,8> { typedef vfloat8 type; };
318 template<> struct VecType<bool,8> { typedef vbool8 type; };
319 template<> struct VecType<int,16> { typedef vint16 type; };
320 template<> struct VecType<float,16> { typedef vfloat16 type; };
321 template<> struct VecType<bool,16> { typedef vbool16 type; };
322 
323 /// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
324 /// anything but our SIMD types.
325 template<typename T> struct SimdSize { static const int size = 1; };
326 template<> struct SimdSize<vint4> { static const int size = 4; };
327 template<> struct SimdSize<vfloat4> { static const int size = 4; };
328 template<> struct SimdSize<vfloat3> { static const int size = 4; };
329 template<> struct SimdSize<vbool4> { static const int size = 4; };
330 template<> struct SimdSize<vint8> { static const int size = 8; };
331 template<> struct SimdSize<vfloat8> { static const int size = 8; };
332 template<> struct SimdSize<vbool8> { static const int size = 8; };
333 template<> struct SimdSize<vint16> { static const int size = 16; };
334 template<> struct SimdSize<vfloat16> { static const int size = 16; };
335 template<> struct SimdSize<vbool16> { static const int size = 16; };
336 
337 /// Template to retrieve the number of elements of a SIMD type. Rigged
338 /// to be 1 for anything but our SIMD types.
339 template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
340 template<> struct SimdElements<vfloat3> { static const int size = 3; };
341 
342 /// Template giving a printable name for each type
343 template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
344 template<> struct SimdTypeName<vfloat4> { static const char *name() { return "vfloat4"; } };
345 template<> struct SimdTypeName<vint4> { static const char *name() { return "vint4"; } };
346 template<> struct SimdTypeName<vbool4> { static const char *name() { return "vbool4"; } };
347 template<> struct SimdTypeName<vfloat8> { static const char *name() { return "vfloat8"; } };
348 template<> struct SimdTypeName<vint8> { static const char *name() { return "vint8"; } };
349 template<> struct SimdTypeName<vbool8> { static const char *name() { return "vbool8"; } };
350 template<> struct SimdTypeName<vfloat16> { static const char *name() { return "vfloat16"; } };
351 template<> struct SimdTypeName<vint16> { static const char *name() { return "vint16"; } };
352 template<> struct SimdTypeName<vbool16> { static const char *name() { return "vbool16"; } };
353 
354 
355 //////////////////////////////////////////////////////////////////////////
356 // Macros helpful for making static constants in code.
357 
358 # define OIIO_SIMD_FLOAT4_CONST(name,val) \
359  static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
360 # define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
361  static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
362 # define OIIO_SIMD_INT4_CONST(name,val) \
363  static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
364 # define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
365  static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
366 # define OIIO_SIMD_UINT4_CONST(name,val) \
367  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
368 # define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
369  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
370 
371 # define OIIO_SIMD_FLOAT8_CONST(name,val) \
372  static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
373  (val), (val), (val), (val) }
374 # define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
375  static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
376  (v4), (v5), (v6), (v7) }
377 # define OIIO_SIMD_INT8_CONST(name,val) \
378  static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
379  (val), (val), (val), (val) }
380 # define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
381  static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
382  (v4), (v5), (v6), (v7) }
383 # define OIIO_SIMD_UINT8_CONST(name,val) \
384  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
385  (val), (val), (val), (val) }
386 # define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
387  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
388  (v4), (v5), (v6), (v7) }
389 
390 # define OIIO_SIMD_VFLOAT16_CONST(name,val) \
391  static const OIIO_SIMD16_ALIGN float name[16] = { \
392  (val), (val), (val), (val), (val), (val), (val), (val), \
393  (val), (val), (val), (val), (val), (val), (val), (val) }
394 # define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
395  static const OIIO_SIMD16_ALIGN float name[16] = { \
396  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
397  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
398 # define OIIO_SIMD_INT16_CONST(name,val) \
399  static const OIIO_SIMD16_ALIGN int name[16] = { \
400  (val), (val), (val), (val), (val), (val), (val), (val), \
401  (val), (val), (val), (val), (val), (val), (val), (val) }
402 # define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
403  static const OIIO_SIMD16_ALIGN int name[16] = { \
404  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
405  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
406 # define OIIO_SIMD_UINT16_CONST(name,val) \
407  static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
408  (val), (val), (val), (val), (val), (val), (val), (val), \
409  (val), (val), (val), (val), (val), (val), (val), (val) }
// Declare a static const table `name` of sixteen uint32_t values with
// OIIO_SIMD16_ALIGN alignment, initialized lane by lane from the explicit
// arguments v0..v15 (v0 is element 0). Each argument is parenthesized so
// that arbitrary expressions may be passed.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
414 
415 
416 //////////////////////////////////////////////////////////////////////////
417 // Some macros just for use in this file (#undef-ed at the end) making
418 // it more succinct to express per-element operations.
419 
// Per-element loop helpers. Every macro expands to a plain `for` loop
// whose index variable is named `i`, so the expression or statement passed
// as an argument may refer to `i`. The names `elements`, `paddedelements`,
// `m_val` and `v` are not parameters: they are picked up from the scope in
// which the macro is expanded.
// NOTE(review): none of these is wrapped in do { } while (0), and several
// expand to more than one statement, so use them only as a full statement
// inside a braced body.

// Execute statement/expression x once for each i in [0, elements).
420 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
// Assign (x) to m_val[i] for each i in [0, elements).
421 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
// Like SIMD_CONSTRUCT, then zero m_val[i] for i in [elements, paddedelements).
422 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
423  for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
// Declare `T r`, set r[i] = (x) for each i in [0, r.elements), and return r.
424 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
// Declare `T r = init`, run `op` for each i in [0, v.elements), and return r.
// Requires a variable named `v` in scope; `op` is expected to update r.
425 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
426 
427 
428 
429 //////////////////////////////////////////////////////////////////////////
430 //////////////////////////////////////////////////////////////////////////
431 // The public declarations of the main SIMD classes follow: vboolN, vintN,
432 // vfloatN, matrix44.
433 //
434 // These class declarations are intended to be brief and self-documenting,
435 // and give all the information that users or client applications need to
436 // know to use these classes.
437 //
438 // No implementations are given inline except for the briefest, completely
439 // generic methods that don't have any architecture-specific overloads.
440 // After the class definitions, there will be an immense pile of full
441 // implementation definitions, which casual users are not expected to
442 // understand.
443 //////////////////////////////////////////////////////////////////////////
444 //////////////////////////////////////////////////////////////////////////
445 
446 
447 /// vbool4: A 4-vector whose elements act mostly like bools, accelerated by
448 /// SIMD instructions when available. This is what is naturally produced by
449 /// SIMD comparison operators on the vfloat4 and vint4 types.
450 class vbool4 {
451 public:
452  static const char* type_name() { return "vbool4"; }
453  typedef bool value_t; ///< Underlying equivalent scalar value type
454  enum { elements = 4 }; ///< Number of scalar elements
455  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
456  enum { bits = elements*32 }; ///< Total number of bits
457  typedef simd_bool_t<4>::type simd_t; ///< the native SIMD type used
458 
459  /// Default constructor (contents undefined)
460  vbool4 () { }
461 
462  /// Construct from a single value (store it in all slots)
463  vbool4 (bool a) { load(a); }
464 
465  explicit vbool4 (const bool *a);
466 
467  /// Construct from 4 bool values
468  vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
469 
470  /// Copy construct from another vbool4
471  vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
472 
473  /// Construct from 4 int values
474  vbool4 (int a, int b, int c, int d) {
475  load (bool(a), bool(b), bool(c), bool(d));
476  }
477 
478  /// Construct from a SIMD int (is each element nonzero?)
479  vbool4 (const vint4 &i);
480 
481  /// Construct from the underlying SIMD type
482  vbool4 (const simd_t& m) : m_simd(m) { }
483 
484  /// Return the raw SIMD type
485  operator simd_t () const { return m_simd; }
486  simd_t simd () const { return m_simd; }
487  simd_t& simd () { return m_simd; }
488 
489  /// Extract the bitmask
490  int bitmask () const;
491 
492  /// Convert from integer bitmask to a true vbool4
493  static vbool4 from_bitmask (int bitmask);
494 
495  /// Set all components to false
496  void clear ();
497 
498  /// Return a vbool4 that is 'false' for all values
499  static const vbool4 False ();
500 
501  /// Return a vbool4 that is 'true' for all values
502  static const vbool4 True ();
503 
504  /// Assign one value to all components
505  const vbool4 & operator= (bool a) { load(a); return *this; }
506 
507  /// Assignment of another vbool4
508  const vbool4 & operator= (const vbool4 & other);
509 
510  /// Component access (get)
511  int operator[] (int i) const;
512 
513  /// Component access (set).
514  void setcomp (int i, bool value);
515 
516  /// Component access (set).
517  /// NOTE: avoid this unsafe construct. It will go away some day.
518  int& operator[] (int i);
519 
520  /// Helper: load a single value into all components.
521  void load (bool a);
522 
523  /// Helper: load separate values into each component.
524  void load (bool a, bool b, bool c, bool d);
525 
526  /// Helper: store the values into memory as bools.
527  void store (bool *values) const;
528 
529  /// Store the first n values into memory.
530  void store (bool *values, int n) const;
531 
532  /// Logical/bitwise operators, component-by-component
533  friend vbool4 operator! (const vbool4& a);
534  friend vbool4 operator& (const vbool4& a, const vbool4& b);
535  friend vbool4 operator| (const vbool4& a, const vbool4& b);
536  friend vbool4 operator^ (const vbool4& a, const vbool4& b);
537  friend vbool4 operator~ (const vbool4& a);
538  friend const vbool4& operator&= (vbool4& a, const vbool4& b);
539  friend const vbool4& operator|= (vbool4& a, const vbool4& b);
540  friend const vbool4& operator^= (vbool4& a, const vbool4& b);
541 
542  /// Comparison operators, component by component
543  friend vbool4 operator== (const vbool4& a, const vbool4& b);
544  friend vbool4 operator!= (const vbool4& a, const vbool4& b);
545 
546  /// Stream output
547  friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);
548 
549 private:
550  // The actual data representation
551  union {
554  };
555 };
556 
557 
558 
559 /// Helper: shuffle/swizzle with constant (templated) indices.
560 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
561 template<int i0, int i1, int i2, int i3> vbool4 shuffle (const vbool4& a);
562 
563 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
564 template<int i> vbool4 shuffle (const vbool4& a);
565 
566 /// Helper: as rapid as possible extraction of one component, when the
567 /// index is fixed.
568 template<int i> bool extract (const vbool4& a);
569 
570 /// Helper: substitute val for a[i]
571 template<int i> vbool4 insert (const vbool4& a, bool val);
572 
573 /// Logical reduction across all components.
574 bool reduce_and (const vbool4& v);
575 bool reduce_or (const vbool4& v);
576 
577 // Are all/any/no components true?
578 bool all (const vbool4& v);
579 bool any (const vbool4& v);
580 bool none (const vbool4& v);
581 
582 // It's handy to have this defined for regular bool as well
583 inline bool all (bool v) { return v; }
584 
585 
586 
587 /// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
588 /// SIMD instructions when available. This is what is naturally produced by
589 /// SIMD comparison operators on the vfloat8 and vint8 types.
590 class vbool8 {
591 public:
592  static const char* type_name() { return "vbool8"; }
593  typedef bool value_t; ///< Underlying equivalent scalar value type
594  enum { elements = 8 }; ///< Number of scalar elements
595  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
596  enum { bits = elements*32 }; ///< Total number of bits
597  typedef simd_bool_t<8>::type simd_t; ///< the native SIMD type used
598 
599  /// Default constructor (contents undefined)
600  vbool8 () { }
601 
602  /// Construct from a single value (store it in all slots)
603  vbool8 (bool a) { load (a); }
604 
605  explicit vbool8 (const bool *values);
606 
607  /// Construct from 8 bool values
608  vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
609 
610  /// Copy construct from another vbool8
611  vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
612 
613  /// Construct from 8 int values
614  vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);
615 
616  /// Construct from a SIMD int (is each element nonzero?)
617  vbool8 (const vint8 &i);
618 
619  /// Construct from two vbool4's
620  vbool8 (const vbool4 &lo, const vbool4 &hi);
621 
622  /// Construct from the underlying SIMD type
623  vbool8 (const simd_t& m) : m_simd(m) { }
624 
625  /// Return the raw SIMD type
626  operator simd_t () const { return m_simd; }
627  simd_t simd () const { return m_simd; }
628  simd_t& simd () { return m_simd; }
629 
630  /// Extract the bitmask
631  int bitmask () const;
632 
633  /// Convert from integer bitmask to a true vbool8
634  static vbool8 from_bitmask (int bitmask);
635 
636  /// Set all components to false
637  void clear ();
638 
639  /// Return a vbool8 that is 'false' for all values
640  static const vbool8 False ();
641 
642  /// Return a vbool8 that is 'true' for all values
643  static const vbool8 True ();
644 
645  /// Assign one value to all components
646  const vbool8 & operator= (bool a);
647 
648  /// Assignment of another vbool8
649  const vbool8 & operator= (const vbool8 & other);
650 
651  /// Component access (get)
652  int operator[] (int i) const;
653 
654  /// Component access (set).
655  void setcomp (int i, bool value);
656 
657  /// Component access (set).
658  /// NOTE: avoid this unsafe construct. It will go away some day.
659  int& operator[] (int i);
660 
661  /// Extract the lower half as a vbool4
662  vbool4 lo () const;
663 
664  /// Extract the upper half as a vbool4
665  vbool4 hi () const;
666 
667  /// Helper: load a single value into all components.
668  void load (bool a);
669 
670  /// Helper: load separate values into each component.
671  void load (bool a, bool b, bool c, bool d,
672  bool e, bool f, bool g, bool h);
673 
674  /// Helper: store the values into memory as bools.
675  void store (bool *values) const;
676 
677  /// Store the first n values into memory.
678  void store (bool *values, int n) const;
679 
680  /// Logical/bitwise operators, component-by-component
681  friend vbool8 operator! (const vbool8& a);
682  friend vbool8 operator& (const vbool8& a, const vbool8& b);
683  friend vbool8 operator| (const vbool8& a, const vbool8& b);
684  friend vbool8 operator^ (const vbool8& a, const vbool8& b);
685  friend vbool8 operator~ (const vbool8& a);
686  friend const vbool8& operator&= (vbool8& a, const vbool8& b);
687  friend const vbool8& operator|= (vbool8& a, const vbool8& b);
688  friend const vbool8& operator^= (vbool8& a, const vbool8& b);
689 
690  /// Comparison operators, component by component
691  friend vbool8 operator== (const vbool8& a, const vbool8& b);
692  friend vbool8 operator!= (const vbool8& a, const vbool8& b);
693 
694  /// Stream output
695  friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);
696 
697 private:
698  // The actual data representation
699  union {
703  };
704 };
705 
706 
707 
708 /// Helper: shuffle/swizzle with constant (templated) indices.
709 /// Example: shuffle<1,1,2,2,5,5,6,6>(vbool8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
710 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
711 vbool8 shuffle (const vbool8& a);
712 
713 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
714 template<int i> vbool8 shuffle (const vbool8& a);
715 
716 /// Helper: as rapid as possible extraction of one component, when the
717 /// index is fixed.
718 template<int i> bool extract (const vbool8& a);
719 
720 /// Helper: substitute val for a[i]
721 template<int i> vbool8 insert (const vbool8& a, bool val);
722 
723 /// Logical reduction across all components.
724 bool reduce_and (const vbool8& v);
725 bool reduce_or (const vbool8& v);
726 
727 // Are all/any/no components true?
728 bool all (const vbool8& v);
729 bool any (const vbool8& v);
730 bool none (const vbool8& v);
731 
732 
733 
734 
735 /// vbool16: A 16-vector whose elements act mostly like bools, accelerated
736 /// by SIMD instructions when available. This is what is naturally produced
737 /// by SIMD comparison operators on the vfloat16 and vint16 types.
738 class vbool16 {
739 public:
740  static const char* type_name() { return "vbool16"; }
741  typedef bool value_t; ///< Underlying equivalent scalar value type
742  enum { elements = 16 }; ///< Number of scalar elements
743  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
744  enum { bits = 16 }; ///< Total number of bits
745  typedef simd_bool_t<16>::type simd_t; ///< the native SIMD type used
746 
747  /// Default constructor (contents undefined)
748  vbool16 () { }
749 
750  /// Construct from a single value (store it in all slots)
751  vbool16 (bool a) { load (a); }
752 
753  explicit vbool16 (int bitmask) { load_bitmask (bitmask); }
754 
755  explicit vbool16 (const bool *values);
756 
757  /// Construct from 16 bool values
758  vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
759  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
760 
761  /// Copy construct from another vbool16
762  vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
763 
764  /// Construct from 16 int values
765  vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
766  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
767 
768  /// Construct from a SIMD int (is each element nonzero?)
769  vbool16 (const vint16 &i);
770 
771  /// Construct from two vbool8's
772  vbool16 (const vbool8 &lo, const vbool8 &hi);
773 
774  /// Construct from four vbool4's
775  vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);
776 
777  /// Construct from the underlying SIMD type
778  vbool16 (const simd_t& m) : m_simd(m) { }
779 
780  /// Return the raw SIMD type
781  operator simd_t () const { return m_simd; }
782  simd_t simd () const { return m_simd; }
783  simd_t& simd () { return m_simd; }
784 
785  int bitmask () const;
786 
787  /// Convert from integer bitmask to a true vbool16
788  static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }
789 
790  /// Set all components to false
791  void clear ();
792 
793  /// Return a vbool16 that is 'false' for all values
794  static const vbool16 False ();
795 
796  /// Return a vbool16 that is 'true' for all values
797  static const vbool16 True ();
798 
799  /// Assign one value to all components
800  const vbool16 & operator= (bool a);
801 
802  /// Assignment of another vbool16
803  const vbool16 & operator= (const vbool16 & other);
804 
805  /// Component access (get)
806  int operator[] (int i) const;
807 
808  /// Component access (set).
809  void setcomp (int i, bool value);
810 
811  /// Extract the lower precision vbool8
812  vbool8 lo () const;
813 
814  /// Extract the higher precision vbool8
815  vbool8 hi () const;
816 
817  /// Helper: load a single value into all components.
818  void load (bool a);
819 
820  /// Helper: load separate values into each component.
821  void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
822  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
823 
824  /// Helper: load all components from a bitmask in an int.
825  void load_bitmask (int a);
826 
827  /// Helper: store the values into memory as bools.
828  void store (bool *values) const;
829 
830  /// Store the first n values into memory.
831  void store (bool *values, int n) const;
832 
833  /// Logical/bitwise operators, component-by-component
834  friend vbool4 operator! (const vbool4& a);
835  friend vbool16 operator! (const vbool16& a);
836  friend vbool16 operator& (const vbool16& a, const vbool16& b);
837  friend vbool16 operator| (const vbool16& a, const vbool16& b);
838  friend vbool16 operator^ (const vbool16& a, const vbool16& b);
839  friend vbool16 operator~ (const vbool16& a);
840  friend const vbool16& operator&= (vbool16& a, const vbool16& b);
841  friend const vbool16& operator|= (vbool16& a, const vbool16& b);
842  friend const vbool16& operator^= (vbool16& a, const vbool16& b);
843 
844  /// Comparison operators, component by component
845  friend vbool16 operator== (const vbool16& a, const vbool16& b);
846  friend vbool16 operator!= (const vbool16& a, const vbool16& b);
847 
848  /// Stream output
849  friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);
850 
851 private:
852  // The actual data representation
853  union {
855  uint16_t m_bits;
856  };
857 };
858 
859 
860 
861 /// Helper: as rapid as possible extraction of one component, when the
862 /// index is fixed.
863 template<int i> bool extract (const vbool16& a);
864 
865 /// Helper: substitute val for a[i]
866 template<int i> vbool16 insert (const vbool16& a, bool val);
867 
868 /// Logical reduction across all components.
869 bool reduce_and (const vbool16& v);
870 bool reduce_or (const vbool16& v);
871 
872 /// Are all/any/none of the components true?
873 bool all (const vbool16& v);
874 bool any (const vbool16& v);
875 bool none (const vbool16& v);
876 
877 
878 
879 
880 
881 /// Integer 4-vector, accelerated by SIMD instructions when available.
882 class vint4 {
883 public:
884  static const char* type_name() { return "vint4"; }
885  typedef int value_t; ///< Underlying equivalent scalar value type
886  enum { elements = 4 }; ///< Number of scalar elements
887  enum { paddedelements =4 }; ///< Number of scalar elements for full pad
888  enum { bits = 128 }; ///< Total number of bits
889  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
890  typedef vbool4 vbool_t; ///< bool type of the same length
891  typedef vfloat4 vfloat_t; ///< float type of the same length
892  typedef vint4 vint_t; ///< int type of the same length
893  typedef vbool4 bool_t; // old name (deprecated 1.8)
894  typedef vfloat4 float_t; // old name (deprecated 1.8)
895 
896  /// Default constructor (contents undefined)
897  vint4 () { }
898 
899  /// Construct from a single value (store it in all slots)
900  vint4 (int a);
901 
902  /// Construct from 2 values -- (a,a,b,b)
903  vint4 (int a, int b);
904 
905  /// Construct from 4 values
906  vint4 (int a, int b, int c, int d);
907 
908  /// Construct from a pointer to values
909  vint4 (const int *vals);
910 
911  /// Construct from a pointer to unsigned short values
912  explicit vint4 (const unsigned short *vals);
913 
914  /// Construct from a pointer to signed short values
915  explicit vint4 (const short *vals);
916 
917  /// Construct from a pointer to unsigned char values (0 - 255)
918  explicit vint4 (const unsigned char *vals);
919 
920  /// Construct from a pointer to signed char values (-128 - 127)
921  explicit vint4 (const char *vals);
922 
923  /// Copy construct from another vint4
924  vint4 (const vint4 & other) { m_simd = other.m_simd; }
925 
926  /// Convert a vfloat4 to a vint4. Equivalent to i = (int)f;
927  explicit vint4 (const vfloat4& f); // implementation below
928 
929  /// Construct from the underlying SIMD type
930  vint4 (const simd_t& m) : m_simd(m) { }
931 
932  /// Return the raw SIMD type
933  operator simd_t () const { return m_simd; }
934  simd_t simd () const { return m_simd; }
935  simd_t& simd () { return m_simd; }
936 
937  /// Return a pointer to the underlying scalar type
938  const value_t* data () const { return (const value_t*)this; }
939  value_t* data () { return (value_t*)this; }
940 
941  /// Set all components to 0
942  void clear () ;
943 
944  /// Return a vint4 with all components set to 0
945  static const vint4 Zero ();
946 
947  /// Return a vint4 with all components set to 1
948  static const vint4 One ();
949 
950  /// Return a vint4 with all components set to -1 (aka 0xffffffff)
951  static const vint4 NegOne ();
952 
953  /// Return a vint4 with incremented components (e.g., 0,1,2,3).
954  /// Optional arguments can give a non-zero starting point and step size.
955  static const vint4 Iota (int start=0, int step=1);
956 
957  /// Return a vint4 with "geometric" iota: (1, 2, 4, 8).
958  static const vint4 Giota ();
959 
960  /// Assign one value to all components.
961  const vint4 & operator= (int a);
962 
963  /// Assignment from another vint4
964  const vint4 & operator= (const vint4& other) ;
965 
966  /// Component access (get)
967  int operator[] (int i) const;
968 
969  /// Component access (set)
970  int& operator[] (int i);
971 
972  /// Component access (set).
973  void setcomp (int i, int value);
974 
975  value_t x () const;
976  value_t y () const;
977  value_t z () const;
978  value_t w () const;
979  void set_x (value_t val);
980  void set_y (value_t val);
981  void set_z (value_t val);
982  void set_w (value_t val);
983 
984  /// Helper: load a single int into all components
985  void load (int a);
986 
987  /// Helper: load separate values into each component.
988  void load (int a, int b, int c, int d);
989 
990  /// Load from an array of 4 values
991  void load (const int *values);
992 
993  void load (const int *values, int n) ;
994 
995  /// Load from an array of 4 unsigned short values, convert to vint4
996  void load (const unsigned short *values) ;
997 
998  /// Load from an array of 4 signed short values, convert to vint4
999  void load (const short *values);
1000 
1001  /// Load from an array of 4 unsigned char values, convert to vint4
1002  void load (const unsigned char *values);
1003 
1004  /// Load from an array of 4 signed char values, convert to vint4
1005  void load (const char *values);
1006 
1007  /// Store the values into memory
1008  void store (int *values) const;
1009 
1010  /// Store the first n values into memory
1011  void store (int *values, int n) const;
1012 
1013  /// Store the least significant 16 bits of each element into adjacent
1014  /// unsigned shorts.
1015  void store (unsigned short *values) const;
1016 
1017  /// Store the least significant 8 bits of each element into adjacent
1018  /// unsigned chars.
1019  void store (unsigned char *values) const;
1020 
1021  /// Masked load -- read from values[] where mask is 1, load zero where
1022  /// mask is 0.
1023  void load_mask (int mask, const value_t *values);
1024  void load_mask (const vbool_t& mask, const value_t *values);
1025 
1026  /// Masked store -- write to values[] where mask is enabled, don't
1027  /// touch values[] where it's not.
1028  void store_mask (int mask, value_t *values) const;
1029  void store_mask (const vbool_t& mask, value_t *values) const;
1030 
1031  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1032  template<int scale=4>
1033  void gather (const value_t *baseptr, const vint_t& vindex);
1034  /// Gather elements defined by the mask, leave others unchanged.
1035  template<int scale=4>
1036  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1037  template<int scale=4>
1038  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1039 
1040  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1041  template<int scale=4>
1042  void scatter (value_t *baseptr, const vint_t& vindex) const;
1043  /// Scatter elements defined by the mask
1044  template<int scale=4>
1045  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1046  template<int scale=4>
1047  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1048 
1049  // Arithmetic operators (component-by-component)
1050  friend vint4 operator+ (const vint4& a, const vint4& b);
1051  friend vint4 operator- (const vint4& a);
1052  friend vint4 operator- (const vint4& a, const vint4& b);
1053  friend vint4 operator* (const vint4& a, const vint4& b);
1054  friend vint4 operator/ (const vint4& a, const vint4& b);
1055  friend vint4 operator% (const vint4& a, const vint4& b);
1056  friend const vint4 & operator+= (vint4& a, const vint4& b);
1057  friend const vint4 & operator-= (vint4& a, const vint4& b);
1058  friend const vint4 & operator*= (vint4& a, const vint4& b);
1059  friend const vint4 & operator/= (vint4& a, const vint4& b);
1060  friend const vint4 & operator%= (vint4& a, const vint4& b);
1061  // Bitwise operators (component-by-component)
1062  friend vint4 operator& (const vint4& a, const vint4& b);
1063  friend vint4 operator| (const vint4& a, const vint4& b);
1064  friend vint4 operator^ (const vint4& a, const vint4& b);
1065  friend const vint4& operator&= (vint4& a, const vint4& b);
1066  friend const vint4& operator|= (vint4& a, const vint4& b);
1067  friend const vint4& operator^= (vint4& a, const vint4& b);
1068  friend vint4 operator~ (const vint4& a);
1069  friend vint4 operator<< (const vint4& a, unsigned int bits);
1070  friend vint4 operator>> (const vint4& a, unsigned int bits);
1071  friend const vint4& operator<<= (vint4& a, unsigned int bits);
1072  friend const vint4& operator>>= (vint4& a, unsigned int bits);
1073  // Comparison operators (component-by-component)
1074  friend vbool4 operator== (const vint4& a, const vint4& b);
1075  friend vbool4 operator!= (const vint4& a, const vint4& b);
1076  friend vbool4 operator< (const vint4& a, const vint4& b);
1077  friend vbool4 operator> (const vint4& a, const vint4& b);
1078  friend vbool4 operator>= (const vint4& a, const vint4& b);
1079  friend vbool4 operator<= (const vint4& a, const vint4& b);
1080 
1081  /// Stream output
1082  friend std::ostream& operator<< (std::ostream& cout, const vint4 & a);
1083 
1084 private:
1085  // The actual data representation
1086  union {
1089  };
1090 };
1091 
1092 
1093 
1094 // Shift right logical -- unsigned shift. This differs from operator>>
1095 // in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1096 // srl((1<<31),1) == 1<<30.
1097 vint4 srl (const vint4& val, const unsigned int bits);
1098 
1099 /// Helper: shuffle/swizzle with constant (templated) indices.
1100 /// Example: shuffle<1,1,2,2>(vint4(a,b,c,d)) returns (b,b,c,c)
1101 template<int i0, int i1, int i2, int i3> vint4 shuffle (const vint4& a);
1102 
1103 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1104 template<int i> vint4 shuffle (const vint4& a);
1105 
1106 /// Helper: as rapid as possible extraction of one component, when the
1107 /// index is fixed.
1108 template<int i> int extract (const vint4& v);
1109 
1110 /// The sum of all components, returned in all components.
1111 vint4 vreduce_add (const vint4& v);
1112 
1113 // Reduction across all components
1114 int reduce_add (const vint4& v);
1115 int reduce_and (const vint4& v);
1116 int reduce_or (const vint4& v);
1117 
1118 /// Use a bool mask to select between components of a (if mask[i] is false)
1119 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1120 vint4 blend (const vint4& a, const vint4& b, const vbool4& mask);
1121 
1122 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1123 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1124 /// blend(0,a,mask).
1125 vint4 blend0 (const vint4& a, const vbool4& mask);
1126 
1127 /// Use a bool mask to select between components of a (if mask[i] is false)
1128 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1129 /// blend(0,a,!mask), or blend(a,0,mask).
1130 vint4 blend0not (const vint4& a, const vbool4& mask);
1131 
1132 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1133 /// synonym for blend with arguments rearranged, but this is more clear
1134 /// because the arguments are symmetric to scalar (cond ? a : b).
1135 vint4 select (const vbool4& mask, const vint4& a, const vint4& b);
1136 
1137 // Per-element math
1138 vint4 abs (const vint4& a);
1139 vint4 min (const vint4& a, const vint4& b);
1140 vint4 max (const vint4& a, const vint4& b);
1141 
1142 /// Circular bit rotate by s bits, for N values at once.
1143 vint4 rotl (const vint4& x, const int s);
1144 // DEPRECATED(2.1)
1145 vint4 rotl32 (const vint4& x, const unsigned int k);
1146 
1147 /// andnot(a,b) returns ((~a) & b)
1148 vint4 andnot (const vint4& a, const vint4& b);
1149 
1150 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1151 vint4 bitcast_to_int (const vbool4& x);
1152 vint4 bitcast_to_int (const vfloat4& x);
1153 vfloat4 bitcast_to_float (const vint4& x);
1154 
1155 void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d);
1156 void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
1157  vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1158 
1159 vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d);
1160 
1161 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1162 vint4 safe_mod (const vint4& a, const vint4& b);
1163 vint4 safe_mod (const vint4& a, int b);
1164 
1165 
1166 
1167 
1168 /// Integer 8-vector, accelerated by SIMD instructions when available.
1169 class vint8 {
1170 public:
1171  static const char* type_name() { return "vint8"; }
1172  typedef int value_t; ///< Underlying equivalent scalar value type
1173  enum { elements = 8 }; ///< Number of scalar elements
1174  enum { paddedelements =8 }; ///< Number of scalar elements for full pad
1175  enum { bits = elements*32 }; ///< Total number of bits
1176  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1177  typedef vbool8 vbool_t; ///< bool type of the same length
1178  typedef vfloat8 vfloat_t; ///< float type of the same length
1179  typedef vint8 vint_t; ///< int type of the same length
1180  typedef vbool8 bool_t; // old name (deprecated 1.8)
1181  typedef vfloat8 float_t; // old name (deprecated 1.8)
1182 
1183  /// Default constructor (contents undefined)
1184  vint8 () { }
1185 
1186  /// Construct from a single value (store it in all slots)
1187  vint8 (int a);
1188 
1189  /// Construct from 2 values -- (a,a,b,b)
1190  vint8 (int a, int b);
1191 
1192  /// Construct from 8 values (one per element)
1193  vint8 (int a, int b, int c, int d, int e, int f, int g, int h);
1194 
1195  /// Construct from a pointer to values
1196  vint8 (const int *vals);
1197 
1198  /// Construct from a pointer to unsigned short values
1199  explicit vint8 (const unsigned short *vals);
1200 
1201  /// Construct from a pointer to signed short values
1202  explicit vint8 (const short *vals);
1203 
1204  /// Construct from a pointer to unsigned char values (0 - 255)
1205  explicit vint8 (const unsigned char *vals);
1206 
1207  /// Construct from a pointer to signed char values (-128 - 127)
1208  explicit vint8 (const char *vals);
1209 
1210  /// Copy construct from another vint8
1211  vint8 (const vint8 & other) { m_simd = other.m_simd; }
1212 
1213  /// Convert a vfloat8 to a vint8. Equivalent to i = (int)f;
1214  explicit vint8 (const vfloat8& f); // implementation below
1215 
1216  /// Construct from two vint4's
1217  vint8 (const vint4 &lo, const vint4 &hi);
1218 
1219  /// Construct from the underlying SIMD type
1220  vint8 (const simd_t& m) : m_simd(m) { }
1221 
1222  /// Return the raw SIMD type
1223  operator simd_t () const { return m_simd; }
1224  simd_t simd () const { return m_simd; }
1225  simd_t& simd () { return m_simd; }
1226 
1227  /// Return a pointer to the underlying scalar type
1228  const value_t* data () const { return (const value_t*)this; }
1229  value_t* data () { return (value_t*)this; }
1230 
1231  /// Set all components to 0
1232  void clear () ;
1233 
1234  /// Return a vint8 with all components set to 0
1235  static const vint8 Zero ();
1236 
1237  /// Return a vint8 with all components set to 1
1238  static const vint8 One ();
1239 
1240  /// Return a vint8 with all components set to -1 (aka 0xffffffff)
1241  static const vint8 NegOne ();
1242 
1243  /// Return a vint8 with incremented components (e.g., 0,1,2,3).
1244  /// Optional arguments can give a non-zero starting point and step size.
1245  static const vint8 Iota (int start=0, int step=1);
1246 
1247  /// Return a vint8 with "geometric" iota: (1, 2, 4, 8, ...).
1248  static const vint8 Giota ();
1249 
1250  /// Assign one value to all components.
1251  const vint8 & operator= (int a);
1252 
1253  /// Assignment from another vint8
1254  const vint8 & operator= (const vint8& other) ;
1255 
1256  /// Component access (get)
1257  int operator[] (int i) const;
1258 
1259  /// Component access (set)
1260  int& operator[] (int i);
1261 
1262  /// Component access (set).
1263  void setcomp (int i, int value);
1264 
1265  value_t x () const;
1266  value_t y () const;
1267  value_t z () const;
1268  value_t w () const;
1269  void set_x (value_t val);
1270  void set_y (value_t val);
1271  void set_z (value_t val);
1272  void set_w (value_t val);
1273 
1274  /// Extract the `lo` vint4 half (cf. the (lo, hi) constructor)
1275  vint4 lo () const;
1276 
1277  /// Extract the `hi` vint4 half (cf. the (lo, hi) constructor)
1278  vint4 hi () const;
1279 
1280  /// Helper: load a single int into all components
1281  void load (int a);
1282 
1283  /// Load separate values into each component.
1284  void load (int a, int b, int c, int d, int e, int f, int g, int h);
1285 
1286  /// Load from an array of 8 values
1287  void load (const int *values);
1288 
1289  void load (const int *values, int n) ;
1290 
1291  /// Load from an array of 8 unsigned short values, convert to vint8
1292  void load (const unsigned short *values) ;
1293 
1294  /// Load from an array of 8 signed short values, convert to vint8
1295  void load (const short *values);
1296 
1297  /// Load from an array of 8 unsigned char values, convert to vint8
1298  void load (const unsigned char *values);
1299 
1300  /// Load from an array of 8 signed char values, convert to vint8
1301  void load (const char *values);
1302 
1303  /// Store the values into memory
1304  void store (int *values) const;
1305 
1306  /// Store the first n values into memory
1307  void store (int *values, int n) const;
1308 
1309  /// Store the least significant 16 bits of each element into adjacent
1310  /// unsigned shorts.
1311  void store (unsigned short *values) const;
1312 
1313  /// Store the least significant 8 bits of each element into adjacent
1314  /// unsigned chars.
1315  void store (unsigned char *values) const;
1316 
1317  /// Masked load -- read from values[] where mask is 1, load zero where
1318  /// mask is 0.
1319  void load_mask (int mask, const value_t *values);
1320  void load_mask (const vbool_t& mask, const value_t *values);
1321 
1322  /// Masked store -- write to values[] where mask is enabled, don't
1323  /// touch values[] where it's not.
1324  void store_mask (int mask, value_t *values) const;
1325  void store_mask (const vbool_t& mask, value_t *values) const;
1326 
1327  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1328  template<int scale=4>
1329  void gather (const value_t *baseptr, const vint_t& vindex);
1330  /// Gather elements defined by the mask, leave others unchanged.
1331  template<int scale=4>
1332  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1333  template<int scale=4>
1334  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1335 
1336  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1337  template<int scale=4>
1338  void scatter (value_t *baseptr, const vint_t& vindex) const;
1339  /// Scatter elements defined by the mask
1340  template<int scale=4>
1341  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1342  template<int scale=4>
1343  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1344 
1345  // Arithmetic operators (component-by-component)
1346  friend vint8 operator+ (const vint8& a, const vint8& b);
1347  friend vint8 operator- (const vint8& a);
1348  friend vint8 operator- (const vint8& a, const vint8& b);
1349  friend vint8 operator* (const vint8& a, const vint8& b);
1350  friend vint8 operator/ (const vint8& a, const vint8& b);
1351  friend vint8 operator% (const vint8& a, const vint8& b);
1352  friend const vint8 & operator+= (vint8& a, const vint8& b);
1353  friend const vint8 & operator-= (vint8& a, const vint8& b);
1354  friend const vint8 & operator*= (vint8& a, const vint8& b);
1355  friend const vint8 & operator/= (vint8& a, const vint8& b);
1356  friend const vint8 & operator%= (vint8& a, const vint8& b);
1357  // Bitwise operators (component-by-component)
1358  friend vint8 operator& (const vint8& a, const vint8& b);
1359  friend vint8 operator| (const vint8& a, const vint8& b);
1360  friend vint8 operator^ (const vint8& a, const vint8& b);
1361  friend const vint8& operator&= (vint8& a, const vint8& b);
1362  friend const vint8& operator|= (vint8& a, const vint8& b);
1363  friend const vint8& operator^= (vint8& a, const vint8& b);
1364  friend vint8 operator~ (const vint8& a);
1365  friend vint8 operator<< (const vint8& a, unsigned int bits);
1366  friend vint8 operator>> (const vint8& a, unsigned int bits);
1367  friend const vint8& operator<<= (vint8& a, unsigned int bits);
1368  friend const vint8& operator>>= (vint8& a, unsigned int bits);
1369  // Comparison operators (component-by-component)
1370  friend vbool8 operator== (const vint8& a, const vint8& b);
1371  friend vbool8 operator!= (const vint8& a, const vint8& b);
1372  friend vbool8 operator< (const vint8& a, const vint8& b);
1373  friend vbool8 operator> (const vint8& a, const vint8& b);
1374  friend vbool8 operator>= (const vint8& a, const vint8& b);
1375  friend vbool8 operator<= (const vint8& a, const vint8& b);
1376 
1377  /// Stream output
1378  friend std::ostream& operator<< (std::ostream& cout, const vint8& a);
1379 
1380 private:
1381  // The actual data representation
1382  union {
1386  };
1387 };
1388 
1389 
1390 
1391 // Shift right logical -- unsigned shift. This differs from operator>>
1392 // in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1393 // srl((1<<31),1) == 1<<30.
1394 vint8 srl (const vint8& val, const unsigned int bits);
1395 
1396 /// Helper: shuffle/swizzle with constant (templated) indices.
1397 /// Example: shuffle<1,1,2,2,5,5,6,6>(vint8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
1398 template<int i0, int i1, int i2, int i3,
1399  int i4, int i5, int i6, int i7> vint8 shuffle (const vint8& a);
1400 
1401 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
1402 template<int i> vint8 shuffle (const vint8& a);
1403 
1404 /// Helper: as rapid as possible extraction of one component, when the
1405 /// index is fixed.
1406 template<int i> int extract (const vint8& v);
1407 
1408 /// Helper: substitute val for a[i]
1409 template<int i> vint8 insert (const vint8& a, int val);
1410 
1411 /// The sum of all components, returned in all components.
1412 vint8 vreduce_add (const vint8& v);
1413 
1414 // Reduction across all components
1415 int reduce_add (const vint8& v);
1416 int reduce_and (const vint8& v);
1417 int reduce_or (const vint8& v);
1418 
1419 /// Use a bool mask to select between components of a (if mask[i] is false)
1420 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1421 vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);
1422 
1423 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1424 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1425 /// blend(0,a,mask).
1426 vint8 blend0 (const vint8& a, const vbool8& mask);
1427 
1428 /// Use a bool mask to select between components of a (if mask[i] is false)
1429 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1430 /// blend(0,a,!mask), or blend(a,0,mask).
1431 vint8 blend0not (const vint8& a, const vbool8& mask);
1432 
1433 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1434 /// synonym for blend with arguments rearranged, but this is more clear
1435 /// because the arguments are symmetric to scalar (cond ? a : b).
1436 vint8 select (const vbool8& mask, const vint8& a, const vint8& b);
1437 
1438 // Per-element math
1439 vint8 abs (const vint8& a);
1440 vint8 min (const vint8& a, const vint8& b);
1441 vint8 max (const vint8& a, const vint8& b);
1442 
1443 /// Circular bit rotate by s bits, for N values at once.
1444 vint8 rotl (const vint8& x, const int s);
1445 // DEPRECATED(2.1)
1446 vint8 rotl32 (const vint8& x, const unsigned int k);
1447 
1448 /// andnot(a,b) returns ((~a) & b)
1449 vint8 andnot (const vint8& a, const vint8& b);
1450 
1451 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1452 vint8 bitcast_to_int (const vbool8& x);
1453 vint8 bitcast_to_int (const vfloat8& x);
1454 vfloat8 bitcast_to_float (const vint8& x);
1455 
1456 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1457 vint8 safe_mod (const vint8& a, const vint8& b);
1458 vint8 safe_mod (const vint8& a, int b);
1459 
1460 
1461 
1462 
1463 
1464 /// Integer 16-vector, accelerated by SIMD instructions when available.
1465 class vint16 {
1466 public:
1467  static const char* type_name() { return "vint16"; }
1468  typedef int value_t; ///< Underlying equivalent scalar value type
1469  enum { elements = 16 }; ///< Number of scalar elements
1470  enum { paddedelements =16 }; ///< Number of scalar elements for full pad
1471  enum { bits = 128 }; ///< Total number of bits
1472  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1473  typedef vbool16 vbool_t; ///< bool type of the same length
1474  typedef vfloat16 vfloat_t; ///< float type of the same length
1475  typedef vint16 vint_t; ///< int type of the same length
1476  typedef vbool16 bool_t; // old name (deprecated 1.8)
1477  typedef vfloat16 float_t; // old name (deprecated 1.8)
1478 
1479  /// Default constructor (contents undefined)
1480  vint16 () { }
1481 
1482  /// Construct from a single value (store it in all slots)
1483  vint16 (int a);
1484 
1485  /// Construct from 16 values (one per element)
1486  vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1487  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1488 
1489  /// Construct from a pointer to values
1490  vint16 (const int *vals);
1491 
1492  /// Construct from a pointer to unsigned short values
1493  explicit vint16 (const unsigned short *vals);
1494 
1495  /// Construct from a pointer to signed short values
1496  explicit vint16 (const short *vals);
1497 
1498  /// Construct from a pointer to unsigned char values (0 - 255)
1499  explicit vint16 (const unsigned char *vals);
1500 
1501  /// Construct from a pointer to signed char values (-128 - 127)
1502  explicit vint16 (const char *vals);
1503 
1504  /// Copy construct from another vint16
1505  vint16 (const vint16 & other) { m_simd = other.m_simd; }
1506 
1507  /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
1508  explicit vint16 (const vfloat16& f); // implementation below
1509 
1510  /// Construct from two vint8's
1511  vint16 (const vint8 &lo, const vint8 &hi);
1512 
1513  /// Construct from four vint4's
1514  vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d);
1515 
1516  /// Construct from the underlying SIMD type
1517  vint16 (const simd_t& m) : m_simd(m) { }
1518 
1519  /// Return the raw SIMD type
1520  operator simd_t () const { return m_simd; }
1521  simd_t simd () const { return m_simd; }
1522  simd_t& simd () { return m_simd; }
1523 
1524  /// Return a pointer to the underlying scalar type
1525  const value_t* data () const { return (const value_t*)this; }
1526  value_t* data () { return (value_t*)this; }
1527 
1528  /// Set all components to 0
1529  void clear () ;
1530 
1531  /// Return an vint16 with all components set to 0
1532  static const vint16 Zero ();
1533 
1534  /// Return an vint16 with all components set to 1
1535  static const vint16 One ();
1536 
1537  /// Return an vint16 with all components set to -1 (aka 0xffffffff)
1538  static const vint16 NegOne ();
1539 
1540  /// Return an vint16 with incremented components (e.g., 0,1,2,3).
1541  /// Optional arguments can give a non-zero starting point and step size.
1542  static const vint16 Iota (int start=0, int step=1);
1543 
1544  /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
1545  static const vint16 Giota ();
1546 
1547  /// Assign one value to all components.
1548  const vint16 & operator= (int a);
1549 
1550  /// Assignment from another vint16
1551  const vint16 & operator= (const vint16& other) ;
1552 
1553  /// Component access (get)
1554  int operator[] (int i) const;
1555 
1556  /// Component access (set)
1557  int& operator[] (int i);
1558 
1559  /// Component access (set).
1560  void setcomp (int i, int value);
1561 
1562  value_t x () const;
1563  value_t y () const;
1564  value_t z () const;
1565  value_t w () const;
1566  void set_x (value_t val);
1567  void set_y (value_t val);
1568  void set_z (value_t val);
1569  void set_w (value_t val);
1570 
1571  /// Extract the lower precision vint8
1572  vint8 lo () const;
1573 
1574  /// Extract the higher precision vint8
1575  vint8 hi () const;
1576 
1577  /// Helper: load a single int into all components
1578  void load (int a);
1579 
1580  /// Load separate values into each component.
1581  void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1582  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1583 
1584  /// Load from an array of 16 values
1585  void load (const int *values);
1586 
1587  void load (const int *values, int n) ;
1588 
1589  /// Load from an array of 16 unsigned short values, convert to vint16
1590  void load (const unsigned short *values) ;
1591 
1592  /// Load from an array of 16 signed short values, convert to vint16
1593  void load (const short *values);
1594 
1595  /// Load from an array of 16 unsigned char values, convert to vint16
1596  void load (const unsigned char *values);
1597 
1598  /// Load from an array of 16 signed char values, convert to vint16
1599  void load (const char *values);
1600 
1601  /// Store the values into memory
1602  void store (int *values) const;
1603 
1604  /// Store the first n values into memory
1605  void store (int *values, int n) const;
1606 
1607  /// Store the least significant 16 bits of each element into adjacent
1608  /// unsigned shorts.
1609  void store (unsigned short *values) const;
1610 
1611  /// Store the least significant 8 bits of each element into adjacent
1612  /// unsigned chars.
1613  void store (unsigned char *values) const;
1614 
1615  /// Masked load -- read from values[] where mask is 1, load zero where
1616  /// mask is 0.
1617  void load_mask (const vbool_t &mask, const value_t *values);
1618  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
1619 
1620  /// Masked store -- write to values[] where mask is enabled, don't
1621  /// touch values[] where it's not.
1622  void store_mask (const vbool_t &mask, value_t *values) const;
1623  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
1624 
1625  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1626  template<int scale=4>
1627  void gather (const value_t *baseptr, const vint_t& vindex);
1628  /// Gather elements defined by the mask, leave others unchanged.
1629  template<int scale=4>
1630  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1631  template<int scale=4>
1632  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
1633  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
1634  }
1635 
1636  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1637  template<int scale=4>
1638  void scatter (value_t *baseptr, const vint_t& vindex) const;
1639  /// Scatter elements defined by the mask
1640  template<int scale=4>
1641  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1642  template<int scale=4>
1643  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
1644  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
1645  }
1646 
1647  // Arithmetic operators (component-by-component)
1648  friend vint16 operator+ (const vint16& a, const vint16& b);
1649  friend vint16 operator- (const vint16& a);
1650  friend vint16 operator- (const vint16& a, const vint16& b);
1651  friend vint16 operator* (const vint16& a, const vint16& b);
1652  friend vint16 operator/ (const vint16& a, const vint16& b);
1653  friend vint16 operator% (const vint16& a, const vint16& b);
1654  friend const vint16 & operator+= (vint16& a, const vint16& b);
1655  friend const vint16 & operator-= (vint16& a, const vint16& b);
1656  friend const vint16 & operator*= (vint16& a, const vint16& b);
1657  friend const vint16 & operator/= (vint16& a, const vint16& b);
1658  friend const vint16 & operator%= (vint16& a, const vint16& b);
1659  // Bitwise operators (component-by-component)
1660  friend vint16 operator& (const vint16& a, const vint16& b);
1661  friend vint16 operator| (const vint16& a, const vint16& b);
1662  friend vint16 operator^ (const vint16& a, const vint16& b);
1663  friend const vint16& operator&= (vint16& a, const vint16& b);
1664  friend const vint16& operator|= (vint16& a, const vint16& b);
1665  friend const vint16& operator^= (vint16& a, const vint16& b);
1666  friend vint16 operator~ (const vint16& a);
1667  friend vint16 operator<< (const vint16& a, unsigned int bits);
1668  friend vint16 operator>> (const vint16& a, unsigned int bits);
1669  friend const vint16& operator<<= (vint16& a, unsigned int bits);
1670  friend const vint16& operator>>= (vint16& a, unsigned int bits);
1671  // Comparison operators (component-by-component)
1672  friend vbool16 operator== (const vint16& a, const vint16& b);
1673  friend vbool16 operator!= (const vint16& a, const vint16& b);
1674  friend vbool16 operator< (const vint16& a, const vint16& b);
1675  friend vbool16 operator> (const vint16& a, const vint16& b);
1676  friend vbool16 operator>= (const vint16& a, const vint16& b);
1677  friend vbool16 operator<= (const vint16& a, const vint16& b);
1678 
1679  /// Stream output
1680  friend std::ostream& operator<< (std::ostream& cout, const vint16& a);
1681 
1682 private:
1683  // The actual data representation
1684  union {
1688  };
1689 };
1690 
1691 
1692 
1693 /// Shift right logical -- unsigned shift. This differs from operator>>
1694 /// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1695 /// srl((1<<31),1) == 1<<30.
1696 vint16 srl (const vint16& val, const unsigned int bits);
1697 
1698 /// Shuffle groups of 4
1699 template<int i0, int i1, int i2, int i3>
1700 vint16 shuffle4 (const vint16& a);
1701 
1702 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
1703 template<int i> vint16 shuffle4 (const vint16& a);
1704 
1705 /// Shuffle within each group of 4
1706 template<int i0, int i1, int i2, int i3>
1707 vint16 shuffle (const vint16& a);
1708 
1709 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1710 template<int i> vint16 shuffle (const vint16& a);
1711 
1712 /// Helper: as rapid as possible extraction of one component, when the
1713 /// index is fixed.
1714 template<int i> int extract (const vint16& v);
1715 
1716 /// Helper: substitute val for a[i]
1717 template<int i> vint16 insert (const vint16& a, int val);
1718 
1719 /// The sum of all components, returned in all components.
1720 vint16 vreduce_add (const vint16& v);
1721 
1722 // Reduction across all components
1723 int reduce_add (const vint16& v);
1724 int reduce_and (const vint16& v);
1725 int reduce_or (const vint16& v);
1726 
1727 /// Use a bool mask to select between components of a (if mask[i] is false)
1728 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1729 vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);
1730 
1731 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1732 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1733 /// blend(0,a,mask).
1734 vint16 blend0 (const vint16& a, const vbool16& mask);
1735 
1736 /// Use a bool mask to select between components of a (if mask[i] is false)
1737 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1738 /// blend(0,a,!mask), or blend(a,0,mask).
1739 vint16 blend0not (const vint16& a, const vbool16& mask);
1740 
1741 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1742 /// synonym for blend with arguments rearranged, but this is more clear
1743 /// because the arguments are symmetric to scalar (cond ? a : b).
1744 vint16 select (const vbool16& mask, const vint16& a, const vint16& b);
1745 
1746 // Per-element math
1747 vint16 abs (const vint16& a);
1748 vint16 min (const vint16& a, const vint16& b);
1749 vint16 max (const vint16& a, const vint16& b);
1750 
1751 /// Circular bit rotate by s bits, for N values at once.
1752 vint16 rotl (const vint16& x, const int s);
1753 // DEPRECATED(2.1)
1754 vint16 rotl32 (const vint16& x, const unsigned int k);
1755 
1756 /// andnot(a,b) returns ((~a) & b)
1757 vint16 andnot (const vint16& a, const vint16& b);
1758 
1759 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1760 vint16 bitcast_to_int (const vbool16& x);
1761 vint16 bitcast_to_int (const vfloat16& x);
1762 vfloat16 bitcast_to_float (const vint16& x);
1763 
1764 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1765 vint16 safe_mod (const vint16& a, const vint16& b);
1766 vint16 safe_mod (const vint16& a, int b);
1767 
1768 
1769 
1770 
1771 
1772 /// Floating point 4-vector, accelerated by SIMD instructions when
1773 /// available.
1774 class vfloat4 {
1775 public:
1776  static const char* type_name() { return "vfloat4"; }
1777  typedef float value_t; ///< Underlying equivalent scalar value type
1778  enum { elements = 4 }; ///< Number of scalar elements
1779  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
1780  enum { bits = elements*32 }; ///< Total number of bits
1781  typedef simd_raw_t<float,4>::type simd_t; ///< the native SIMD type used
1782  typedef vfloat4 vfloat_t; ///< SIMD float type
1783  typedef vint4 vint_t; ///< SIMD int type
1784  typedef vbool4 vbool_t; ///< SIMD bool type
1785  typedef vint4 int_t; // old name (deprecated 1.8)
1786  typedef vbool4 bool_t; // old name (deprecated 1.8)
1787 
1788  /// Default constructor (contents undefined)
1789  vfloat4 () { }
1790 
1791  /// Construct from a single value (store it in all slots)
1792  vfloat4 (float a) { load(a); }
1793 
1794  /// Construct from 3 or 4 values
1795  vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }
1796 
1797  /// Construct from a pointer to 4 values
1798  vfloat4 (const float *f) { load (f); }
1799 
1800  /// Copy construct from another vfloat4
1801  vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
1802 
1803  /// Construct from a vint4 (promoting all components to float)
1804  explicit vfloat4 (const vint4& ival);
1805 
1806  /// Construct from the underlying SIMD type
1807  vfloat4 (const simd_t& m) : m_simd(m) { }
1808 
1809  /// Return the raw SIMD type
1810  operator simd_t () const { return m_simd; }
1811  simd_t simd () const { return m_simd; }
1812  simd_t& simd () { return m_simd; }
1813 
1814  /// Return a pointer to the underlying scalar type
1815  const value_t* data () const { return (const value_t*)this; }
1816  value_t* data () { return (value_t*)this; }
1817 
1818  /// Construct from a Imath::V3f
1819  explicit vfloat4 (const Imath::V3f &v) { load (v[0], v[1], v[2]); }
1820 
1821  /// Cast to a Imath::V3f
1822  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
1823 
1824  /// Construct from a Imath::V4f
1825  explicit vfloat4 (const Imath::V4f &v) { load ((const float *)&v); }
1826 
1827  /// Cast to a Imath::V4f
1828  const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }
1829 
1830  /// Construct from a pointer to 4 unsigned short values
1831  explicit vfloat4 (const unsigned short *vals) { load(vals); }
1832 
1833  /// Construct from a pointer to 4 short values
1834  explicit vfloat4 (const short *vals) { load(vals); }
1835 
1836  /// Construct from a pointer to 4 unsigned char values
1837  explicit vfloat4 (const unsigned char *vals) { load(vals); }
1838 
1839  /// Construct from a pointer to 4 char values
1840  explicit vfloat4 (const char *vals) { load(vals); }
1841 
1842 #ifdef _HALF_H_
1843  /// Construct from a pointer to 4 half (16 bit float) values
1844  explicit vfloat4 (const half *vals) { load(vals); }
1845 #endif
1846 
1847  /// Assign a single value to all components
1848  const vfloat4 & operator= (float a) { load(a); return *this; }
1849 
1850  /// Assign a vfloat4
1851  const vfloat4 & operator= (vfloat4 other) {
1852  m_simd = other.m_simd;
1853  return *this;
1854  }
1855 
1856  /// Return a vfloat4 with all components set to 0.0
1857  static const vfloat4 Zero ();
1858 
1859  /// Return a vfloat4 with all components set to 1.0
1860  static const vfloat4 One ();
1861 
1862  /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
1863  /// Optional argument can give a non-zero starting point and non-1 step.
1864  static const vfloat4 Iota (float start=0.0f, float step=1.0f);
1865 
1866  /// Set all components to 0.0
1867  void clear ();
1868 
1869  /// Assign from a Imath::V4f
1870  const vfloat4 & operator= (const Imath::V4f &v);
1871 
1872  /// Assign from a Imath::V3f
1873  const vfloat4 & operator= (const Imath::V3f &v);
1874 
1875  /// Component access (get)
1876  float operator[] (int i) const;
1877  /// Component access (set)
1878  float& operator[] (int i);
1879 
1880  /// Component access (set).
1881  void setcomp (int i, float value);
1882 
1883  value_t x () const;
1884  value_t y () const;
1885  value_t z () const;
1886  value_t w () const;
1887  void set_x (value_t val);
1888  void set_y (value_t val);
1889  void set_z (value_t val);
1890  void set_w (value_t val);
1891 
1892  /// Helper: load a single value into all components
1893  void load (float val);
1894 
1895  /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
1896  void load (float a, float b, float c, float d=0.0f);
1897 
1898  /// Load from an array of 4 values
1899  void load (const float *values);
1900 
1901  /// Load from a partial array of <=4 values. Unassigned values are
1902  /// undefined.
1903  void load (const float *values, int n);
1904 
1905  /// Load from an array of 4 unsigned short values, convert to float
1906  void load (const unsigned short *values);
1907 
1908  /// Load from an array of 4 short values, convert to float
1909  void load (const short *values);
1910 
1911  /// Load from an array of 4 unsigned char values, convert to float
1912  void load (const unsigned char *values);
1913 
1914  /// Load from an array of 4 char values, convert to float
1915  void load (const char *values);
1916 
1917 #ifdef _HALF_H_
1918  /// Load from an array of 4 half values, convert to float
1919  void load (const half *values);
1920 #endif /* _HALF_H_ */
1921 
1922  void store (float *values) const;
1923 
1924  /// Store the first n values into memory
1925  void store (float *values, int n) const;
1926 
1927 #ifdef _HALF_H_
1928  void store (half *values) const;
1929 #endif
1930 
1931  /// Masked load -- read from values[] where mask is 1, load zero where
1932  /// mask is 0.
1933  void load_mask (int mask, const value_t *values);
1934  void load_mask (const vbool_t& mask, const value_t *values);
1935 
1936  /// Masked store -- write to values[] where mask is enabled, don't
1937  /// touch values[] where it's not.
1938  void store_mask (int mask, value_t *values) const;
1939  void store_mask (const vbool_t& mask, value_t *values) const;
1940 
1941  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1942  template<int scale=4>
1943  void gather (const value_t *baseptr, const vint_t& vindex);
1944  /// Gather elements defined by the mask, leave others unchanged.
1945  template<int scale=4>
1946  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1947  template<int scale=4>
1948  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1949 
1950  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1951  template<int scale=4>
1952  void scatter (value_t *baseptr, const vint_t& vindex) const;
1953  /// Scatter elements defined by the mask
1954  template<int scale=4>
1955  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1956  template<int scale=4>
1957  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1958 
1959  // Arithmetic operators
1960  friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
1961  const vfloat4 & operator+= (const vfloat4& a);
1962  vfloat4 operator- () const;
1963  friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
1964  const vfloat4 & operator-= (const vfloat4& a);
1965  friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
1966  friend vfloat4 operator* (const vfloat4& a, float b);
1967  friend vfloat4 operator* (float a, const vfloat4& b);
1968  const vfloat4 & operator*= (const vfloat4& a);
1969  const vfloat4 & operator*= (float val);
1970  friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
1971  const vfloat4 & operator/= (const vfloat4& a);
1972  const vfloat4 & operator/= (float val);
1973 
1974  // Comparison operations
1975  friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
1976  friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
1977  friend vbool4 operator< (const vfloat4& a, const vfloat4& b);
1978  friend vbool4 operator> (const vfloat4& a, const vfloat4& b);
1979  friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
1980  friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);
1981 
1982  // Some oddball items that are handy
1983 
1984  /// Combine the first two components of A with the first two components
1985  /// of B.
1986  friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);
1987 
1988  /// Combine the first two components of A with the first two components
1989  /// of B, but interleaved.
1990  friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);
1991 
1992  /// Return xyz components, plus 0 for w
1993  vfloat4 xyz0 () const;
1994 
1995  /// Return xyz components, plus 1 for w
1996  vfloat4 xyz1 () const;
1997 
1998  /// Stream output
1999  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);
2000 
2001 protected:
2002  // The actual data representation
2003  union {
2006  };
2007 };
2008 
2009 
2010 /// Helper: shuffle/swizzle with constant (templated) indices.
2011 /// Example: shuffle<1,1,2,2>(vfloat4(a,b,c,d)) returns (b,b,c,c)
2012 template<int i0, int i1, int i2, int i3> vfloat4 shuffle (const vfloat4& a);
2013 
2014 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2015 template<int i> vfloat4 shuffle (const vfloat4& a);
2016 
2017 /// Helper: as rapid as possible extraction of one component, when the
2018 /// index is fixed.
2019 template<int i> float extract (const vfloat4& a);
2020 
2021 /// Helper: substitute val for a[i]
2022 template<int i> vfloat4 insert (const vfloat4& a, float val);
2023 
2024 /// The sum of all components, returned in all components.
2025 vfloat4 vreduce_add (const vfloat4& v);
2026 
2027 /// The sum of all components, returned as a scalar.
2028 float reduce_add (const vfloat4& v);
2029 
2030 /// Return the float dot (inner) product of a and b in every component.
2031 vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);
2032 
2033 /// Return the float dot (inner) product of a and b.
2034 float dot (const vfloat4 &a, const vfloat4 &b);
2035 
2036 /// Return the float 3-component dot (inner) product of a and b in
2037 /// all components.
2038 vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);
2039 
2040 /// Return the float 3-component dot (inner) product of a and b.
2041 float dot3 (const vfloat4 &a, const vfloat4 &b);
2042 
2043 /// Use a bool mask to select between components of a (if mask[i] is false)
2044 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2045 vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);
2046 
2047 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
2048 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2049 /// blend(0,a,mask).
2050 vfloat4 blend0 (const vfloat4& a, const vbool4& mask);
2051 
2052 /// Use a bool mask to select between components of a (if mask[i] is false)
2053 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2054 /// blend(0,a,!mask), or blend(a,0,mask).
2055 vfloat4 blend0not (const vfloat4& a, const vbool4& mask);
2056 
2057 /// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
2058 /// that is 0, return 0 rather than Inf.
2059 vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);
2060 
2061 /// Homogeneous divide to turn a vfloat4 into a vfloat3.
2062 vfloat3 hdiv (const vfloat4 &a);
2063 
2064 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2065 /// synonym for blend with arguments rearranged, but this is more clear
2066 /// because the arguments are symmetric to scalar (cond ? a : b).
2067 vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);
2068 
2069 // Per-element math
2070 vfloat4 abs (const vfloat4& a); ///< absolute value (float)
2071 vfloat4 sign (const vfloat4& a); ///< 1.0 when value >= 0, -1 when negative
2072 vfloat4 ceil (const vfloat4& a);
2073 vfloat4 floor (const vfloat4& a);
2074 vint4 ifloor (const vfloat4& a); ///< (int)floor
2075 inline vint4 floori (const vfloat4& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2076 
2077 /// Per-element round to nearest integer.
2078 /// CAVEAT: the rounding when mid-way between integers may differ depending
2079 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
2080 /// integer) but std::round() says to round away from 0 regardless of
2081 /// current rounding mode (but that is multiple instructions on x64).
2082 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2083 /// match std::round().
2084 vfloat4 round (const vfloat4& a);
2085 
2086 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2087 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2088 /// C++ std::rint() which says to use the current rounding mode.
2089 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2090 /// match std::rint().
2091 vint4 rint (const vfloat4& a);
2092 
2093 vfloat4 rcp_fast (const vfloat4 &a); ///< Fast, approximate 1/a
2094 vfloat4 sqrt (const vfloat4 &a);
2095 vfloat4 rsqrt (const vfloat4 &a); ///< Fully accurate 1/sqrt
2096 vfloat4 rsqrt_fast (const vfloat4 &a); ///< Fast, approximate 1/sqrt
2097 vfloat4 min (const vfloat4& a, const vfloat4& b); ///< Per-element min
2098 vfloat4 max (const vfloat4& a, const vfloat4& b); ///< Per-element max
2099 template <typename T> T exp (const T& v); // template for all SIMD variants
2100 template <typename T> T log (const T& v);
2101 
2102 /// andnot(a,b) returns ((~a) & b)
2103 vfloat4 andnot (const vfloat4& a, const vfloat4& b);
2104 
2105 // Fused multiply and add (or subtract):
2106 vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
2107 vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
2108 vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
2109 vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c
2110 
2111 /// Transpose the rows and columns of the 4x4 matrix [a b c d].
2112 /// In the end, a will have the original (a[0], b[0], c[0], d[0]),
2113 /// b will have the original (a[1], b[1], c[1], d[1]), and so on.
2114 void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
2115 void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
2116  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2117 
2118 /// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
2119 vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
2120  const vfloat4& c, const vfloat4& d);
2121 
2122 
2123 
2124 /// Floating point 3-vector, aligned to be internally identical to a vfloat4.
2125 /// The way it differs from vfloat4 is that all of the load functions only
2126 /// load three values, and all the stores only store 3 values. The vast
2127 /// majority of ops just fall back to the vfloat4 version, and so will
2128 /// operate on the 4th component, but we won't care about those results.
2129 class vfloat3 : public vfloat4 {
2130 public:
2131  static const char* type_name() { return "vfloat3"; }
2132  enum { elements = 3 }; ///< Number of scalar elements
2133  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
2134 
2135  /// Default constructor (contents undefined)
2136  vfloat3 () { }
2137 
2138  /// Construct from a single value (store it in all slots)
2139  vfloat3 (float a) { load(a); }
2140 
2141  /// Construct from 3 values
2142  vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }
2143 
2144  /// Construct from a pointer to 3 values
2145  vfloat3 (const float *f) { load (f); }
2146 
2147  /// Copy construct from another vfloat3
2148  vfloat3 (const vfloat3 &other);
2149 
2150  /// Construct from a vfloat4. Note: it will not zero out the internal
2151  /// 4th component, but rather accept on faith that the vfloat4 you are
2152  /// giving it is a valid vfloat3. Be careful!
2153  explicit vfloat3 (const vfloat4 &other);
2154 
2155 #if OIIO_SIMD
2156  /// Construct from the underlying SIMD type. Note: it will not zero out
2157  /// the internal 4th component, but rather accept on faith that the
2158  /// vfloat4 you are giving it is a valid vfloat3. Be careful!
2159  explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
2160 #endif
2161 
2162  /// Construct from a Imath::V3f
2163  vfloat3 (const Imath::V3f &v) : vfloat4(v) { }
2164 
2165  /// Cast to a Imath::V3f
2166  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
2167 
2168  /// Construct from a pointer to 3 unsigned short values
2169  explicit vfloat3 (const unsigned short *vals) { load(vals); }
2170 
2171  /// Construct from a pointer to 3 short values
2172  explicit vfloat3 (const short *vals) { load(vals); }
2173 
2174  /// Construct from a pointer to 3 unsigned char values
2175  explicit vfloat3 (const unsigned char *vals) { load(vals); }
2176 
2177  /// Construct from a pointer to 3 char values
2178  explicit vfloat3 (const char *vals) { load(vals); }
2179 
2180 #ifdef _HALF_H_
2181  /// Construct from a pointer to 3 half (16 bit float) values
2182  explicit vfloat3 (const half *vals) { load(vals); }
2183 #endif
2184 
2185  /// Assign a single value to all components
2186  const vfloat3 & operator= (float a) { load(a); return *this; }
2187 
2188  /// Return a vfloat3 with all components set to 0.0
2189  static const vfloat3 Zero ();
2190 
2191  /// Return a vfloat3 with all components set to 1.0
2192  static const vfloat3 One ();
2193 
2194  /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
2195  /// Optional argument can give a non-zero starting point and non-1 step.
2196  static const vfloat3 Iota (float start=0.0f, float step=1.0f);
2197 
2198  /// Helper: load a single value into all components
2199  void load (float val);
2200 
2201  /// Load from an array of 3 values
2202  void load (const float *values);
2203 
2204  /// Load from a partial array of n values
2205  void load (const float *values, int n);
2206 
2207  /// Load from an array of 3 unsigned short values, convert to float
2208  void load (const unsigned short *values);
2209 
2210  /// Load from an array of 3 short values, convert to float
2211  void load (const short *values);
2212 
2213  /// Load from an array of 3 unsigned char values, convert to float
2214  void load (const unsigned char *values);
2215 
2216  /// Load from an array of 3 char values, convert to float
2217  void load (const char *values);
2218 
2219 #ifdef _HALF_H_
2220  /// Load from an array of 3 half values, convert to float
2221  void load (const half *values);
2222 #endif /* _HALF_H_ */
2223 
2224  void store (float *values) const;
2225 
2226  void store (float *values, int n) const;
2227 
2228 #ifdef _HALF_H_
2229  void store (half *values) const;
2230 #endif
2231 
2232  /// Store into an Imath::V3f reference.
2233  void store (Imath::V3f &vec) const;
2234 
2235  // Math operators -- define in terms of vfloat3.
2236  friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
2237  const vfloat3 & operator+= (const vfloat3& a);
2238  vfloat3 operator- () const;
2239  friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
2240  const vfloat3 & operator-= (const vfloat3& a);
2241  friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
2242  friend vfloat3 operator* (const vfloat3& a, float b);
2243  friend vfloat3 operator* (float a, const vfloat3& b);
2244  const vfloat3 & operator*= (const vfloat3& a);
2245  const vfloat3 & operator*= (float a);
2246  friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
2247  const vfloat3 & operator/= (const vfloat3& a);
2248  const vfloat3 & operator/= (float a);
2249 
2250  /// Square of the length of the vector
2251  float length2() const;
2252  /// Length of the vector
2253  float length() const;
2254 
2255  /// Return a normalized version of the vector.
2256  vfloat3 normalized () const;
2257  /// Return a fast, approximate normalized version of the vector.
2258  vfloat3 normalized_fast () const;
2259  /// Normalize in place.
2260  void normalize() { *this = normalized(); }
2261 
2262  /// Stream output
2263  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
2264 };
2265 
2266 
2267 
2268 // Per-element math on float3
2269 vfloat3 abs (const vfloat3& a);
2270 vfloat3 sign (const vfloat3& a);
2271 vfloat3 ceil (const vfloat3& a);
2272 vfloat3 floor (const vfloat3& a);
2273 vfloat3 round (const vfloat3& a);
2274 
2275 
2276 
2277 /// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
2278 /// not in registers) isomorphic to Imath::M44f.
2279 class matrix44 {
2280 public:
2281  // Uninitialized
2283 #ifndef OIIO_SIMD_SSE
2284  : m_mat(Imath::UNINITIALIZED)
2285 #endif
2286  { }
2287 
2288  /// Construct from a reference to an Imath::M44f
2289  OIIO_FORCEINLINE explicit matrix44 (const Imath::M44f &M) {
2290 #if OIIO_SIMD_SSE
2291  m_row[0].load (M[0]);
2292  m_row[1].load (M[1]);
2293  m_row[2].load (M[2]);
2294  m_row[3].load (M[3]);
2295 #else
2296  m_mat = M;
2297 #endif
2298  }
2299 
2300  /// Construct from an array of 16 floats (four consecutive rows of 4)
2301  OIIO_FORCEINLINE explicit matrix44 (const float *f) {
2302 #if OIIO_SIMD_SSE
2303  m_row[0].load (f+0);
2304  m_row[1].load (f+4);
2305  m_row[2].load (f+8);
2306  m_row[3].load (f+12);
2307 #else
2308  m_mat = *(const Imath::M44f*)f;
2309 #endif
2310  }
2311 
2312  /// Construct from 4 vfloat4 rows
2313  OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
2314  const vfloat4& c, const vfloat4& d) {
2315 #if OIIO_SIMD_SSE
2316  m_row[0] = a; m_row[1] = b; m_row[2] = c; m_row[3] = d;
2317 #else
2318  a.store (m_mat[0]);
2319  b.store (m_mat[1]);
2320  c.store (m_mat[2]);
2321  d.store (m_mat[3]);
2322 #endif
2323  }
2324  /// Construct from 4 float[4] rows
2325  OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
2326  const float *c, const float *d) {
2327 #if OIIO_SIMD_SSE
2328  m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2329 #else
2330  memcpy (m_mat[0], a, 4*sizeof(float));
2331  memcpy (m_mat[1], b, 4*sizeof(float));
2332  memcpy (m_mat[2], c, 4*sizeof(float));
2333  memcpy (m_mat[3], d, 4*sizeof(float));
2334 #endif
2335  }
2336 
2337  /// Construct from 16 floats
2338  OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
2339  float f10, float f11, float f12, float f13,
2340  float f20, float f21, float f22, float f23,
2341  float f30, float f31, float f32, float f33)
2342  {
2343 #if OIIO_SIMD_SSE
2344  m_row[0].load (f00, f01, f02, f03);
2345  m_row[1].load (f10, f11, f12, f13);
2346  m_row[2].load (f20, f21, f22, f23);
2347  m_row[3].load (f30, f31, f32, f33);
2348 #else
2349  m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2350  m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2351  m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2352  m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2353 #endif
2354  }
2355 
2356  /// Present as an Imath::M44f
2357  const Imath::M44f& M44f() const;
2358 
2359  /// Return one row
2360  vfloat4 operator[] (int i) const;
2361 
2362  /// Return the transposed matrix
2363  matrix44 transposed () const;
2364 
2365  /// Transform 3-point V by 4x4 matrix M.
2366  vfloat3 transformp (const vfloat3 &V) const;
2367 
2368  /// Transform 3-vector V by 4x4 matrix M.
2369  vfloat3 transformv (const vfloat3 &V) const;
2370 
2371  /// Transform 3-vector V by the transpose of 4x4 matrix M.
2372  vfloat3 transformvT (const vfloat3 &V) const;
2373 
2374  friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
2375  friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);
2376 
2377  bool operator== (const matrix44& m) const;
2378 
2379  bool operator== (const Imath::M44f& m) const ;
2380  friend bool operator== (const Imath::M44f& a, const matrix44 &b);
2381 
2382  bool operator!= (const matrix44& m) const;
2383 
2384  bool operator!= (const Imath::M44f& m) const;
2385  friend bool operator!= (const Imath::M44f& a, const matrix44 &b);
2386 
2387  /// Return the inverse of the matrix.
2388  matrix44 inverse() const;
2389 
2390  /// Stream output
2391  friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);
2392 
2393 private:
2394 #if OIIO_SIMD_SSE
2395  vfloat4 m_row[4];
2396 #else
2397  Imath::M44f m_mat;
2398 #endif
2399 };
2400 
2401 /// Transform 3-point V by 4x4 matrix M.
2402 vfloat3 transformp (const matrix44 &M, const vfloat3 &V);
2403 vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V);
2404 
2405 /// Transform 3-vector V by 4x4 matrix M.
2406 vfloat3 transformv (const matrix44 &M, const vfloat3 &V);
2407 vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V);
2408 
2409 // Transform 3-vector by the transpose of 4x4 matrix M.
2410 vfloat3 transformvT (const matrix44 &M, const vfloat3 &V);
2411 vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V);
2412 
2413 
2414 
2415 
2416 /// Floating point 8-vector, accelerated by SIMD instructions when
2417 /// available.
2418 class vfloat8 {
2419 public:
2420  static const char* type_name() { return "vfloat8"; }
2421  typedef float value_t; ///< Underlying equivalent scalar value type
2422  enum { elements = 8 }; ///< Number of scalar elements
2423  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
2424  enum { bits = elements*32 }; ///< Total number of bits
2425  typedef simd_raw_t<float,8>::type simd_t; ///< the native SIMD type used
2426  typedef vfloat8 vfloat_t; ///< SIMD float type
2427  typedef vint8 vint_t; ///< SIMD int type
2428  typedef vbool8 vbool_t; ///< SIMD bool type
2429  typedef vint8 int_t; // old name (deprecated 1.8)
2430  typedef vbool8 bool_t; // old name (deprecated 1.8)
2431 
2432  /// Default constructor (contents undefined)
2433  vfloat8 () { }
2434 
2435  /// Construct from a single value (store it in all slots)
2436  vfloat8 (float a) { load(a); }
2437 
2438  /// Construct from 8 values
2439  vfloat8 (float a, float b, float c, float d,
2440  float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); }
2441 
2442  /// Construct from a pointer to 8 values
2443  vfloat8 (const float *f) { load (f); }
2444 
2445  /// Copy construct from another vfloat8
2446  vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }
2447 
2448  /// Construct from an int vector (promoting all components to float)
2449  explicit vfloat8 (const vint8& ival);
2450 
2451  /// Construct from two vfloat4's
2452  vfloat8 (const vfloat4 &lo, const vfloat4 &hi);
2453 
2454  /// Construct from the underlying SIMD type
2455  vfloat8 (const simd_t& m) : m_simd(m) { }
2456 
2457  /// Return the raw SIMD type
2458  operator simd_t () const { return m_simd; }
2459  simd_t simd () const { return m_simd; }
2460  simd_t& simd () { return m_simd; }
2461 
2462  /// Return a pointer to the underlying scalar type
2463  const value_t* data () const { return (const value_t*)this; }
2464  value_t* data () { return (value_t*)this; }
2465 
2466  /// Construct from a pointer to unsigned short values
2467  explicit vfloat8 (const unsigned short *vals) { load(vals); }
2468 
2469  /// Construct from a pointer to short values
2470  explicit vfloat8 (const short *vals) { load(vals); }
2471 
2472  /// Construct from a pointer to unsigned char values
2473  explicit vfloat8 (const unsigned char *vals) { load(vals); }
2474 
2475  /// Construct from a pointer to char values
2476  explicit vfloat8 (const char *vals) { load(vals); }
2477 
2478 #ifdef _HALF_H_
2479  /// Construct from a pointer to half (16 bit float) values
2480  explicit vfloat8 (const half *vals) { load(vals); }
2481 #endif
2482 
2483  /// Assign a single value to all components
2484  const vfloat8& operator= (float a) { load(a); return *this; }
2485 
2486  /// Assign a vfloat8
2487  const vfloat8& operator= (vfloat8 other) {
2488  m_simd = other.m_simd;
2489  return *this;
2490  }
2491 
2492  /// Return a vfloat8 with all components set to 0.0
2493  static const vfloat8 Zero ();
2494 
2495  /// Return a vfloat8 with all components set to 1.0
2496  static const vfloat8 One ();
2497 
2498  /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...)
2499  /// Optional argument can give a non-zero starting point and non-1 step.
2500  static const vfloat8 Iota (float start=0.0f, float step=1.0f);
2501 
2502  /// Set all components to 0.0
2503  void clear ();
2504 
2505  /// Component access (get)
2506  float operator[] (int i) const;
2507  /// Component access (set)
2508  float& operator[] (int i);
2509 
2510  /// Component access (set).
2511  void setcomp (int i, float value);
2512 
2513  value_t x () const;
2514  value_t y () const;
2515  value_t z () const;
2516  value_t w () const;
2517  void set_x (value_t val);
2518  void set_y (value_t val);
2519  void set_z (value_t val);
2520  void set_w (value_t val);
2521 
2522  /// Extract the lower half as a vfloat4
2523  vfloat4 lo () const;
2524 
2525  /// Extract the upper half as a vfloat4
2526  vfloat4 hi () const;
2527 
2528  /// Helper: load a single value into all components
2529  void load (float val);
2530 
2531  /// Helper: load 8 values
2532  void load (float a, float b, float c, float d,
2533  float e, float f, float g, float h);
2534 
2535  /// Load from an array of values
2536  void load (const float *values);
2537 
2538  /// Load from a partial array of <=8 values. Unassigned values are
2539  /// undefined.
2540  void load (const float *values, int n);
2541 
2542  /// Load from an array of 8 unsigned short values, convert to float
2543  void load (const unsigned short *values);
2544 
2545  /// Load from an array of 8 short values, convert to float
2546  void load (const short *values);
2547 
2548  /// Load from an array of 8 unsigned char values, convert to float
2549  void load (const unsigned char *values);
2550 
2551  /// Load from an array of 8 char values, convert to float
2552  void load (const char *values);
2553 
2554 #ifdef _HALF_H_
2555  /// Load from an array of 8 half values, convert to float
2556  void load (const half *values);
2557 #endif /* _HALF_H_ */
2558 
2559  void store (float *values) const;
2560 
2561  /// Store the first n values into memory
2562  void store (float *values, int n) const;
2563 
2564 #ifdef _HALF_H_
2565  void store (half *values) const;
2566 #endif
2567 
2568  /// Masked load -- read from values[] where mask is 1, load zero where
2569  /// mask is 0.
2570  void load_mask (int mask, const value_t *values);
2571  void load_mask (const vbool_t& mask, const value_t *values);
2572 
2573  /// Masked store -- write to values[] where mask is enabled, don't
2574  /// touch values[] where it's not.
2575  void store_mask (int mask, value_t *values) const;
2576  void store_mask (const vbool_t& mask, value_t *values) const;
2577 
2578  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2579  template<int scale=4>
2580  void gather (const value_t *baseptr, const vint_t& vindex);
2581  template<int scale=4>
2582  /// Gather elements defined by the mask, leave others unchanged.
2583  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
2584  template<int scale=4>
2585  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
2586 
2587  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2588  template<int scale=4>
2589  void scatter (value_t *baseptr, const vint_t& vindex) const;
2590  /// Scatter elements defined by the mask
2591  template<int scale=4>
2592  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2593  template<int scale=4>
2594  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
2595 
2596  // Arithmetic operators (component-by-component)
2597  friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
2598  friend vfloat8 operator- (const vfloat8& a);
2599  friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
2600  friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
2601  friend vfloat8 operator* (const vfloat8& a, float b);
2602  friend vfloat8 operator* (float a, const vfloat8& b);
2603  friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
2604  friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
2605  friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
2606  friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
2607  friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
2608  friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);
2609 
2610  // Comparison operations
2611  friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
2612  friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
2613  friend vbool8 operator< (const vfloat8& a, const vfloat8& b);
2614  friend vbool8 operator> (const vfloat8& a, const vfloat8& b);
2615  friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
2616  friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);
2617 
2618  // Some oddball items that are handy
2619 
2620  /// Stream output
2621  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);
2622 
2623 protected:
2624  // The actual data representation
2625  union {
2629  };
2630 };
2631 
2632 
2633 /// Helper: shuffle/swizzle with constant (templated) indices.
2634 /// Example: shuffle<1,1,2,2,5,5,6,6>(vfloat8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
2635 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
2636 vfloat8 shuffle (const vfloat8& a);
2637 
2638 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2639 template<int i> vfloat8 shuffle (const vfloat8& a);
2640 
2641 /// Helper: as rapid as possible extraction of one component, when the
2642 /// index is fixed.
2643 template<int i> float extract (const vfloat8& a);
2644 
2645 /// Helper: substitute val for a[i]
2646 template<int i> vfloat8 insert (const vfloat8& a, float val);
2647 
2648 /// The sum of all components, returned in all components.
2649 vfloat8 vreduce_add (const vfloat8& v);
2650 
2651 /// The sum of all components, returned as a scalar.
2652 float reduce_add (const vfloat8& v);
2653 
2654 /// Return the float dot (inner) product of a and b in every component.
2655 vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);
2656 
2657 /// Return the float dot (inner) product of a and b.
2658 float dot (const vfloat8 &a, const vfloat8 &b);
2659 
2660 /// Return the float 3-component dot (inner) product of a and b in
2661 /// all components.
2662 vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);
2663 
2664 /// Return the float 3-component dot (inner) product of a and b.
2665 float dot3 (const vfloat8 &a, const vfloat8 &b);
2666 
2667 /// Use a bool mask to select between components of a (if mask[i] is false)
2668 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2669 vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);
2670 
2671 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
2672 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2673 /// blend(0,a,mask).
2674 vfloat8 blend0 (const vfloat8& a, const vbool8& mask);
2675 
2676 /// Use a bool mask to select between components of a (if mask[i] is false)
2677 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2678 /// blend(0,a,!mask), or blend(a,0,mask).
2679 vfloat8 blend0not (const vfloat8& a, const vbool8& mask);
2680 
2681 /// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
2682 /// that is 0, return 0 rather than Inf.
2683 vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);
2684 
2685 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2686 /// synonym for blend with arguments rearranged, but this is more clear
2687 /// because the arguments are symmetric to scalar (cond ? a : b).
2688 vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);
2689 
2690 // Per-element math
2691 vfloat8 abs (const vfloat8& a); ///< absolute value (float)
2692 vfloat8 sign (const vfloat8& a); ///< 1.0 when value >= 0, -1 when negative
2693 vfloat8 ceil (const vfloat8& a);
2694 vfloat8 floor (const vfloat8& a);
2695 vint8 ifloor (const vfloat8& a); ///< (int)floor
2696 inline vint8 floori (const vfloat8& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2697 
2698 /// Per-element round to nearest integer.
2699 /// CAVEAT: the rounding when mid-way between integers may differ depending
2700 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
2701 /// integer) but std::round() says to round away from 0 regardless of
2702 /// current rounding mode (but that is multiple instructions on x64).
2703 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2704 /// match std::round().
2705 vfloat8 round (const vfloat8& a);
2706 
2707 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2708 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2709 /// C++ std::rint() which says to use the current rounding mode.
2710 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2711 /// match std::rint().
2712 vint8 rint (const vfloat8& a);
2713 
2714 vfloat8 rcp_fast (const vfloat8 &a); ///< Fast, approximate 1/a
2715 vfloat8 sqrt (const vfloat8 &a);
2716 vfloat8 rsqrt (const vfloat8 &a); ///< Fully accurate 1/sqrt
2717 vfloat8 rsqrt_fast (const vfloat8 &a); ///< Fast, approximate 1/sqrt
2718 vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min
2719 vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max
2720 // vfloat8 exp (const vfloat8& v); // See template with vfloat4
2721 // vfloat8 log (const vfloat8& v); // See template with vfloat4
2722 
2723 /// andnot(a,b) returns ((~a) & b)
2724 vfloat8 andnot (const vfloat8& a, const vfloat8& b);
2725 
2726 // Fused multiply and add (or subtract):
2727 vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b + c
2728 vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b - c
2729 vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c
2730 vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c
2731 
2732 
2733 
2734 /// Floating point 16-vector, accelerated by SIMD instructions when
2735 /// available.
2736 class vfloat16 {
2737 public:
2738  static const char* type_name() { return "vfloat16"; }
2739  typedef float value_t; ///< Underlying equivalent scalar value type
2740  enum { elements = 16 }; ///< Number of scalar elements
2741  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
2742  enum { bits = elements*32 }; ///< Total number of bits
2743  typedef simd_raw_t<float,16>::type simd_t; ///< the native SIMD type used
2744  typedef vfloat16 vfloat_t; ///< SIMD float type
2745  typedef vint16 vint_t; ///< SIMD int type
2746  typedef vbool16 vbool_t; ///< SIMD bool type
2747  typedef vint16 int_t; // old name (deprecated 1.8)
2748  typedef vbool16 bool_t; // old name (deprecated 1.8)
2749 
2750  /// Default constructor (contents undefined)
2751  vfloat16 () { }
2752 
2753  /// Construct from a single value (store it in all slots)
2754  vfloat16 (float a) { load(a); }
2755 
2756  /// Construct from 16 values
2757  vfloat16 (float v0, float v1, float v2, float v3,
2758  float v4, float v5, float v6, float v7,
2759  float v8, float v9, float v10, float v11,
2760  float v12, float v13, float v14, float v15);
2761 
2762  /// Construct from a pointer to 16 values
2763  vfloat16 (const float *f) { load (f); }
2764 
2765  /// Copy construct from another vfloat16
2766  vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }
2767 
2768  /// Construct from an int vector (promoting all components to float)
2769  explicit vfloat16 (const vint16& ival);
2770 
2771  /// Construct from two vfloat8's
2772  vfloat16 (const vfloat8 &lo, const vfloat8 &hi);
2773 
2774  /// Construct from four vfloat4's
2775  vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d);
2776 
2777  /// Construct from the underlying SIMD type
2778  vfloat16 (const simd_t& m) : m_simd(m) { }
2779 
2780  /// Return the raw SIMD type
2781  operator simd_t () const { return m_simd; }
2782  simd_t simd () const { return m_simd; }
2783  simd_t& simd () { return m_simd; }
2784 
2785  /// Return a pointer to the underlying scalar type
2786  const value_t* data () const { return (const value_t*)this; }
2787  value_t* data () { return (value_t*)this; }
2788 
2789  /// Construct from a pointer to unsigned short values
2790  explicit vfloat16 (const unsigned short *vals) { load(vals); }
2791 
2792  /// Construct from a pointer to short values
2793  explicit vfloat16 (const short *vals) { load(vals); }
2794 
2795  /// Construct from a pointer to unsigned char values
2796  explicit vfloat16 (const unsigned char *vals) { load(vals); }
2797 
2798  /// Construct from a pointer to char values
2799  explicit vfloat16 (const char *vals) { load(vals); }
2800 
2801 #ifdef _HALF_H_
2802  /// Construct from a pointer to half (16 bit float) values
2803  explicit vfloat16 (const half *vals) { load(vals); }
2804 #endif
2805 
2806  /// Assign a single value to all components
2807  const vfloat16& operator= (float a) { load(a); return *this; }
2808 
2809  /// Assign a vfloat16
2810  const vfloat16& operator= (vfloat16 other) {
2811  m_simd = other.m_simd;
2812  return *this;
2813  }
2814 
2815  /// Return a vfloat16 with all components set to 0.0
2816  static const vfloat16 Zero ();
2817 
2818  /// Return a vfloat16 with all components set to 1.0
2819  static const vfloat16 One ();
2820 
2821  /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...)
2822  /// Optional argument can give a non-zero starting point and non-1 step.
2823  static const vfloat16 Iota (float start=0.0f, float step=1.0f);
2824 
2825  /// Set all components to 0.0
2826  void clear ();
2827 
2828  /// Component access (get)
2829  float operator[] (int i) const;
2830  /// Component access (set)
2831  float& operator[] (int i);
2832 
2833  /// Component access (set).
2834  void setcomp (int i, float value);
2835 
2836  value_t x () const;
2837  value_t y () const;
2838  value_t z () const;
2839  value_t w () const;
2840  void set_x (value_t val);
2841  void set_y (value_t val);
2842  void set_z (value_t val);
2843  void set_w (value_t val);
2844 
2845  /// Extract the lower half as a vfloat8
2846  vfloat8 lo () const;
2847 
2848  /// Extract the upper half as a vfloat8
2849  vfloat8 hi () const;
2850 
2851  /// Helper: load a single value into all components
2852  void load (float val);
2853 
2854  /// Load separate values into each component.
2855  void load (float v0, float v1, float v2, float v3,
2856  float v4, float v5, float v6, float v7,
2857  float v8, float v9, float v10, float v11,
2858  float v12, float v13, float v14, float v15);
2859 
2860  /// Load from an array of values
2861  void load (const float *values);
2862 
2863  /// Load from a partial array of <=16 values. Unassigned values are
2864  /// undefined.
2865  void load (const float *values, int n);
2866 
2867  /// Load from an array of 16 unsigned short values, convert to float
2868  void load (const unsigned short *values);
2869 
2870  /// Load from an array of 16 short values, convert to float
2871  void load (const short *values);
2872 
2873  /// Load from an array of 16 unsigned char values, convert to float
2874  void load (const unsigned char *values);
2875 
2876  /// Load from an array of 16 char values, convert to float
2877  void load (const char *values);
2878 
2879 #ifdef _HALF_H_
2880  /// Load from an array of 16 half values, convert to float
2881  void load (const half *values);
2882 #endif /* _HALF_H_ */
2883 
2884  void store (float *values) const;
2885 
2886  /// Store the first n values into memory
2887  void store (float *values, int n) const;
2888 
2889 #ifdef _HALF_H_
2890  void store (half *values) const;
2891 #endif
2892 
2893  /// Masked load -- read from values[] where mask is 1, load zero where
2894  /// mask is 0.
2895  void load_mask (const vbool_t &mask, const value_t *values);
2896  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
2897 
2898  /// Masked store -- write to values[] where mask is enabled, don't
2899  /// touch values[] where it's not.
2900  void store_mask (const vbool_t &mask, value_t *values) const;
2901  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
2902 
2903  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2904  template<int scale=4>
2905  void gather (const value_t *baseptr, const vint_t& vindex);
2906  /// Gather elements defined by the mask, leave others unchanged.
2907  template<int scale=4>
2908  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
2909  template<int scale=4>
2910  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
2911  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
2912  }
2913 
2914  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2915  template<int scale=4>
2916  void scatter (value_t *baseptr, const vint_t& vindex) const;
2917  /// Scatter elements defined by the mask
2918  template<int scale=4>
2919  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2920  template<int scale=4>
2921  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
2922  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
2923  }
2924 
2925  // Arithmetic operators (component-by-component)
2926  friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b);
2927  friend vfloat16 operator- (const vfloat16& a);
2928  friend vfloat16 operator- (const vfloat16& a, const vfloat16& b);
2929  friend vfloat16 operator* (const vfloat16& a, const vfloat16& b);
2930  friend vfloat16 operator* (const vfloat16& a, float b);
2931  friend vfloat16 operator* (float a, const vfloat16& b);
2932  friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b);
2933  friend vfloat16 operator% (const vfloat16& a, const vfloat16& b);
2934  friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b);
2935  friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b);
2936  friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b);
2937  friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b);
2938 
2939  // Comparison operations
2940  friend vbool16 operator== (const vfloat16& a, const vfloat16& b);
2941  friend vbool16 operator!= (const vfloat16& a, const vfloat16& b);
2942  friend vbool16 operator< (const vfloat16& a, const vfloat16& b);
2943  friend vbool16 operator> (const vfloat16& a, const vfloat16& b);
2944  friend vbool16 operator>= (const vfloat16& a, const vfloat16& b);
2945  friend vbool16 operator<= (const vfloat16& a, const vfloat16& b);
2946 
2947  // Some oddball items that are handy
2948 
2949  /// Stream output
2950  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val);
2951 
2952 protected:
2953  // The actual data representation
2954  union {
2958  };
2959 };
2960 
2961 
2962 /// Shuffle groups of 4
2963 template<int i0, int i1, int i2, int i3>
2964 vfloat16 shuffle4 (const vfloat16& a);
2965 
2966 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
2967 template<int i> vfloat16 shuffle4 (const vfloat16& a);
2968 
2969 /// Shuffle within each group of 4
2970 template<int i0, int i1, int i2, int i3>
2971 vfloat16 shuffle (const vfloat16& a);
2972 
2973 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2974 template<int i> vfloat16 shuffle (const vfloat16& a);
2975 
2976 /// Helper: as rapid as possible extraction of one component, when the
2977 /// index is fixed.
2978 template<int i> float extract (const vfloat16& a);
2979 
2980 /// Helper: substitute val for a[i]
2981 template<int i> vfloat16 insert (const vfloat16& a, float val);
2982 
2983 /// The sum of all components, returned in all components.
2984 vfloat16 vreduce_add (const vfloat16& v);
2985 
2986 /// The sum of all components, returned as a scalar.
2987 float reduce_add (const vfloat16& v);
2988 
2989 /// Use a bool mask to select between components of a (if mask[i] is false)
2990 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2991 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool4& mask);
2992 
2993 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2994 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2995 /// blend(0,a,mask).
2996 vfloat16 blend0 (const vfloat16& a, const vbool4& mask);
2997 
2998 /// Use a bool mask to select between components of a (if mask[i] is false)
2999 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
3000 /// blend(0,a,!mask), or blend(a,0,mask).
3001 vfloat16 blend0not (const vfloat16& a, const vbool4& mask);
3002 
3003 /// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor
3004 /// that is 0, return 0 rather than Inf.
3005 vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b);
3006 
3007 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
3008 /// synonym for blend with arguments rearranged, but this is more clear
3009 /// because the arguments are symmetric to scalar (cond ? a : b).
3010 vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);
3011 
3012 // Per-element math
3013 vfloat16 abs (const vfloat16& a); ///< absolute value (float)
3014 vfloat16 sign (const vfloat16& a); ///< 1.0 when value >= 0, -1 when negative
3015 vfloat16 ceil (const vfloat16& a);
3016 vfloat16 floor (const vfloat16& a);
3017 vint16 ifloor (const vfloat16& a); ///< (int)floor
3018 inline vint16 floori (const vfloat16& a) { return ifloor(a); } // DEPRECATED(1.8) alias
3019 
3020 /// Per-element round to nearest integer.
3021 /// CAVEAT: the rounding when mid-way between integers may differ depending
3022 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
3023 /// integer) but std::round() says to round away from 0 regardless of
3024 /// current rounding mode (but that is multiple instructions on x64).
3025 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3026 /// match std::round().
3027 vfloat16 round (const vfloat16& a);
3028 
3029 /// Per-element round to nearest integer (equivalent to vint(round(a))).
3030 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
3031 /// C++ std::rint() which says to use the current rounding mode.
3032 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
3033 /// match std::rint().
3034 vint16 rint (const vfloat16& a);
3035 
3036 vfloat16 rcp_fast (const vfloat16 &a); ///< Fast, approximate 1/a
3037 vfloat16 sqrt (const vfloat16 &a);
3038 vfloat16 rsqrt (const vfloat16 &a); ///< Fully accurate 1/sqrt
3039 vfloat16 rsqrt_fast (const vfloat16 &a); ///< Fast, approximate 1/sqrt
3040 vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
3041 vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
3042 // vfloat16 exp (const vfloat16& v); // See template with vfloat4
3043 // vfloat16 log (const vfloat16& v); // See template with vfloat4
3044 
3045 /// andnot(a,b) returns ((~a) & b)
3046 vfloat16 andnot (const vfloat16& a, const vfloat16& b);
3047 
3048 // Fused multiply and add (or subtract):
3049 vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b + c
3050 vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b - c
3051 vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
3052 vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c
3053 
3054 
3055 
3056 // Odds and ends, other CPU hardware tricks
3057 
// Turn the x86 flush-to-zero CPU flag on or off.
//
// Returns true when the flag was written. Returns false when this build
// cannot do so: the target is not x86/x86-64, or the compiler is a GCC whose
// OIIO_GNUC_VERSION is not above 40900 (the original note attributes this
// to a gcc 4.8 bug that lacks the intrinsic).
inline bool set_flush_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    const unsigned int requested = on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
    _MM_SET_FLUSH_ZERO_MODE (requested);
    return true;
#else
    (void) on;  // nothing to set on this platform/compiler
    return false;
#endif
}
3068 
// Turn the x86 denormals-are-zero CPU flag on or off.
//
// Returns true when the flag was written. Returns false when this build
// cannot do so: the target is not x86/x86-64, or the compiler is a GCC whose
// OIIO_GNUC_VERSION is not above 40900 (the original note attributes this
// to a gcc 4.8 bug that lacks the intrinsic).
inline bool set_denorms_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    const unsigned int requested = on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
    _MM_SET_DENORMALS_ZERO_MODE (requested);
    return true;
#else
    (void) on;  // nothing to set on this platform/compiler
    return false;
#endif
}
3079 
// Report whether the x86 flush-to-zero CPU flag is currently on.
// Always false when the target is not x86/x86-64 or the compiler fails the
// OIIO_GNUC_VERSION gate below.
inline bool get_flush_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    const bool ftz_enabled = (_MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON);
    return ftz_enabled;
#else
    return false;
#endif
}
3087 
// Report whether the x86 denormals-are-zero CPU flag is currently on.
// Always false when the target is not x86/x86-64 or the compiler fails the
// OIIO_GNUC_VERSION gate below.
inline bool get_denorms_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
    const bool daz_enabled = (_MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON);
    return daz_enabled;
#else
    return false;
#endif
}
3095 
3096 
3097 
3098 
3099 
3100 
3101 //////////////////////////////////////////////////////////////////////////
3102 //////////////////////////////////////////////////////////////////////////
3103 //
3104 // Gory implementation details follow.
3105 //
3106 // ^^^ All declarations and documentation are above ^^^
3107 //
3108 // vvv Below is the implementation, often considerably cluttered with
3109 // #if's for each architecture, and unapologetic use of intrinsics and
3110 // every manner of dirty trick we can think of to make things fast.
3111 // Some of this isn't pretty. We won't recapitulate comments or
3112 // documentation of what the functions are supposed to do, please
3113 // consult the declarations above for that.
3114 //
3115 // Here be dragons.
3116 //
3117 //////////////////////////////////////////////////////////////////////////
3118 //////////////////////////////////////////////////////////////////////////
3119 
3120 
3121 
3122 //////////////////////////////////////////////////////////////////////
3123 // vbool4 implementation
3124 
3125 
3127  OIIO_DASSERT(i >= 0 && i < elements);
3128 #if OIIO_SIMD_SSE
3129  return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3130 #else
3131  return m_val[i];
3132 #endif
3133 }
3134 
3136  OIIO_DASSERT(i >= 0 && i < elements);
3137  return m_val[i];
3138 }
3139 
3140 
3142  OIIO_DASSERT(i >= 0 && i < elements);
3143  m_val[i] = value ? -1 : 0;
3144 }
3145 
3146 
3147 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) {
3148  cout << a[0];
3149  for (int i = 1; i < a.elements; ++i)
3150  cout << ' ' << a[i];
3151  return cout;
3152 }
3153 
3154 
3156 #if OIIO_SIMD_SSE
3157  m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a)));
3158 #elif OIIO_SIMD_NEON
3159  m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3160 #else
3161  int val = -int(a);
3162  SIMD_CONSTRUCT (val);
3163 #endif
3164 }
3165 
3166 
3167 OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
3168 #if OIIO_SIMD_SSE
3169  // N.B. -- we need to reverse the order because of our convention
3170  // of storing a,b,c,d in the same order in memory.
3171  m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
3172 // #elif OIIO_SIMD_NEON
3173 // FIXME
3174 #else
3175  m_val[0] = -int(a);
3176  m_val[1] = -int(b);
3177  m_val[2] = -int(c);
3178  m_val[3] = -int(d);
3179 #endif
3180 }
3181 
3183  load (a[0], a[1], a[2], a[3]);
3184 }
3185 
3187  m_simd = other.m_simd;
3188  return *this;
3189 }
3190 
3191 
3193 #if OIIO_SIMD_SSE
3194  return _mm_movemask_ps(m_simd);
3195 #else
3196  int r = 0;
3197  for (int i = 0; i < elements; ++i)
3198  if (m_val[i])
3199  r |= 1<<i;
3200  return r;
3201 #endif
3202 }
3203 
3204 
3206 vbool4::from_bitmask (int bitmask) {
3207  // I think this is a fast conversion from int bitmask to vbool4
3208  return (vint4::Giota() & vint4(bitmask)) != vint4::Zero();
3209 }
3210 
3211 
3213 #if OIIO_SIMD_SSE
3214  m_simd = _mm_setzero_ps();
3215 #else
3216  *this = false;
3217 #endif
3218 }
3219 
3220 
3222 #if OIIO_SIMD_SSE
3223  return _mm_setzero_ps();
3224 #else
3225  return false;
3226 #endif
3227 }
3228 
3230  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3231 #if OIIO_SIMD_SSE
3232 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3233  __m128i anyval = _mm_undefined_si128();
3234 # else
3235  __m128i anyval = _mm_setzero_si128();
3236 # endif
3237  return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3238 #else
3239  return true;
3240 #endif
3241 }
3242 
3244  SIMD_DO (values[i] = m_val[i] ? true : false);
3245 }
3246 
3247 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const {
3248  OIIO_DASSERT (n >= 0 && n <= elements);
3249  for (int i = 0; i < n; ++i)
3250  values[i] = m_val[i] ? true : false;
3251 }
3252 
3253 
3254 
3256 #if OIIO_SIMD_SSE
3257  return _mm_xor_ps (a.simd(), vbool4::True());
3258 #elif OIIO_SIMD_NEON
3259  return vmvnq_u32(a.simd());
3260 #else
3261  SIMD_RETURN (vbool4, a[i] ^ (-1));
3262 #endif
3263 }
3264 
3266 #if OIIO_SIMD_SSE
3267  return _mm_and_ps (a.simd(), b.simd());
3268 #elif OIIO_SIMD_NEON
3269  return vandq_u32(a.simd(), b.simd());
3270 #else
3271  SIMD_RETURN (vbool4, a[i] & b[i]);
3272 #endif
3273 }
3274 
3276 #if OIIO_SIMD_SSE
3277  return _mm_or_ps (a.simd(), b.simd());
3278 #elif OIIO_SIMD_NEON
3279  return vorrq_u32(a.simd(), b.simd());
3280 #else
3281  SIMD_RETURN (vbool4, a[i] | b[i]);
3282 #endif
3283 }
3284 
3286 #if OIIO_SIMD_SSE
3287  return _mm_xor_ps (a.simd(), b.simd());
3288 #elif OIIO_SIMD_NEON
3289  return veorq_u32(a.simd(), b.simd());
3290 #else
3291  SIMD_RETURN (vbool4, a[i] ^ b[i]);
3292 #endif
3293 }
3294 
3295 
3297  return a = a & b;
3298 }
3299 
3301  return a = a | b;
3302 }
3303 
3305  return a = a ^ b;
3306 }
3307 
3309 #if OIIO_SIMD_SSE
3310  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3311  return _mm_xor_ps (a.simd(), vbool4::True());
3312 #elif OIIO_SIMD_NEON
3313  return vmvnq_u32(a.m_simd);
3314 #else
3315  SIMD_RETURN (vbool4, ~a[i]);
3316 #endif
3317 }
3318 
3320 #if OIIO_SIMD_SSE
3321  return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3322 #elif OIIO_SIMD_NEON
3323  return vceqq_u32 (a.m_simd, b.m_simd);
3324 #else
3325  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
3326 #endif
3327 }
3328 
3330 #if OIIO_SIMD_SSE
3331  return _mm_xor_ps (a, b);
3332 #elif OIIO_SIMD_NEON
3333  return !(a == b);
3334 #else
3335  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
3336 #endif
3337 }
3338 
3339 
3340 
3341 
#if OIIO_SIMD_SSE
/// Permute the four 32-bit lanes of an integer SSE register using
/// compile-time indices: result lane 0 is input lane i0, result lane 1 is
/// input lane i1, and so on.
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) {
    // _MM_SHUFFLE takes its selectors from the highest lane down.
    const int selector = _MM_SHUFFLE(i3, i2, i1, i0);
    return _mm_shuffle_epi32(v, selector);
}
#endif
3349 
#if OIIO_SIMD_SSE >= 3
// SSE3 offers dedicated duplicate instructions for these lane patterns.
// They operate on float/double registers, so the integer register is
// bit-cast on the way in and on the way out.
template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) {
    const __m128 as_float = _mm_castsi128_ps(a);
    return _mm_castps_si128(_mm_moveldup_ps(as_float));
}
template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) {
    const __m128 as_float = _mm_castsi128_ps(a);
    return _mm_castps_si128(_mm_movehdup_ps(as_float));
}
template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) {
    const __m128d as_double = _mm_castsi128_pd(a);
    return _mm_castpd_si128(_mm_movedup_pd(as_double));
}
#endif
3362 
#if OIIO_SIMD_SSE
/// Float-register flavor of shuffle_sse: the register is bit-cast to
/// integer, permuted with the 32-bit integer shuffle, and cast back.
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) {
    const __m128i as_int   = _mm_castps_si128(a);
    const __m128i permuted = _mm_shuffle_epi32(as_int, _MM_SHUFFLE(i3, i2, i1, i0));
    return _mm_castsi128_ps(permuted);
}
#endif
3369 
#if OIIO_SIMD_SSE >= 3
// SSE3 offers dedicated duplicate instructions for these lane patterns.
template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 0, 2, 2> (__m128 a) {
    const __m128 even_lanes_duplicated = _mm_moveldup_ps(a);
    return even_lanes_duplicated;
}
template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) {
    const __m128 odd_lanes_duplicated = _mm_movehdup_ps(a);
    return odd_lanes_duplicated;
}
template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) {
    // The low 64 bits (lanes 0 and 1) are duplicated as one double.
    const __m128d as_double = _mm_castps_pd(a);
    return _mm_castpd_ps(_mm_movedup_pd(as_double));
}
#endif
3382 
3383 
3384 /// Helper: shuffle/swizzle with constant (templated) indices.
3385 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
3386 template<int i0, int i1, int i2, int i3>
3388 #if OIIO_SIMD_SSE
3389  return shuffle_sse<i0,i1,i2,i3> (a.simd());
3390 #else
3391  return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3392 #endif
3393 }
3394 
3395 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3396 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3397  return shuffle<i,i,i,i>(a);
3398 }
3399 
3400 
3401 /// Helper: as rapid as possible extraction of one component, when the
3402 /// index is fixed.
3403 template<int i>
3405 #if OIIO_SIMD_SSE >= 4
3406  return _mm_extract_epi32(_mm_castps_si128(a.simd()), i); // SSE4.1 only
3407 #else
3408  return a[i];
3409 #endif
3410 }
3411 
3412 /// Helper: substitute val for a[i]
3413 template<int i>
3415 #if OIIO_SIMD_SSE >= 4
3416  int ival = -int(val);
3417  return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3418 #else
3419  vbool4 tmp = a;
3420  tmp[i] = -int(val);
3421  return tmp;
3422 #endif
3423 }
3424 
3426 #if OIIO_SIMD_AVX
3427  return _mm_testc_ps (v, vbool4(true)) != 0;
3428 #elif OIIO_SIMD_SSE
3429  return _mm_movemask_ps(v.simd()) == 0xf;
3430 #else
3431  SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0));
3432 #endif
3433 }
3434 
3436 #if OIIO_SIMD_AVX
3437  return ! _mm_testz_ps (v, v);
3438 #elif OIIO_SIMD_SSE
3439  return _mm_movemask_ps(v) != 0;
3440 #else
3441  SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0));
3442 #endif
3443 }
3444 
3445 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; }
3446 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; }
3447 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; }
3448 
3449 
3450 
3451 //////////////////////////////////////////////////////////////////////
3452 // vbool8 implementation
3453 
3454 
3456  OIIO_DASSERT(i >= 0 && i < elements);
3457 #if OIIO_SIMD_AVX
3458  return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3459 #else
3460  return m_val[i];
3461 #endif
3462 }
3463 
3465  OIIO_DASSERT(i >= 0 && i < elements);
3466  m_val[i] = value ? -1 : 0;
3467 }
3468 
3470  OIIO_DASSERT(i >= 0 && i < elements);
3471  return m_val[i];
3472 }
3473 
3474 
3475 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) {
3476  cout << a[0];
3477  for (int i = 1; i < a.elements; ++i)
3478  cout << ' ' << a[i];
3479  return cout;
3480 }
3481 
3482 
3484 #if OIIO_SIMD_AVX
3485  m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a)));
3486 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3487  m_4[0].load(a);
3488  m_4[1].load(a);
3489 #else
3490  int val = -int(a);
3491  SIMD_CONSTRUCT (val);
3492 #endif
3493 }
3494 
3495 
3496 OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d,
3497  bool e, bool f, bool g, bool h) {
3498 #if OIIO_SIMD_AVX
3499  // N.B. -- we need to reverse the order because of our convention
3500  // of storing a,b,c,d in the same order in memory.
3501  m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e),
3502  -int(d), -int(c), -int(b), -int(a)));
3503 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3504  m_4[0].load(a, b, c, d);
3505  m_4[1].load(e, f, g, h);
3506 #else
3507  m_val[0] = -int(a);
3508  m_val[1] = -int(b);
3509  m_val[2] = -int(c);
3510  m_val[3] = -int(d);
3511  m_val[4] = -int(e);
3512  m_val[5] = -int(f);
3513  m_val[6] = -int(g);
3514  m_val[7] = -int(h);
3515 #endif
3516 }
3517 
3518 OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d,
3519  bool e, bool f, bool g, bool h) {
3520  load (a, b, c, d, e, f, g, h);
3521 }
3522 
3523 OIIO_FORCEINLINE vbool8::vbool8 (int a, int b, int c, int d,
3524  int e, int f, int g, int h) {
3525  load (bool(a), bool(b), bool(c), bool(d),
3526  bool(e), bool(f), bool(g), bool(h));
3527 }
3528 
3530  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3531 }
3532 
3533 
3535  load(a);
3536  return *this;
3537 }
3538 
3540  m_simd = other.m_simd;
3541  return *this;
3542 }
3543 
3545 #if OIIO_SIMD_AVX
3546  return _mm256_movemask_ps(m_simd);
3547 #else
3548  return lo().bitmask() | (hi().bitmask() << 4);
3549 #endif
3550 }
3551 
3552 
3554 vbool8::from_bitmask (int bitmask) {
3555  // I think this is a fast conversion from int bitmask to vbool8
3556  return (vint8::Giota() & vint8(bitmask)) != vint8::Zero();
3557 }
3558 
3559 
3561 #if OIIO_SIMD_AVX
3562  m_simd = _mm256_setzero_ps();
3563 #else
3564  *this = false;
3565 #endif
3566 }
3567 
3569 #if OIIO_SIMD_AVX
3570  return _mm256_setzero_ps();
3571 #else
3572  return false;
3573 #endif
3574 }
3575 
3576 
3578 #if OIIO_SIMD_AVX
3579 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3580  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3581  __m256i anyval = _mm256_undefined_si256();
3582  return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3583 # else
3584  return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3585 # endif
3586 #else
3587  return true;
3588 #endif
3589 }
3590 
3591 
3593  SIMD_DO (values[i] = m_val[i] ? true : false);
3594 }
3595 
3596 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const {
3597  OIIO_DASSERT (n >= 0 && n <= elements);
3598  for (int i = 0; i < n; ++i)
3599  values[i] = m_val[i] ? true : false;
3600 }
3601 
3602 
3604 #if OIIO_SIMD_AVX
3605  return _mm256_castps256_ps128 (simd());
3606 #else
3607  return m_4[0];
3608 #endif
3609 }
3610 
3612 #if OIIO_SIMD_AVX
3613  return _mm256_extractf128_ps (simd(), 1);
3614 #else
3615  return m_4[1];
3616 #endif
3617 }
3618 
3619 
3621 #if OIIO_SIMD_AVX
3622  __m256 r = _mm256_castps128_ps256 (lo);
3623  m_simd = _mm256_insertf128_ps (r, hi, 1);
3624  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
3625 #else
3626  m_4[0] = lo;
3627  m_4[1] = hi;
3628 #endif
3629 }
3630 
3631 
3633 #if OIIO_SIMD_AVX
3634  return _mm256_xor_ps (a.simd(), vbool8::True());
3635 #else
3636  SIMD_RETURN (vbool8, a[i] ^ (-1));
3637 #endif
3638 }
3639 
3641 #if OIIO_SIMD_AVX
3642  return _mm256_and_ps (a.simd(), b.simd());
3643 #else
3644  SIMD_RETURN (vbool8, a[i] & b[i]);
3645 #endif
3646 }
3647 
3649 #if OIIO_SIMD_AVX
3650  return _mm256_or_ps (a.simd(), b.simd());
3651 #else
3652  SIMD_RETURN (vbool8, a[i] | b[i]);
3653 #endif
3654 }
3655 
3657 #if OIIO_SIMD_AVX
3658  return _mm256_xor_ps (a.simd(), b.simd());
3659 #else
3660  SIMD_RETURN (vbool8, a[i] ^ b[i]);
3661 #endif
3662 }
3663 
3664 
3666  return a = a & b;
3667 }
3668 
3670  return a = a | b;
3671 }
3672 
3674  return a = a ^ b;
3675 }
3676 
3677 
3679 #if OIIO_SIMD_AVX
3680  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3681  return _mm256_xor_ps (a.simd(), vbool8::True());
3682 #else
3683  SIMD_RETURN (vbool8, ~a[i]);
3684 #endif
3685 }
3686 
3687 
3689 #if OIIO_SIMD_AVX >= 2
3690  return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3691 #elif OIIO_SIMD_AVX
3692  return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3693 #else
3694  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
3695 #endif
3696 }
3697 
3699 #if OIIO_SIMD_AVX
3700  return _mm256_xor_ps (a, b);
3701 #else
3702  SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0);
3703 #endif
3704 }
3705 
3706 
3707 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
3709 #if OIIO_SIMD_AVX >= 2
3710  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3711  return _mm256_permutevar8x32_ps (a.simd(), index.simd());
3712 #else
3713  return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3714 #endif
3715 }
3716 
3717 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3718  return shuffle<i,i,i,i,i,i,i,i>(a);
3719 }
3720 
3721 
3722 template<int i>
3724 #if OIIO_SIMD_AVX && !_WIN32
3725  return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i); // SSE4.1 only
3726 #else
3727  return a[i];
3728 #endif
3729 }
3730 
3731 template<int i>
3733 #if OIIO_SIMD_AVX && !_WIN32
3734  int ival = -int(val);
3735  return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i));
3736 #else
3737  vbool8 tmp = a;
3738  tmp[i] = -int(val);
3739  return tmp;
3740 #endif
3741 }
3742 
3743 
3745 #if OIIO_SIMD_AVX
3746  return _mm256_testc_ps (v, vbool8(true)) != 0;
3747  // return _mm256_movemask_ps(v.simd()) == 0xff;
3748 #else
3749  SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i]));
3750 #endif
3751 }
3752 
3754 #if OIIO_SIMD_AVX
3755  return ! _mm256_testz_ps (v, v); // FIXME? Not in all immintrin.h !
3756  // return _mm256_movemask_ps(v) != 0;
3757 #else
3758  SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i]));
3759 #endif
3760 }
3761 
3762 
3763 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; }
3764 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; }
3765 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; }
3766 
3767 
3768 
3769 //////////////////////////////////////////////////////////////////////
3770 // vbool16 implementation
3771 
3772 
3774  OIIO_DASSERT(i >= 0 && i < elements);
3775 #if OIIO_SIMD_AVX >= 512
3776  return (int(m_simd) >> i) & 1;
3777 #else
3778  return (m_bits >> i) & 1;
3779 #endif
3780 }
3781 
3783  OIIO_DASSERT(i >= 0 && i < elements);
3784  int bits = m_bits;
3785  bits &= (0xffff ^ (1<<i));
3786  bits |= (int(value)<<i);
3787  m_bits = bits;
3788 }
3789 
3790 
3791 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) {
3792  cout << a[0];
3793  for (int i = 1; i < a.elements; ++i)
3794  cout << ' ' << a[i];
3795  return cout;
3796 }
3797 
3798 
3800  m_simd = a ? 0xffff : 0;
3801 }
3802 
3803 
3805  m_simd = simd_t(a);
3806 }
3807 
3808 
3809 OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3,
3810  bool v4, bool v5, bool v6, bool v7,
3811  bool v8, bool v9, bool v10, bool v11,
3812  bool v12, bool v13, bool v14, bool v15) {
3813  m_simd = simd_t((int(v0) << 0) |
3814  (int(v1) << 1) |
3815  (int(v2) << 2) |
3816  (int(v3) << 3) |
3817  (int(v4) << 4) |
3818  (int(v5) << 5) |
3819  (int(v6) << 6) |
3820  (int(v7) << 7) |
3821  (int(v8) << 8) |
3822  (int(v9) << 9) |
3823  (int(v10) << 10) |
3824  (int(v11) << 11) |
3825  (int(v12) << 12) |
3826  (int(v13) << 13) |
3827  (int(v14) << 14) |
3828  (int(v15) << 15));
3829 }
3830 
3831 OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3,
3832  bool v4, bool v5, bool v6, bool v7,
3833  bool v8, bool v9, bool v10, bool v11,
3834  bool v12, bool v13, bool v14, bool v15) {
3835  load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3836 }
3837 
3839  int v4, int v5, int v6, int v7,
3840  int v8, int v9, int v10, int v11,
3841  int v12, int v13, int v14, int v15) {
3842  load (bool(v0), bool(v1), bool(v2), bool(v3),
3843  bool(v4), bool(v5), bool(v6), bool(v7),
3844  bool(v8), bool(v9), bool(v10), bool(v11),
3845  bool(v12), bool(v13), bool(v14), bool(v15));
3846 }
3847 
3849  load_bitmask (a.bitmask() | (b.bitmask() << 8));
3850 }
3851 
3853  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3854  a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3855 }
3856 
3857 
3859  load(a);
3860  return *this;
3861 }
3862 
3864  m_simd = other.m_simd;
3865  return *this;
3866 }
3867 
3868 
3870 #if OIIO_SIMD_AVX >= 512
3871  return int(m_simd);
3872 #else
3873  return int(m_bits);
3874 #endif
3875 }
3876 
3877 
3879  m_simd = simd_t(0);
3880 }
3881 
3883  return simd_t(0);
3884 }
3885 
3886 
3888  return simd_t(0xffff);
3889 }
3890 
3891 
3893  SIMD_DO (values[i] = m_bits & (1<<i));
3894 }
3895 
3896 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const {
3897  OIIO_DASSERT (n >= 0 && n <= elements);
3898  for (int i = 0; i < n; ++i)
3899  values[i] = m_bits & (1<<i);
3900 }
3901 
3902 
3903 
3905 #if OIIO_SIMD_AVX >= 512
3906  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1));
3907 #else
3908  SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0);
3909 #endif
3910 }
3911 
3913 #if OIIO_SIMD_AVX >= 512
3914  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1));
3915 #else
3916  SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0);
3917 #endif
3918 }
3919 
3920 
3922 #if OIIO_SIMD_AVX >= 512
3923  return _mm512_knot (a.simd());
3924 #else
3925  return vbool16 (a.m_bits ^ 0xffff);
3926 #endif
3927 }
3928 
3930 #if OIIO_SIMD_AVX >= 512
3931  return _mm512_kand (a.simd(), b.simd());
3932 #else
3933  return vbool16 (a.m_bits & b.m_bits);
3934 #endif
3935 }
3936 
3938 #if OIIO_SIMD_AVX >= 512
3939  return _mm512_kor (a.simd(), b.simd());
3940 #else
3941  return vbool16 (a.m_bits | b.m_bits);
3942 #endif
3943 }
3944 
3946 #if OIIO_SIMD_AVX >= 512
3947  return _mm512_kxor (a.simd(), b.simd());
3948 #else
3949  return vbool16 (a.m_bits ^ b.m_bits);
3950 #endif
3951 }
3952 
3953 
3955  return a = a & b;
3956 }
3957 
3959  return a = a | b;
3960 }
3961 
3963  return a = a ^ b;
3964 }
3965 
3966 
3968  return a ^ vbool16::True();
3969 }
3970 
3971 
3973 #if OIIO_SIMD_AVX >= 512
3974  return _mm512_kxnor (a.simd(), b.simd());
3975 #else
3976  return vbool16 (!(a.m_bits ^ b.m_bits));
3977 #endif
3978 }
3979 
3981 #if OIIO_SIMD_AVX >= 512
3982  return _mm512_kxor (a.simd(), b.simd());
3983 #else
3984  return vbool16 (a.m_bits ^ b.m_bits);
3985 #endif
3986 }
3987 
3988 
3989 template<int i>
3991  return a[i];
3992 }
3993 
3994 template<int i>
3996