HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
simd.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2014 Larry Gritz et al.
3 All Rights Reserved.
4 
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are
7 met:
8 * Redistributions of source code must retain the above copyright
9  notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11  notice, this list of conditions and the following disclaimer in the
12  documentation and/or other materials provided with the distribution.
13 * Neither the name of Sony Pictures Imageworks nor the names of its
14  contributors may be used to endorse or promote products derived from
15  this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 
29 /// @file simd.h
30 ///
31 /// @brief Classes for SIMD processing.
32 ///
33 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.):
34 /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
35 ///
36 /// It helped me a lot to peruse the source of these packages:
37 /// Syrah: https://github.com/boulos/syrah
38 /// Embree: https://github.com/embree
39 /// Vectorial: https://github.com/scoopr/vectorial
40 ///
41 /// To find out which CPU features you have:
42 /// Linux: cat /proc/cpuinfo
43 /// OSX: sysctl machdep.cpu.features
44 ///
45 /// Additional web resources:
46 /// http://www.codersnotes.com/notes/maths-lib-2016/
47 
48 // clang-format off
49 
50 #pragma once
51 
52 #include <OpenImageIO/dassert.h>
54 #include <OpenImageIO/platform.h>
55 #include <OpenEXR/ImathVec.h>
56 #include <OpenEXR/ImathMatrix.h>
57 #include <algorithm>
58 #include <cstring>
59 
60 
61 //////////////////////////////////////////////////////////////////////////
62 // Sort out which SIMD capabilities we have and set definitions
63 // appropriately. This is mostly for internal (within this file) use,
64 // but client applications using this header may find a few of the macros
65 // we define to be useful:
66 //
67 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD
68 // hardware is available, this will hold the width in number of
69 // float SIMD "lanes" of widest SIMD registers available. For
70 // example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are
71 // hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated,
72 // etc. Using SIMD classes wider than this should work (will be
73 // emulated with narrower SIMD or scalar operations), but is not
74 // expected to have high performance.
75 // OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
76 // specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
77 // higher (including AVX).
78 // OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
79 // specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
80 // OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
81 // OIIO_SIMD_MAX_SIZE_BYTES : holds the width in bytes of the widest SIMD
82 // available (generally will be OIIO_SIMD*4).
83 // OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
84 // OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
85 // OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
86 // OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
87 // OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
88 // OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined
89 
90 #if defined(_WIN32)
91 # include <intrin.h>
92 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
93 # include <x86intrin.h>
94 #elif defined(__GNUC__) && defined(__ARM_NEON__)
95 # include <arm_neon.h>
96 #endif
97 
98 // Disable SSE for 32 bit Windows platforms, it's unreliable and hard for us
99 // to test thoroughly. We presume that anybody needing high performance
100 // badly enough to want SIMD also is on a 64 bit CPU.
101 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
102 #define OIIO_NO_SSE 1
103 #endif
104 
105 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
106 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
107 # define OIIO_SIMD_SSE 4
108  /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
109  * instructions specific to 4.2, but they are all related to string
110  * comparisons and CRCs, which don't currently seem relevant to OIIO,
111  * so for simplicity, we sweep this difference under the rug.
112  */
113 # elif defined(__SSSE3__)
114 # define OIIO_SIMD_SSE 3
115  /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
116  * there are a few older architectures that are SSE3 but not SSSE3,
117  * and this simplification means that these particular old platforms
118  * will only get SSE2 goodness out of our code. So be it. Anybody who
119  * cares about performance is probably using a 64 bit machine that's
120  * SSE 4.x or AVX by now.
121  */
122 # else
123 # define OIIO_SIMD_SSE 2
124 # endif
125 # define OIIO_SIMD 4
126 # define OIIO_SIMD_MAX_SIZE_BYTES 16
127 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
128 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
129 #else
130 # define OIIO_SIMD_SSE 0
131 #endif
132 
// AVX family detection.  When the compiler advertises AVX (and OIIO_NO_AVX
// is not set) this block:
//   * sets OIIO_SIMD_AVX to 1 (AVX), 2 (AVX2) or 512 (AVX-512F),
//   * widens OIIO_SIMD / OIIO_SIMD_MAX_SIZE_BYTES to 8 lanes / 32 bytes, or
//     16 lanes / 64 bytes for AVX-512F, replacing the values chosen by the
//     SSE block,
//   * defines the 8-wide (and, for AVX-512F, 16-wide) alignment macros,
//   * defines each OIIO_AVX512xx_ENABLED sub-feature flag to 1 or 0.
// Without AVX, OIIO_SIMD_AVX and the sub-feature flags are all defined to 0.
133 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
134  // N.B. Any machine with AVX will also have SSE
135 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
136 # define OIIO_SIMD_AVX 2
137 # else
138 # define OIIO_SIMD_AVX 1
139 # endif
140 # undef OIIO_SIMD
141 # define OIIO_SIMD 8
142 # undef OIIO_SIMD_MAX_SIZE_BYTES
143 # define OIIO_SIMD_MAX_SIZE_BYTES 32
144 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
145 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
 // The AVX-512F test below is not gated by OIIO_NO_AVX2; only OIIO_NO_AVX
 // (checked on the enclosing #if) disables it.
146 # if defined(__AVX512F__)
147 # undef OIIO_SIMD_AVX
148 # define OIIO_SIMD_AVX 512
149 # undef OIIO_SIMD_MAX_SIZE_BYTES
150 # define OIIO_SIMD_MAX_SIZE_BYTES 64
151 # undef OIIO_SIMD
152 # define OIIO_SIMD 16
153 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
154 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
 // NOTE(review): unlike the other OIIO_AVX512xx_ENABLED flags, this one is
 // never defined to 0 -- it is left undefined when AVX-512F is absent, so it
 // is only safe to test with #if / #ifdef, not in ordinary expressions.
155 # define OIIO_AVX512F_ENABLED 1
156 # endif
157 # if defined(__AVX512DQ__)
158 # define OIIO_AVX512DQ_ENABLED 1 /* Doubleword and quadword */
159 # else
160 # define OIIO_AVX512DQ_ENABLED 0
161 # endif
162 # if defined(__AVX512PF__)
163 # define OIIO_AVX512PF_ENABLED 1 /* Prefetch */
164 # else
165 # define OIIO_AVX512PF_ENABLED 0
166 # endif
167 # if defined(__AVX512ER__)
168 # define OIIO_AVX512ER_ENABLED 1 /* Exponential & reciprocal */
169 # else
170 # define OIIO_AVX512ER_ENABLED 0
171 # endif
172 # if defined(__AVX512CD__)
173 # define OIIO_AVX512CD_ENABLED 1 /* Conflict detection */
174 # else
175 # define OIIO_AVX512CD_ENABLED 0
176 # endif
177 # if defined(__AVX512BW__)
178 # define OIIO_AVX512BW_ENABLED 1 /* Byte and word */
179 # else
180 # define OIIO_AVX512BW_ENABLED 0
181 # endif
182 # if defined(__AVX512VL__)
183 # define OIIO_AVX512VL_ENABLED 1 /* Vector length extensions */
184 # else
185 # define OIIO_AVX512VL_ENABLED 0
186 # endif
187 #else
 // No AVX at all: every AVX-related flag defined in this block becomes 0
 // (OIIO_AVX512F_ENABLED is not defined here either).
188 # define OIIO_SIMD_AVX 0
189 # define OIIO_AVX512VL_ENABLED 0
190 # define OIIO_AVX512DQ_ENABLED 0
191 # define OIIO_AVX512PF_ENABLED 0
192 # define OIIO_AVX512ER_ENABLED 0
193 # define OIIO_AVX512CD_ENABLED 0
194 # define OIIO_AVX512BW_ENABLED 0
195 #endif
196 
197 #if defined(__FMA__)
198 # define OIIO_FMA_ENABLED 1
199 #else
200 # define OIIO_FMA_ENABLED 0
201 #endif
202 #if defined(__AVX512IFMA__)
203 # define OIIO_AVX512IFMA_ENABLED 1
204 #else
205 # define OIIO_AVX512IFMA_ENABLED 0
206 #endif
207 
208 #if defined(__F16C__)
209 # define OIIO_F16C_ENABLED 1
210 #else
211 # define OIIO_F16C_ENABLED 0
212 #endif
213 
214 // FIXME Future: support ARM Neon
215 // Uncomment this when somebody with Neon can verify it works
216 #if 0 && defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
217 # define OIIO_SIMD 4
218 # define OIIO_SIMD_NEON 1
219 # define OIIO_SIMD_MAX_SIZE_BYTES 16
220 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
221 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
222 #else
223 # define OIIO_SIMD_NEON 0
224 #endif
225 
226 #ifndef OIIO_SIMD
227  // No SIMD available
228 # define OIIO_SIMD 0
229 # define OIIO_SIMD4_ALIGN
230 # define OIIO_SIMD_MAX_SIZE_BYTES 16
231 #endif
232 
233 #ifndef OIIO_SIMD8_ALIGN
234 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
235 #endif
236 #ifndef OIIO_SIMD16_ALIGN
237 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
238 #endif
239 
240 
241 // General features that client apps may want to test for, for conditional
242 // compilation. Will add to this over time as needed. Note that just
243 // because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
244 // the vfloat8 class (and friends) are in this version of simd.h, but that's
245 // different from OIIO_SIMD >= 8, which means it's supported in hardware.
246 #define OIIO_SIMD_HAS_MATRIX4 1 /* matrix44 defined */
247 #define OIIO_SIMD_HAS_FLOAT8 1 /* DEPRECATED(1.8) */
248 #define OIIO_SIMD_HAS_SIMD8 1 /* vfloat8, vint8, vbool8 defined */
249 #define OIIO_SIMD_HAS_SIMD16 1 /* vfloat16, vint16, vbool16 defined */
250 
251 
252 #include "missing_math.h"
253 
254 
255 // Embarrassing hack: Xlib.h #define's True and False!
256 #ifdef True
257 # undef True
258 #endif
259 #ifdef False
260 # undef False
261 #endif
262 
263 
264 
266 
267 namespace simd {
268 
269 //////////////////////////////////////////////////////////////////////////
270 // Forward declarations of our main SIMD classes
271 
272 class vbool4;
273 class vint4;
274 class vfloat4;
275 class vfloat3;
276 class matrix44;
277 class vbool8;
278 class vint8;
279 class vfloat8;
280 class vbool16;
281 class vint16;
282 class vfloat16;
283 
284 // Deprecated names -- remove these in 1.9
285 typedef vbool4 mask4; // old name
286 typedef vbool4 bool4;
287 typedef vbool8 bool8;
288 typedef vint4 int4;
289 typedef vint8 int8;
290 typedef vfloat3 float3;
291 typedef vfloat4 float4;
292 typedef vfloat8 float8;
293 
294 
295 
296 //////////////////////////////////////////////////////////////////////////
297 // Template magic to determine the raw SIMD types involved, and other
298 // things helpful for metaprogramming.
299 
300 template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
301 template <int N> struct simd_bool_t { struct type { int val[N]; }; };
302 
303 #if OIIO_SIMD_SSE
304 template<> struct simd_raw_t<int,4> { typedef __m128i type; };
305 template<> struct simd_raw_t<float,4> { typedef __m128 type; };
306 template<> struct simd_bool_t<4> { typedef __m128 type; };
307 #endif
308 
309 #if OIIO_SIMD_AVX
310 template<> struct simd_raw_t<int,8> { typedef __m256i type; };
311 template<> struct simd_raw_t<float,8> { typedef __m256 type; };
312 template<> struct simd_bool_t<8> { typedef __m256 type; };
313 #endif
314 
// Raw types for 16-wide SIMD.  With AVX-512 the int/float vectors map to
// __m512i/__m512 and the bool mask to __mmask16.  Otherwise only the bool
// type is specialized here; 16-wide int/float use the generic simd_raw_t
// primary template (a struct wrapping T val[N]).
315 #if OIIO_SIMD_AVX >= 512
316 template<> struct simd_raw_t<int,16> { typedef __m512i type; };
317 template<> struct simd_raw_t<float,16> { typedef __m512 type; };
318 template<> struct simd_bool_t<16> { typedef __mmask16 type; };
319 #else
320 // Note: change in strategy for 16-wide SIMD: instead of int[16] for
321 // vbool16, it's just a plain old bitmask, and __mmask16 for actual HW.
322 template<> struct simd_bool_t<16> { typedef uint16_t type; };
323 #endif
324 
#if OIIO_SIMD_NEON
// Raw types used when ARM NEON is the active 4-wide backend.  The NEON
// 128-bit vector of four 32-bit ints is spelled int32x4_t in <arm_neon.h>
// (there is no type named int32x4), so both the int vector and the bool
// mask use int32x4_t; the float vector uses float32x4_t.
template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
template<> struct simd_bool_t<4> { typedef int32x4_t type; };
#endif
330 
331 
332 /// Template to retrieve the vector type from the scalar. For example,
333 /// simd::VecType<int,4>::type will be vint4.
/// The primary template is empty, so a (T, elements) pair that has no
/// specialization below simply has no nested `type`.  A width of 1 maps
/// int and float to the plain scalar type.
334 template<typename T,int elements> struct VecType {};
335 template<> struct VecType<int,1> { typedef int type; };
336 template<> struct VecType<float,1> { typedef float type; };
337 template<> struct VecType<int,4> { typedef vint4 type; };
338 template<> struct VecType<float,4> { typedef vfloat4 type; };
339 template<> struct VecType<float,3> { typedef vfloat3 type; };
340 template<> struct VecType<bool,4> { typedef vbool4 type; };
341 template<> struct VecType<int,8> { typedef vint8 type; };
342 template<> struct VecType<float,8> { typedef vfloat8 type; };
343 template<> struct VecType<bool,8> { typedef vbool8 type; };
344 template<> struct VecType<int,16> { typedef vint16 type; };
345 template<> struct VecType<float,16> { typedef vfloat16 type; };
346 template<> struct VecType<bool,16> { typedef vbool16 type; };
347 
348 /// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
349 /// anything but our SIMD types.
350 template<typename T> struct SimdSize { static const int size = 1; };
351 template<> struct SimdSize<vint4> { static const int size = 4; };
352 template<> struct SimdSize<vfloat4> { static const int size = 4; };
353 template<> struct SimdSize<vfloat3> { static const int size = 4; };
354 template<> struct SimdSize<vbool4> { static const int size = 4; };
355 template<> struct SimdSize<vint8> { static const int size = 8; };
356 template<> struct SimdSize<vfloat8> { static const int size = 8; };
357 template<> struct SimdSize<vbool8> { static const int size = 8; };
358 template<> struct SimdSize<vint16> { static const int size = 16; };
359 template<> struct SimdSize<vfloat16> { static const int size = 16; };
360 template<> struct SimdSize<vbool16> { static const int size = 16; };
361 
362 /// Template to retrieve the number of elements size of a SIMD type. Rigged
363 /// to be 1 for anything but our SIMD types.
364 template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
365 template<> struct SimdElements<vfloat3> { static const int size = 3; };
366 
367 /// Template giving a printable name for each type
368 template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
369 template<> struct SimdTypeName<vfloat4> { static const char *name() { return "vfloat4"; } };
370 template<> struct SimdTypeName<vint4> { static const char *name() { return "vint4"; } };
371 template<> struct SimdTypeName<vbool4> { static const char *name() { return "vbool4"; } };
372 template<> struct SimdTypeName<vfloat8> { static const char *name() { return "vfloat8"; } };
373 template<> struct SimdTypeName<vint8> { static const char *name() { return "vint8"; } };
374 template<> struct SimdTypeName<vbool8> { static const char *name() { return "vbool8"; } };
375 template<> struct SimdTypeName<vfloat16> { static const char *name() { return "vfloat16"; } };
376 template<> struct SimdTypeName<vint16> { static const char *name() { return "vint16"; } };
377 template<> struct SimdTypeName<vbool16> { static const char *name() { return "vbool16"; } };
378 
379 
380 //////////////////////////////////////////////////////////////////////////
381 // Macros helpful for making static constants in code.
382 
// Static-constant helpers for 4-wide SIMD data.  Every macro declares `name`
// as a `static const` 4-element array qualified with OIIO_SIMD4_ALIGN.
//   *_CONST4(name,v0..v3) : one initializer per slot.
//   *_CONST(name,val)     : the same value in every slot; implemented by
//                           forwarding to *_CONST4, so `val` is expanded
//                           (and evaluated) once per slot.
// The extra parentheses around `val` keep an argument containing a comma
// together as a single expression when it is forwarded.
#define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
#define OIIO_SIMD_FLOAT4_CONST(name,val) \
    OIIO_SIMD_FLOAT4_CONST4(name, (val), (val), (val), (val))

#define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
#define OIIO_SIMD_INT4_CONST(name,val) \
    OIIO_SIMD_INT4_CONST4(name, (val), (val), (val), (val))

#define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
#define OIIO_SIMD_UINT4_CONST(name,val) \
    OIIO_SIMD_UINT4_CONST4(name, (val), (val), (val), (val))
395 
// Static-constant helpers for 8-wide SIMD data.  Every macro declares `name`
// as a `static const` 8-element array qualified with OIIO_SIMD8_ALIGN.
//   *_CONST8(name,v0..v7) : one initializer per slot.
//   *_CONST(name,val)     : the same value in every slot; implemented by
//                           forwarding to *_CONST8, so `val` is expanded
//                           (and evaluated) once per slot.
#define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN float name[8] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7) }
#define OIIO_SIMD_FLOAT8_CONST(name,val) \
    OIIO_SIMD_FLOAT8_CONST8(name, (val), (val), (val), (val), \
                                  (val), (val), (val), (val))

#define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN int name[8] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7) }
#define OIIO_SIMD_INT8_CONST(name,val) \
    OIIO_SIMD_INT8_CONST8(name, (val), (val), (val), (val), \
                                (val), (val), (val), (val))

#define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7) }
#define OIIO_SIMD_UINT8_CONST(name,val) \
    OIIO_SIMD_UINT8_CONST8(name, (val), (val), (val), (val), \
                                 (val), (val), (val), (val))
414 
// Static-constant helpers for 16-wide float and int SIMD data.  Every macro
// declares `name` as a `static const` 16-element array qualified with
// OIIO_SIMD16_ALIGN.
//   *_CONST16(name,v0..v15) : one initializer per slot.
//   *_CONST(name,val)       : the same value in every slot; implemented by
//                             forwarding to *_CONST16, so `val` is expanded
//                             (and evaluated) once per slot.
// Note the float variants are spelled VFLOAT16 (not FLOAT16).
#define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
#define OIIO_SIMD_VFLOAT16_CONST(name,val) \
    OIIO_SIMD_VFLOAT16_CONST16(name, \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val))

#define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
#define OIIO_SIMD_INT16_CONST(name,val) \
    OIIO_SIMD_INT16_CONST16(name, \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val))
// Static-constant helpers for 16-wide unsigned SIMD data.  Both macros
// declare `name` as a `static const` uint32_t[16] qualified with
// OIIO_SIMD16_ALIGN.
//   OIIO_SIMD_UINT16_CONST16(name,v0..v15) : one initializer per slot.
//   OIIO_SIMD_UINT16_CONST(name,val)       : `val` in every slot (expanded
//                                            and evaluated once per slot).
// The per-slot form previously initialized every element from an identifier
// `val` that is not one of its parameters, so v0..v15 were ignored and the
// expansion only compiled if a `val` happened to be in scope; it now uses
// its own arguments.
#define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
#define OIIO_SIMD_UINT16_CONST(name,val) \
    OIIO_SIMD_UINT16_CONST16(name, \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val))
439 
440 
441 //////////////////////////////////////////////////////////////////////////
442 // Some macros just for use in this file (#undef-ed at the end) making
443 // it more succinct to express per-element operations.
444 
// All of these expand to bare `for` loops (plus, for some, extra statements)
// with no enclosing braces or do/while(0), and they use the loop variable
// name `i`.  The expression passed in is evaluated once per element and can
// refer to `i`.  Because several expand to more than one statement, do not
// use them as the body of an unbraced if/else/for.
//
// SIMD_DO(x)            : run statement x for i in [0, elements).
// SIMD_CONSTRUCT(x)     : m_val[i] = (x) for i in [0, elements).
// SIMD_CONSTRUCT_PAD(x) : as SIMD_CONSTRUCT, then zero m_val[i] for
//                         i in [elements, paddedelements).
// SIMD_RETURN(T,x)      : declare `T r`, set r[i] = (x) for i in
//                         [0, r.elements), then `return r`.
// SIMD_RETURN_REDUCE(T,init,op) : declare `T r = init`, run statement op for
//                         i in [0, v.elements), then `return r`.
//
// `elements`, `paddedelements` and `m_val` must be visible where
// SIMD_DO / SIMD_CONSTRUCT / SIMD_CONSTRUCT_PAD are used, and a variable
// named `v` must be in scope for SIMD_RETURN_REDUCE.
445 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
446 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
447 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
448  for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
449 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
450 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
451 
452 
453 
454 //////////////////////////////////////////////////////////////////////////
455 //////////////////////////////////////////////////////////////////////////
456 // The public declarations of the main SIMD classes follow: boolN, intN,
457 // floatN, matrix44.
458 //
459 // These class declarations are intended to be brief and self-documenting,
460 // and give all the information that users or client applications need to
461 // know to use these classes.
462 //
463 // No implementations are given inline except for the briefest, completely
464 // generic methods that don't have any architecture-specific overloads.
465 // After the class defintions, there will be an immense pile of full
466 // implementation definitions, which casual users are not expected to
467 // understand.
468 //////////////////////////////////////////////////////////////////////////
469 //////////////////////////////////////////////////////////////////////////
470 
471 
472 /// vbool4: A 4-vector whose elements act mostly like bools, accelerated by
473 /// SIMD instructions when available. This is what is naturally produced by
474 /// SIMD comparison operators on the vfloat4 and vint4 types.
475 class vbool4 {
476 public:
477  static const char* type_name() { return "vbool4"; }
478  typedef bool value_t; ///< Underlying equivalent scalar value type
479  enum { elements = 4 }; ///< Number of scalar elements
480  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
481  enum { bits = elements*32 }; ///< Total number of bits
482  typedef simd_bool_t<4>::type simd_t; ///< the native SIMD type used
483 
484  /// Default constructor (contents undefined)
485  vbool4 () { }
486 
487  /// Construct from a single value (store it in all slots)
488  vbool4 (bool a) { load(a); }
489 
  /// Construct from a pointer to bool values.
  /// NOTE(review): presumably reads `elements` bools starting at `a`;
  /// the definition is not in this part of the file -- confirm.
490  explicit vbool4 (const bool *a);
491 
492  /// Construct from 4 values
493  vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
494 
495  /// Copy construct from another vbool4
496  vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
497 
498  /// Construct from a SIMD int (is each element nonzero?)
499  vbool4 (const vint4 &i);
500 
501  /// Construct from the underlying SIMD type
502  vbool4 (const simd_t& m) : m_simd(m) { }
503 
504  /// Return the raw SIMD type
505  operator simd_t () const { return m_simd; }
506  simd_t simd () const { return m_simd; }
507 
508  /// Extract the bitmask
509  int bitmask () const;
510 
511  /// Convert from integer bitmask to a true vbool4
512  static vbool4 from_bitmask (int bitmask);
513 
514  /// Set all components to false
515  void clear ();
516 
517  /// Return a vbool4 that is 'false' for all values
518  static const vbool4 False ();
519 
520  /// Return a vbool4 that is 'true' for all values
521  static const vbool4 True ();
522 
523  /// Assign one value to all components
524  const vbool4 & operator= (bool a) { load(a); return *this; }
525 
526  /// Assignment of another vbool4
527  const vbool4 & operator= (const vbool4 & other);
528 
529  /// Component access (get). Note the result type is int, not bool.
530  int operator[] (int i) const;
531 
532  /// Component access (set).
533  void setcomp (int i, bool value);
534 
535  /// Component access (set).
536  /// NOTE: avoid this unsafe construct. It will go away some day.
537  int& operator[] (int i);
538 
539  /// Helper: load a single value into all components.
540  void load (bool a);
541 
542  /// Helper: load separate values into each component.
543  void load (bool a, bool b, bool c, bool d);
544 
545  /// Helper: store the values into memory as bools.
546  void store (bool *values) const;
547 
548  /// Store the first n values into memory.
549  void store (bool *values, int n) const;
550 
551  /// Logical/bitwise operators, component-by-component
552  friend vbool4 operator! (const vbool4& a);
553  friend vbool4 operator& (const vbool4& a, const vbool4& b);
554  friend vbool4 operator| (const vbool4& a, const vbool4& b);
555  friend vbool4 operator^ (const vbool4& a, const vbool4& b);
556  friend vbool4 operator~ (const vbool4& a);
557  friend const vbool4& operator&= (vbool4& a, const vbool4& b);
558  friend const vbool4& operator|= (vbool4& a, const vbool4& b);
559  friend const vbool4& operator^= (vbool4& a, const vbool4& b);
560 
561  /// Comparison operators, component by component
562  friend vbool4 operator== (const vbool4& a, const vbool4& b);
563  friend vbool4 operator!= (const vbool4& a, const vbool4& b);
564 
565  /// Stream output
566  friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);
567 
568 private:
569  // The actual data representation
  // NOTE(review): the union's member declarations are absent from this
  // listing (its numbering jumps from 570 to 573).  The inline members
  // above read and write `m_simd` as a `simd_t`, so at least that member
  // belongs here -- restore the body from the original header.
570  union {
573  };
574 };
575 
576 
577 
578 /// Helper: shuffle/swizzle with constant (templated) indices.
579 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
580 template<int i0, int i1, int i2, int i3> vbool4 shuffle (const vbool4& a);
581 
582 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
583 template<int i> vbool4 shuffle (const vbool4& a);
584 
585 /// Helper: as rapid as possible extraction of one component, when the
586 /// index is fixed.
587 template<int i> bool extract (const vbool4& a);
588 
589 /// Helper: substitute val for a[i]
590 template<int i> vbool4 insert (const vbool4& a, bool val);
591 
592 /// Logical reduction across all components.
593 bool reduce_and (const vbool4& v);
594 bool reduce_or (const vbool4& v);
595 
596 // Are all/any/no components true?
597 bool all (const vbool4& v);
598 bool any (const vbool4& v);
599 bool none (const vbool4& v);
600 
601 // It's handy to have this defined for regular bool as well
602 inline bool all (bool v) { return v; }
603 
604 
605 
606 /// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
607 /// SIMD instructions when available. This is what is naturally produced by
608 /// SIMD comparison operators on the vfloat8 and vint8 types.
609 class vbool8 {
610 public:
611  static const char* type_name() { return "vbool8"; }
612  typedef bool value_t; ///< Underlying equivalent scalar value type
613  enum { elements = 8 }; ///< Number of scalar elements
614  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
615  enum { bits = elements*32 }; ///< Total number of bits
616  typedef simd_bool_t<8>::type simd_t; ///< the native SIMD type used
617 
618  /// Default constructor (contents undefined)
619  vbool8 () { }
620 
621  /// Construct from a single value (store it in all slots)
622  vbool8 (bool a) { load (a); }
623 
  /// Construct from a pointer to bool values.
  /// NOTE(review): presumably reads `elements` bools starting at `values`;
  /// the definition is not in this part of the file -- confirm.
624  explicit vbool8 (const bool *values);
625 
626  /// Construct from 8 values
627  vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
628 
629  /// Copy construct from another vbool8
630  vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
631 
632  /// Construct from a SIMD int (is each element nonzero?)
633  vbool8 (const vint8 &i);
634 
635  /// Construct from two vbool4's
636  vbool8 (const vbool4 &lo, const vbool4 &hi);
637 
638  /// Construct from the underlying SIMD type
639  vbool8 (const simd_t& m) : m_simd(m) { }
640 
641  /// Return the raw SIMD type
642  operator simd_t () const { return m_simd; }
643  simd_t simd () const { return m_simd; }
644 
645  /// Extract the bitmask
646  int bitmask () const;
647 
648  /// Convert from integer bitmask to a true vbool8
649  static vbool8 from_bitmask (int bitmask);
650 
651  /// Set all components to false
652  void clear ();
653 
654  /// Return a vbool8 that is 'false' for all values
655  static const vbool8 False ();
656 
657  /// Return a vbool8 that is 'true' for all values
658  static const vbool8 True ();
659 
660  /// Assign one value to all components
661  const vbool8 & operator= (bool a);
662 
663  /// Assignment of another vbool8
664  const vbool8 & operator= (const vbool8 & other);
665 
666  /// Component access (get). Note the result type is int, not bool.
667  int operator[] (int i) const;
668 
669  /// Component access (set).
670  void setcomp (int i, bool value);
671 
672  /// Component access (set).
673  /// NOTE: avoid this unsafe construct. It will go away some day.
674  int& operator[] (int i);
675 
676  /// Extract the lower half as a vbool4 (presumably elements 0-3, the
  /// counterpart of the `lo` constructor argument -- confirm).
677  vbool4 lo () const;
678 
679  /// Extract the upper half as a vbool4 (presumably elements 4-7, the
  /// counterpart of the `hi` constructor argument -- confirm).
680  vbool4 hi () const;
681 
682  /// Helper: load a single value into all components.
683  void load (bool a);
684 
685  /// Helper: load separate values into each component.
686  void load (bool a, bool b, bool c, bool d,
687  bool e, bool f, bool g, bool h);
688 
689  /// Helper: store the values into memory as bools.
690  void store (bool *values) const;
691 
692  /// Store the first n values into memory.
693  void store (bool *values, int n) const;
694 
695  /// Logical/bitwise operators, component-by-component
696  friend vbool8 operator! (const vbool8& a);
697  friend vbool8 operator& (const vbool8& a, const vbool8& b);
698  friend vbool8 operator| (const vbool8& a, const vbool8& b);
699  friend vbool8 operator^ (const vbool8& a, const vbool8& b);
700  friend vbool8 operator~ (const vbool8& a);
701  friend const vbool8& operator&= (vbool8& a, const vbool8& b);
702  friend const vbool8& operator|= (vbool8& a, const vbool8& b);
703  friend const vbool8& operator^= (vbool8& a, const vbool8& b);
704 
705  /// Comparison operators, component by component
706  friend vbool8 operator== (const vbool8& a, const vbool8& b);
707  friend vbool8 operator!= (const vbool8& a, const vbool8& b);
708 
709  /// Stream output
710  friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);
711 
712 private:
713  // The actual data representation
  // NOTE(review): the union's member declarations are absent from this
  // listing (its numbering jumps from 714 to 718).  The inline members
  // above read and write `m_simd` as a `simd_t`, so at least that member
  // belongs here -- restore the body from the original header.
714  union {
718  };
719 };
720 
721 
722 
723 /// Helper: shuffle/swizzle with constant (templated) indices.
724 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
725 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
726 vbool8 shuffle (const vbool8& a);
727 
728 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
729 template<int i> vbool8 shuffle (const vbool8& a);
730 
731 /// Helper: as rapid as possible extraction of one component, when the
732 /// index is fixed.
733 template<int i> bool extract (const vbool8& a);
734 
735 /// Helper: substitute val for a[i]
736 template<int i> vbool8 insert (const vbool8& a, bool val);
737 
738 /// Logical reduction across all components.
739 bool reduce_and (const vbool8& v);
740 bool reduce_or (const vbool8& v);
741 
742 // Are all/any/no components true?
743 bool all (const vbool8& v);
744 bool any (const vbool8& v);
745 bool none (const vbool8& v);
746 
747 
748 
749 
750 /// vbool16: An 16-vector whose elements act mostly like bools, accelerated
751 /// by SIMD instructions when available. This is what is naturally produced
752 /// by SIMD comparison operators on the vfloat16 and vint16 types.
753 class vbool16 {
754 public:
755  static const char* type_name() { return "vbool16"; }
756  typedef bool value_t; ///< Underlying equivalent scalar value type
757  enum { elements = 16 }; ///< Number of scalar elements
758  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
759  enum { bits = 16 }; ///< Total number of bits
760  typedef simd_bool_t<16>::type simd_t; ///< the native SIMD type used
761 
762  /// Default constructor (contents undefined)
763  vbool16 () { }
764 
765  /// Construct from a single value (store it in all slots)
766  vbool16 (bool a) { load (a); }
767 
768  explicit vbool16 (int bitmask) { load_bitmask (bitmask); }
769 
770  explicit vbool16 (const bool *values);
771 
772  /// Construct from 16 values
773  vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
774  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
775 
776  /// Copy construct from another vbool16
777  vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
778 
779  /// Construct from a SIMD int (is each element nonzero?)
780  vbool16 (const vint16 &i);
781 
782  /// Construct from two vbool8's
783  vbool16 (const vbool8 &lo, const vbool8 &hi);
784 
785  /// Construct from four vbool4's
786  vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);
787 
788  /// Construct from the underlying SIMD type
789  vbool16 (const simd_t& m) : m_simd(m) { }
790 
791  /// Return the raw SIMD type
792  operator simd_t () const { return m_simd; }
793  simd_t simd () const { return m_simd; }
794 
795  int bitmask () const;
796 
797  /// Convert from integer bitmask to a true vbool16
798  static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }
799 
800  /// Set all components to false
801  void clear ();
802 
803  /// Return a vbool16 the is 'false' for all values
804  static const vbool16 False ();
805 
806  /// Return a vbool16 the is 'true' for all values
807  static const vbool16 True ();
808 
809  /// Assign one value to all components
810  const vbool16 & operator= (bool a);
811 
812  /// Assignment of another vbool16
813  const vbool16 & operator= (const vbool16 & other);
814 
815  /// Component access (get)
816  int operator[] (int i) const;
817 
818  /// Component access (set).
819  void setcomp (int i, bool value);
820 
821  /// Extract the lower percision vbool8
822  vbool8 lo () const;
823 
824  /// Extract the higher percision vbool8
825  vbool8 hi () const;
826 
827  /// Helper: load a single value into all components.
828  void load (bool a);
829 
830  /// Helper: load separate values into each component.
831  void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
832  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
833 
834  /// Helper: load all components from a bitmask in an int.
835  void load_bitmask (int a);
836 
837  /// Helper: store the values into memory as bools.
838  void store (bool *values) const;
839 
840  /// Store the first n values into memory.
841  void store (bool *values, int n) const;
842 
843  /// Logical/bitwise operators, component-by-component
844  friend vbool4 operator! (const vbool4& a);
845  friend vbool16 operator! (const vbool16& a);
846  friend vbool16 operator& (const vbool16& a, const vbool16& b);
847  friend vbool16 operator| (const vbool16& a, const vbool16& b);
848  friend vbool16 operator^ (const vbool16& a, const vbool16& b);
849  friend vbool16 operator~ (const vbool16& a);
850  friend const vbool16& operator&= (vbool16& a, const vbool16& b);
851  friend const vbool16& operator|= (vbool16& a, const vbool16& b);
852  friend const vbool16& operator^= (vbool16& a, const vbool16& b);
853 
854  /// Comparison operators, component by component
855  friend vbool16 operator== (const vbool16& a, const vbool16& b);
856  friend vbool16 operator!= (const vbool16& a, const vbool16& b);
857 
858  /// Stream output
859  friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);
860 
861 private:
862  // The actual data representation
863  union {
865  uint16_t m_bits;
866  };
867 };
868 
869 
870 
871 /// Helper: extract one component as quickly as possible when the index
872 /// is a compile-time constant (the template argument i).
873 template<int i> bool extract (const vbool16& a);
874 
875 /// Helper: substitute val for a[i]; the result is returned as a new vbool16.
876 template<int i> vbool16 insert (const vbool16& a, bool val);
877 
878 /// Logical reduction across all components, yielding a single scalar bool.
879 bool reduce_and (const vbool16& v);
880 bool reduce_or (const vbool16& v);
881 
882 // Are all/any/no components true? Each returns one scalar bool.
883 bool all (const vbool16& v);
884 bool any (const vbool16& v);
885 bool none (const vbool16& v);
886 
887 
888 
889 
890 
891 /// Integer 4-vector, accelerated by SIMD instructions when available.
892 class vint4 {
893 public:
894  static const char* type_name() { return "vint4"; }
895  typedef int value_t; ///< Underlying equivalent scalar value type
896  enum { elements = 4 }; ///< Number of scalar elements
897  enum { paddedelements =4 }; ///< Number of scalar elements for full pad
898  enum { bits = 128 }; ///< Total number of bits
899  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
900  typedef vbool4 vbool_t; ///< bool type of the same length
901  typedef vfloat4 vfloat_t; ///< float type of the same length
902  typedef vint4 vint_t; ///< int type of the same length
903  typedef vbool4 bool_t; // old name (deprecated 1.8)
904  typedef vfloat4 float_t; // old name (deprecated 1.8)
905 
906  /// Default constructor (contents undefined)
907  vint4 () { }
908 
909  /// Construct from a single value (store it in all slots)
910  vint4 (int a);
911 
912  /// Construct from 2 values -- (a,a,b,b)
913  vint4 (int a, int b);
914 
915  /// Construct from 4 values
916  vint4 (int a, int b, int c, int d);
917 
918  /// Construct from a pointer to values
919  vint4 (const int *vals);
920 
921  /// Construct from a pointer to unsigned short values
922  explicit vint4 (const unsigned short *vals);
923 
924  /// Construct from a pointer to signed short values
925  explicit vint4 (const short *vals);
926 
927  /// Construct from a pointer to unsigned char values (0 - 255)
928  explicit vint4 (const unsigned char *vals);
929 
930  /// Construct from a pointer to signed char values (-128 - 127)
931  explicit vint4 (const char *vals);
932 
933  /// Copy construct from another vint4
934  vint4 (const vint4 & other) { m_simd = other.m_simd; }
935 
936  /// Convert a vfloat to an vint. Equivalent to i = (int)f;
937  explicit vint4 (const vfloat4& f); // implementation below
938 
939  /// Construct from the underlying SIMD type
940  vint4 (const simd_t& m) : m_simd(m) { }
941 
942  /// Return the raw SIMD type
943  operator simd_t () const { return m_simd; }
944  simd_t simd () const { return m_simd; }
945 
946  /// Return a pointer to the underlying scalar type
947  const value_t* data () const { return (const value_t*)this; }
948  value_t* data () { return (value_t*)this; }
949 
950  /// Sset all components to 0
951  void clear () ;
952 
953  /// Return an vint4 with all components set to 0
954  static const vint4 Zero ();
955 
956  /// Return an vint4 with all components set to 1
957  static const vint4 One ();
958 
959  /// Return an vint4 with all components set to -1 (aka 0xffffffff)
960  static const vint4 NegOne ();
961 
962  /// Return an vint4 with incremented components (e.g., 0,1,2,3).
963  /// Optional arguments can give a non-zero starting point and step size.
964  static const vint4 Iota (int start=0, int step=1);
965 
966  /// Return an vint4 with "geometric" iota: (1, 2, 4, 8).
967  static const vint4 Giota ();
968 
969  /// Assign one value to all components.
970  const vint4 & operator= (int a);
971 
972  /// Assignment from another vint4
973  const vint4 & operator= (const vint4& other) ;
974 
975  /// Component access (get)
976  int operator[] (int i) const;
977 
978  /// Component access (set)
979  int& operator[] (int i);
980 
981  /// Component access (set).
982  void setcomp (int i, int value);
983 
984  value_t x () const;
985  value_t y () const;
986  value_t z () const;
987  value_t w () const;
988  void set_x (value_t val);
989  void set_y (value_t val);
990  void set_z (value_t val);
991  void set_w (value_t val);
992 
993  /// Helper: load a single int into all components
994  void load (int a);
995 
996  /// Helper: load separate values into each component.
997  void load (int a, int b, int c, int d);
998 
999  /// Load from an array of 4 values
1000  void load (const int *values);
1001 
1002  void load (const int *values, int n) ;
1003 
1004  /// Load from an array of 4 unsigned short values, convert to vint4
1005  void load (const unsigned short *values) ;
1006 
1007  /// Load from an array of 4 unsigned short values, convert to vint4
1008  void load (const short *values);
1009 
1010  /// Load from an array of 4 unsigned char values, convert to vint4
1011  void load (const unsigned char *values);
1012 
1013  /// Load from an array of 4 unsigned char values, convert to vint4
1014  void load (const char *values);
1015 
1016  /// Store the values into memory
1017  void store (int *values) const;
1018 
1019  /// Store the first n values into memory
1020  void store (int *values, int n) const;
1021 
1022  /// Store the least significant 16 bits of each element into adjacent
1023  /// unsigned shorts.
1024  void store (unsigned short *values) const;
1025 
1026  /// Store the least significant 8 bits of each element into adjacent
1027  /// unsigned chars.
1028  void store (unsigned char *values) const;
1029 
1030  /// Masked load -- read from values[] where mask is 1, load zero where
1031  /// mask is 0.
1032  void load_mask (int mask, const value_t *values);
1033  void load_mask (const vbool_t& mask, const value_t *values);
1034 
1035  /// Masked store -- write to values[] where mask is enabled, don't
1036  /// touch values[] where it's not.
1037  void store_mask (int mask, value_t *values) const;
1038  void store_mask (const vbool_t& mask, value_t *values) const;
1039 
1040  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1041  template<int scale=4>
1042  void gather (const value_t *baseptr, const vint_t& vindex);
1043  /// Gather elements defined by the mask, leave others unchanged.
1044  template<int scale=4>
1045  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1046  template<int scale=4>
1047  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1048 
1049  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1050  template<int scale=4>
1051  void scatter (value_t *baseptr, const vint_t& vindex) const;
1052  /// Scatter elements defined by the mask
1053  template<int scale=4>
1054  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1055  template<int scale=4>
1056  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1057 
1058  // Arithmetic operators (component-by-component)
1059  friend vint4 operator+ (const vint4& a, const vint4& b);
1060  friend vint4 operator- (const vint4& a);
1061  friend vint4 operator- (const vint4& a, const vint4& b);
1062  friend vint4 operator* (const vint4& a, const vint4& b);
1063  friend vint4 operator/ (const vint4& a, const vint4& b);
1064  friend vint4 operator% (const vint4& a, const vint4& b);
1065  friend const vint4 & operator+= (vint4& a, const vint4& b);
1066  friend const vint4 & operator-= (vint4& a, const vint4& b);
1067  friend const vint4 & operator*= (vint4& a, const vint4& b);
1068  friend const vint4 & operator/= (vint4& a, const vint4& b);
1069  friend const vint4 & operator%= (vint4& a, const vint4& b);
1070  // Bitwise operators (component-by-component)
1071  friend vint4 operator& (const vint4& a, const vint4& b);
1072  friend vint4 operator| (const vint4& a, const vint4& b);
1073  friend vint4 operator^ (const vint4& a, const vint4& b);
1074  friend const vint4& operator&= (vint4& a, const vint4& b);
1075  friend const vint4& operator|= (vint4& a, const vint4& b);
1076  friend const vint4& operator^= (vint4& a, const vint4& b);
1077  friend vint4 operator~ (const vint4& a);
1078  friend vint4 operator<< (const vint4& a, unsigned int bits);
1079  friend vint4 operator>> (const vint4& a, unsigned int bits);
1080  friend const vint4& operator<<= (vint4& a, unsigned int bits);
1081  friend const vint4& operator>>= (vint4& a, unsigned int bits);
1082  // Comparison operators (component-by-component)
1083  friend vbool4 operator== (const vint4& a, const vint4& b);
1084  friend vbool4 operator!= (const vint4& a, const vint4& b);
1085  friend vbool4 operator< (const vint4& a, const vint4& b);
1086  friend vbool4 operator> (const vint4& a, const vint4& b);
1087  friend vbool4 operator>= (const vint4& a, const vint4& b);
1088  friend vbool4 operator<= (const vint4& a, const vint4& b);
1089 
1090  /// Stream output
1091  friend std::ostream& operator<< (std::ostream& cout, const vint4 & a);
1092 
1093 private:
1094  // The actual data representation
1095  union {
1098  };
1099 };
1100 
1101 
1102 
1103 // Shift right logical -- unsigned shift. This differs from operator>>
1104 // in how it handles the sign bit: after (1<<31) >> 1 the sign bit is
1105 // still set, but srl((1<<31),1) == 1<<30.
1106 vint4 srl (const vint4& val, const unsigned int bits);
1107 
1108 /// Helper: shuffle/swizzle with constant (templated) indices.
1109 /// Example: shuffle<1,1,2,2>(vint4(a,b,c,d)) returns (b,b,c,c)
1110 template<int i0, int i1, int i2, int i3> vint4 shuffle (const vint4& a);
1111 
1112 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1113 template<int i> vint4 shuffle (const vint4& a);
1114 
1115 /// Helper: extract one component as quickly as possible when the index
1116 /// is a compile-time constant (the template argument i).
1117 template<int i> int extract (const vint4& v);
1118 
1119 /// The sum of all components, returned in all components.
1120 vint4 vreduce_add (const vint4& v);
1121 
1122 // Reduction across all components, each yielding a single scalar int
1123 int reduce_add (const vint4& v);
1124 int reduce_and (const vint4& v);
1125 int reduce_or (const vint4& v);
1126 
1127 /// Use a bool mask to select between components of a (if mask[i] is false)
1128 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1129 vint4 blend (const vint4& a, const vint4& b, const vbool4& mask);
1130 
1131 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1132 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1133 /// blend(0,a,mask).
1134 vint4 blend0 (const vint4& a, const vbool4& mask);
1135 
1136 /// Use a bool mask to select between components of a (if mask[i] is false)
1137 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1138 /// blend(0,a,!mask), or blend(a,0,mask).
1139 vint4 blend0not (const vint4& a, const vbool4& mask);
1140 
1141 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1142 /// synonym for blend with arguments rearranged, but this is clearer
1143 /// because the arguments are symmetric to scalar (cond ? a : b).
1144 vint4 select (const vbool4& mask, const vint4& a, const vint4& b);
1145 
1146 // Per-element math
1147 vint4 abs (const vint4& a);
1148 vint4 min (const vint4& a, const vint4& b);
1149 vint4 max (const vint4& a, const vint4& b);
1150 
1151 // Circular bit rotate by k bits, applied to all 4 values at once.
1152 vint4 rotl32 (const vint4& x, const unsigned int k);
1153 
1154 /// andnot(a,b) returns ((~a) & b)
1155 vint4 andnot (const vint4& a, const vint4& b);
1156 
1157 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1158 vint4 bitcast_to_int (const vbool4& x);
1159 vint4 bitcast_to_int (const vfloat4& x);
1160 vfloat4 bitcast_to_float (const vint4& x);
1161 
1162 void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d); // NOTE(review): presumably an in-place 4x4 transpose of rows a,b,c,d -- confirm
1163 void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
1164  vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3); // same operation, but inputs are const and results go to r0..r3
1165 
1166 vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d); // NOTE(review): name suggests (a[0],b[0],c[0],d[0]) -- confirm
1167 
1168 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1169 vint4 safe_mod (const vint4& a, const vint4& b);
1170 vint4 safe_mod (const vint4& a, int b);
1171 
1172 
1173 
1174 
1175 /// Integer 8-vector, accelerated by SIMD instructions when available.
1176 class vint8 {
1177 public:
1178  static const char* type_name() { return "vint8"; }
1179  typedef int value_t; ///< Underlying equivalent scalar value type
1180  enum { elements = 8 }; ///< Number of scalar elements
1181  enum { paddedelements =8 }; ///< Number of scalar elements for full pad
1182  enum { bits = elements*32 }; ///< Total number of bits
1183  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1184  typedef vbool8 vbool_t; ///< bool type of the same length
1185  typedef vfloat8 vfloat_t; ///< float type of the same length
1186  typedef vint8 vint_t; ///< int type of the same length
1187  typedef vbool8 bool_t; // old name (deprecated 1.8)
1188  typedef vfloat8 float_t; // old name (deprecated 1.8)
1189 
1190  /// Default constructor (contents undefined)
1191  vint8 () { }
1192 
1193  /// Construct from a single value (store it in all slots)
1194  vint8 (int a);
1195 
1196  /// Construct from 2 values -- (a,a,b,b)
1197  vint8 (int a, int b);
1198 
1199  /// Construct from 8 values (won't work for vint8)
1200  vint8 (int a, int b, int c, int d, int e, int f, int g, int h);
1201 
1202  /// Construct from a pointer to values
1203  vint8 (const int *vals);
1204 
1205  /// Construct from a pointer to unsigned short values
1206  explicit vint8 (const unsigned short *vals);
1207 
1208  /// Construct from a pointer to signed short values
1209  explicit vint8 (const short *vals);
1210 
1211  /// Construct from a pointer to unsigned char values (0 - 255)
1212  explicit vint8 (const unsigned char *vals);
1213 
1214  /// Construct from a pointer to signed char values (-128 - 127)
1215  explicit vint8 (const char *vals);
1216 
1217  /// Copy construct from another vint8
1218  vint8 (const vint8 & other) { m_simd = other.m_simd; }
1219 
1220  /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
1221  explicit vint8 (const vfloat8& f); // implementation below
1222 
1223  /// Construct from two vint4's
1224  vint8 (const vint4 &lo, const vint4 &hi);
1225 
1226  /// Construct from the underlying SIMD type
1227  vint8 (const simd_t& m) : m_simd(m) { }
1228 
1229  /// Return the raw SIMD type
1230  operator simd_t () const { return m_simd; }
1231  simd_t simd () const { return m_simd; }
1232 
1233  /// Return a pointer to the underlying scalar type
1234  const value_t* data () const { return (const value_t*)this; }
1235  value_t* data () { return (value_t*)this; }
1236 
1237  /// Sset all components to 0
1238  void clear () ;
1239 
1240  /// Return an vint8 with all components set to 0
1241  static const vint8 Zero ();
1242 
1243  /// Return an vint8 with all components set to 1
1244  static const vint8 One ();
1245 
1246  /// Return an vint8 with all components set to -1 (aka 0xffffffff)
1247  static const vint8 NegOne ();
1248 
1249  /// Return an vint8 with incremented components (e.g., 0,1,2,3).
1250  /// Optional arguments can give a non-zero starting point and step size.
1251  static const vint8 Iota (int start=0, int step=1);
1252 
1253  /// Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
1254  static const vint8 Giota ();
1255 
1256  /// Assign one value to all components.
1257  const vint8 & operator= (int a);
1258 
1259  /// Assignment from another vint8
1260  const vint8 & operator= (const vint8& other) ;
1261 
1262  /// Component access (get)
1263  int operator[] (int i) const;
1264 
1265  /// Component access (set)
1266  int& operator[] (int i);
1267 
1268  /// Component access (set).
1269  void setcomp (int i, int value);
1270 
1271  value_t x () const;
1272  value_t y () const;
1273  value_t z () const;
1274  value_t w () const;
1275  void set_x (value_t val);
1276  void set_y (value_t val);
1277  void set_z (value_t val);
1278  void set_w (value_t val);
1279 
1280  /// Extract the lower percision vint4
1281  vint4 lo () const;
1282 
1283  /// Extract the higher percision vint4
1284  vint4 hi () const;
1285 
1286  /// Helper: load a single int into all components
1287  void load (int a);
1288 
1289  /// Load separate values into each component.
1290  void load (int a, int b, int c, int d, int e, int f, int g, int h);
1291 
1292  /// Load from an array of 8 values
1293  void load (const int *values);
1294 
1295  void load (const int *values, int n) ;
1296 
1297  /// Load from an array of 8 unsigned short values, convert to vint8
1298  void load (const unsigned short *values) ;
1299 
1300  /// Load from an array of 8 unsigned short values, convert to vint8
1301  void load (const short *values);
1302 
1303  /// Load from an array of 8 unsigned char values, convert to vint8
1304  void load (const unsigned char *values);
1305 
1306  /// Load from an array of 8 unsigned char values, convert to vint8
1307  void load (const char *values);
1308 
1309  /// Store the values into memory
1310  void store (int *values) const;
1311 
1312  /// Store the first n values into memory
1313  void store (int *values, int n) const;
1314 
1315  /// Store the least significant 16 bits of each element into adjacent
1316  /// unsigned shorts.
1317  void store (unsigned short *values) const;
1318 
1319  /// Store the least significant 8 bits of each element into adjacent
1320  /// unsigned chars.
1321  void store (unsigned char *values) const;
1322 
1323  /// Masked load -- read from values[] where mask is 1, load zero where
1324  /// mask is 0.
1325  void load_mask (int mask, const value_t *values);
1326  void load_mask (const vbool_t& mask, const value_t *values);
1327 
1328  /// Masked store -- write to values[] where mask is enabled, don't
1329  /// touch values[] where it's not.
1330  void store_mask (int mask, value_t *values) const;
1331  void store_mask (const vbool_t& mask, value_t *values) const;
1332 
1333  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1334  template<int scale=4>
1335  void gather (const value_t *baseptr, const vint_t& vindex);
1336  /// Gather elements defined by the mask, leave others unchanged.
1337  template<int scale=4>
1338  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1339  template<int scale=4>
1340  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1341 
1342  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1343  template<int scale=4>
1344  void scatter (value_t *baseptr, const vint_t& vindex) const;
1345  /// Scatter elements defined by the mask
1346  template<int scale=4>
1347  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1348  template<int scale=4>
1349  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1350 
1351  // Arithmetic operators (component-by-component)
1352  friend vint8 operator+ (const vint8& a, const vint8& b);
1353  friend vint8 operator- (const vint8& a);
1354  friend vint8 operator- (const vint8& a, const vint8& b);
1355  friend vint8 operator* (const vint8& a, const vint8& b);
1356  friend vint8 operator/ (const vint8& a, const vint8& b);
1357  friend vint8 operator% (const vint8& a, const vint8& b);
1358  friend const vint8 & operator+= (vint8& a, const vint8& b);
1359  friend const vint8 & operator-= (vint8& a, const vint8& b);
1360  friend const vint8 & operator*= (vint8& a, const vint8& b);
1361  friend const vint8 & operator/= (vint8& a, const vint8& b);
1362  friend const vint8 & operator%= (vint8& a, const vint8& b);
1363  // Bitwise operators (component-by-component)
1364  friend vint8 operator& (const vint8& a, const vint8& b);
1365  friend vint8 operator| (const vint8& a, const vint8& b);
1366  friend vint8 operator^ (const vint8& a, const vint8& b);
1367  friend const vint8& operator&= (vint8& a, const vint8& b);
1368  friend const vint8& operator|= (vint8& a, const vint8& b);
1369  friend const vint8& operator^= (vint8& a, const vint8& b);
1370  friend vint8 operator~ (const vint8& a);
1371  friend vint8 operator<< (const vint8& a, unsigned int bits);
1372  friend vint8 operator>> (const vint8& a, unsigned int bits);
1373  friend const vint8& operator<<= (vint8& a, unsigned int bits);
1374  friend const vint8& operator>>= (vint8& a, unsigned int bits);
1375  // Comparison operators (component-by-component)
1376  friend vbool8 operator== (const vint8& a, const vint8& b);
1377  friend vbool8 operator!= (const vint8& a, const vint8& b);
1378  friend vbool8 operator< (const vint8& a, const vint8& b);
1379  friend vbool8 operator> (const vint8& a, const vint8& b);
1380  friend vbool8 operator>= (const vint8& a, const vint8& b);
1381  friend vbool8 operator<= (const vint8& a, const vint8& b);
1382 
1383  /// Stream output
1384  friend std::ostream& operator<< (std::ostream& cout, const vint8& a);
1385 
1386 private:
1387  // The actual data representation
1388  union {
1392  };
1393 };
1394 
1395 
1396 
1397 // Shift right logical -- unsigned shift. This differs from operator>>
1398 // in how it handles the sign bit: after (1<<31) >> 1 the sign bit is
1399 // still set, but srl((1<<31),1) == 1<<30.
1400 vint8 srl (const vint8& val, const unsigned int bits);
1401 
1402 /// Helper: shuffle/swizzle with constant (templated) indices, one per output.
1403 /// 4-wide analogue: shuffle<1,1,2,2>(vint4(a,b,c,d)) returns (b,b,c,c)
1404 template<int i0, int i1, int i2, int i3,
1405  int i4, int i5, int i6, int i7> vint8 shuffle (const vint8& a);
1406 
1407 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,i,i,i,i>(a)
1408 template<int i> vint8 shuffle (const vint8& a);
1409 
1410 /// Helper: extract one component as quickly as possible when the index
1411 /// is a compile-time constant (the template argument i).
1412 template<int i> int extract (const vint8& v);
1413 
1414 /// Helper: substitute val for a[i]; the result is returned as a new vint8.
1415 template<int i> vint8 insert (const vint8& a, int val);
1416 
1417 /// The sum of all components, returned in all components.
1418 vint8 vreduce_add (const vint8& v);
1419 
1420 // Reduction across all components, each yielding a single scalar int
1421 int reduce_add (const vint8& v);
1422 int reduce_and (const vint8& v);
1423 int reduce_or (const vint8& v);
1424 
1425 /// Use a bool mask to select between components of a (if mask[i] is false)
1426 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1427 vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);
1428 
1429 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1430 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1431 /// blend(0,a,mask).
1432 vint8 blend0 (const vint8& a, const vbool8& mask);
1433 
1434 /// Use a bool mask to select between components of a (if mask[i] is false)
1435 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1436 /// blend(0,a,!mask), or blend(a,0,mask).
1437 vint8 blend0not (const vint8& a, const vbool8& mask);
1438 
1439 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1440 /// synonym for blend with arguments rearranged, but this is clearer
1441 /// because the arguments are symmetric to scalar (cond ? a : b).
1442 vint8 select (const vbool8& mask, const vint8& a, const vint8& b);
1443 
1444 // Per-element math
1445 vint8 abs (const vint8& a);
1446 vint8 min (const vint8& a, const vint8& b);
1447 vint8 max (const vint8& a, const vint8& b);
1448 
1449 // Circular bit rotate by k bits, applied to all 8 values at once.
1450 vint8 rotl32 (const vint8& x, const unsigned int k);
1451 
1452 /// andnot(a,b) returns ((~a) & b)
1453 vint8 andnot (const vint8& a, const vint8& b);
1454 
1455 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1456 vint8 bitcast_to_int (const vbool8& x);
1457 vint8 bitcast_to_int (const vfloat8& x);
1458 vfloat8 bitcast_to_float (const vint8& x);
1459 
1460 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1461 vint8 safe_mod (const vint8& a, const vint8& b);
1462 vint8 safe_mod (const vint8& a, int b);
1463 
1464 
1465 
1466 
1467 
1468 /// Integer 16-vector, accelerated by SIMD instructions when available.
1469 class vint16 {
1470 public:
1471  static const char* type_name() { return "vint16"; }
1472  typedef int value_t; ///< Underlying equivalent scalar value type
1473  enum { elements = 16 }; ///< Number of scalar elements
1474  enum { paddedelements =16 }; ///< Number of scalar elements for full pad
1475  enum { bits = 128 }; ///< Total number of bits
1476  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1477  typedef vbool16 vbool_t; ///< bool type of the same length
1478  typedef vfloat16 vfloat_t; ///< float type of the same length
1479  typedef vint16 vint_t; ///< int type of the same length
1480  typedef vbool16 bool_t; // old name (deprecated 1.8)
1481  typedef vfloat16 float_t; // old name (deprecated 1.8)
1482 
1483  /// Default constructor (contents undefined)
1484  vint16 () { }
1485 
1486  /// Construct from a single value (store it in all slots)
1487  vint16 (int a);
1488 
1489  /// Construct from 16 values (won't work for vint16)
1490  vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1491  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1492 
1493  /// Construct from a pointer to values
1494  vint16 (const int *vals);
1495 
1496  /// Construct from a pointer to unsigned short values
1497  explicit vint16 (const unsigned short *vals);
1498 
1499  /// Construct from a pointer to signed short values
1500  explicit vint16 (const short *vals);
1501 
1502  /// Construct from a pointer to unsigned char values (0 - 255)
1503  explicit vint16 (const unsigned char *vals);
1504 
1505  /// Construct from a pointer to signed char values (-128 - 127)
1506  explicit vint16 (const char *vals);
1507 
1508  /// Copy construct from another vint16
1509  vint16 (const vint16 & other) { m_simd = other.m_simd; }
1510 
1511  /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
1512  explicit vint16 (const vfloat16& f); // implementation below
1513 
1514  /// Construct from two vint8's
1515  vint16 (const vint8 &lo, const vint8 &hi);
1516 
1517  /// Construct from four vint4's
1518  vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d);
1519 
1520  /// Construct from the underlying SIMD type
1521  vint16 (const simd_t& m) : m_simd(m) { }
1522 
1523  /// Return the raw SIMD type
1524  operator simd_t () const { return m_simd; }
1525  simd_t simd () const { return m_simd; }
1526 
1527  /// Return a pointer to the underlying scalar type
1528  const value_t* data () const { return (const value_t*)this; }
1529  value_t* data () { return (value_t*)this; }
1530 
1531  /// Sset all components to 0
1532  void clear () ;
1533 
1534  /// Return an vint16 with all components set to 0
1535  static const vint16 Zero ();
1536 
1537  /// Return an vint16 with all components set to 1
1538  static const vint16 One ();
1539 
1540  /// Return an vint16 with all components set to -1 (aka 0xffffffff)
1541  static const vint16 NegOne ();
1542 
1543  /// Return an vint16 with incremented components (e.g., 0,1,2,3).
1544  /// Optional arguments can give a non-zero starting point and step size.
1545  static const vint16 Iota (int start=0, int step=1);
1546 
1547  /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
1548  static const vint16 Giota ();
1549 
1550  /// Assign one value to all components.
1551  const vint16 & operator= (int a);
1552 
1553  /// Assignment from another vint16
1554  const vint16 & operator= (const vint16& other) ;
1555 
1556  /// Component access (get)
1557  int operator[] (int i) const;
1558 
1559  /// Component access (set)
1560  int& operator[] (int i);
1561 
1562  /// Component access (set).
1563  void setcomp (int i, int value);
1564 
1565  value_t x () const;
1566  value_t y () const;
1567  value_t z () const;
1568  value_t w () const;
1569  void set_x (value_t val);
1570  void set_y (value_t val);
1571  void set_z (value_t val);
1572  void set_w (value_t val);
1573 
1574  /// Extract the lower percision vint8
1575  vint8 lo () const;
1576 
1577  /// Extract the higher percision vint8
1578  vint8 hi () const;
1579 
1580  /// Helper: load a single int into all components
1581  void load (int a);
1582 
1583  /// Load separate values into each component.
1584  void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1585  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1586 
1587  /// Load from an array of 16 values
1588  void load (const int *values);
1589 
1590  void load (const int *values, int n) ;
1591 
1592  /// Load from an array of 16 unsigned short values, convert to vint16
1593  void load (const unsigned short *values) ;
1594 
1595  /// Load from an array of 16 unsigned short values, convert to vint16
1596  void load (const short *values);
1597 
1598  /// Load from an array of 16 unsigned char values, convert to vint16
1599  void load (const unsigned char *values);
1600 
1601  /// Load from an array of 16 unsigned char values, convert to vint16
1602  void load (const char *values);
1603 
1604  /// Store the values into memory
1605  void store (int *values) const;
1606 
1607  /// Store the first n values into memory
1608  void store (int *values, int n) const;
1609 
1610  /// Store the least significant 16 bits of each element into adjacent
1611  /// unsigned shorts.
1612  void store (unsigned short *values) const;
1613 
1614  /// Store the least significant 8 bits of each element into adjacent
1615  /// unsigned chars.
1616  void store (unsigned char *values) const;
1617 
1618  /// Masked load -- read from values[] where mask is 1, load zero where
1619  /// mask is 0.
1620  void load_mask (const vbool_t &mask, const value_t *values);
1621  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
1622 
1623  /// Masked store -- write to values[] where mask is enabled, don't
1624  /// touch values[] where it's not.
1625  void store_mask (const vbool_t &mask, value_t *values) const;
1626  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
1627 
1628  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1629  template<int scale=4>
1630  void gather (const value_t *baseptr, const vint_t& vindex);
1631  /// Gather elements defined by the mask, leave others unchanged.
1632  template<int scale=4>
1633  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1634  template<int scale=4>
1635  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
1636  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
1637  }
1638 
1639  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1640  template<int scale=4>
1641  void scatter (value_t *baseptr, const vint_t& vindex) const;
1642  /// Scatter elements defined by the mask
1643  template<int scale=4>
1644  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1645  template<int scale=4>
1646  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
1647  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
1648  }
1649 
1650  // Arithmetic operators (component-by-component)
1651  friend vint16 operator+ (const vint16& a, const vint16& b);
1652  friend vint16 operator- (const vint16& a);
1653  friend vint16 operator- (const vint16& a, const vint16& b);
1654  friend vint16 operator* (const vint16& a, const vint16& b);
1655  friend vint16 operator/ (const vint16& a, const vint16& b);
1656  friend vint16 operator% (const vint16& a, const vint16& b);
1657  friend const vint16 & operator+= (vint16& a, const vint16& b);
1658  friend const vint16 & operator-= (vint16& a, const vint16& b);
1659  friend const vint16 & operator*= (vint16& a, const vint16& b);
1660  friend const vint16 & operator/= (vint16& a, const vint16& b);
1661  friend const vint16 & operator%= (vint16& a, const vint16& b);
1662  // Bitwise operators (component-by-component)
1663  friend vint16 operator& (const vint16& a, const vint16& b);
1664  friend vint16 operator| (const vint16& a, const vint16& b);
1665  friend vint16 operator^ (const vint16& a, const vint16& b);
1666  friend const vint16& operator&= (vint16& a, const vint16& b);
1667  friend const vint16& operator|= (vint16& a, const vint16& b);
1668  friend const vint16& operator^= (vint16& a, const vint16& b);
1669  friend vint16 operator~ (const vint16& a);
1670  friend vint16 operator<< (const vint16& a, unsigned int bits);
1671  friend vint16 operator>> (const vint16& a, unsigned int bits);
1672  friend const vint16& operator<<= (vint16& a, unsigned int bits);
1673  friend const vint16& operator>>= (vint16& a, unsigned int bits);
1674  // Comparison operators (component-by-component)
1675  friend vbool16 operator== (const vint16& a, const vint16& b);
1676  friend vbool16 operator!= (const vint16& a, const vint16& b);
1677  friend vbool16 operator< (const vint16& a, const vint16& b);
1678  friend vbool16 operator> (const vint16& a, const vint16& b);
1679  friend vbool16 operator>= (const vint16& a, const vint16& b);
1680  friend vbool16 operator<= (const vint16& a, const vint16& b);
1681 
1682  /// Stream output
1683  friend std::ostream& operator<< (std::ostream& cout, const vint16& a);
1684 
1685 private:
1686  // The actual data representation
1687  union {
1691  };
1692 };
1693 
1694 
1695 
1696 /// Shift right logical -- unsigned shift. This differs from operator>>
1697 /// in how it handles the sign bit: srl shifts zeros in from the top, so
1698 /// srl((1<<31),1) == 1<<30, whereas operator>> leaves the sign bit set.
1699 vint16 srl (const vint16& val, const unsigned int bits);
1700 
1701 /// Shuffle groups of 4
1702 template<int i0, int i1, int i2, int i3>
1703 vint16 shuffle4 (const vint16& a);
1704 
1705 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
1706 template<int i> vint16 shuffle4 (const vint16& a);
1707 
1708 /// Shuffle within each group of 4
1709 template<int i0, int i1, int i2, int i3>
1710 vint16 shuffle (const vint16& a);
1711 
1712 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1713 template<int i> vint16 shuffle (const vint16& a);
1714 
1715 /// Helper: as rapid as possible extraction of one component, when the
1716 /// index is fixed.
1717 template<int i> int extract (const vint16& v);
1718 
1719 /// Helper: substitute val for a[i]
1720 template<int i> vint16 insert (const vint16& a, int val);
1721 
1722 /// The sum of all components, returned in all components.
1723 vint16 vreduce_add (const vint16& v);
1724 
1725 // Reduction across all components, returned as a scalar
1726 int reduce_add (const vint16& v); ///< The sum of all components
1727 int reduce_and (const vint16& v);
1728 int reduce_or (const vint16& v);
1729 
1730 /// Use a bool mask to select between components of a (if mask[i] is false)
1731 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1732 vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);
1733 
1734 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
1735 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1736 /// blend(0,a,mask).
1737 vint16 blend0 (const vint16& a, const vbool16& mask);
1738 
1739 /// Use a bool mask to select between components of a (if mask[i] is false)
1740 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1741 /// blend(0,a,!mask), or blend(a,0,mask).
1742 vint16 blend0not (const vint16& a, const vbool16& mask);
1743 
1744 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1745 /// synonym for blend with arguments rearranged, but this is more clear
1746 /// because the arguments are symmetric to scalar (cond ? a : b).
1747 vint16 select (const vbool16& mask, const vint16& a, const vint16& b);
1748 
1749 // Per-element math
1750 vint16 abs (const vint16& a); ///< Per-element absolute value
1751 vint16 min (const vint16& a, const vint16& b); ///< Per-element min
1752 vint16 max (const vint16& a, const vint16& b); ///< Per-element max
1753 
1754 // Circular bit rotate by k bits, for N values at once.
1755 vint16 rotl32 (const vint16& x, const unsigned int k);
1756 
1757 /// andnot(a,b) returns ((~a) & b)
1758 vint16 andnot (const vint16& a, const vint16& b);
1759 
1760 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1761 vint16 bitcast_to_int (const vbool16& x);
1762 vint16 bitcast_to_int (const vfloat16& x);
1763 vfloat16 bitcast_to_float (const vint16& x);
1764 
1765 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1766 vint16 safe_mod (const vint16& a, const vint16& b);
1767 vint16 safe_mod (const vint16& a, int b);
1768 
1769 
1770 
1771 
1772 
1773 /// Floating point 4-vector, accelerated by SIMD instructions when
1774 /// available.
1775 class vfloat4 {
1776 public:
1777  static const char* type_name() { return "vfloat4"; }
1778  typedef float value_t; ///< Underlying equivalent scalar value type
1779  enum { elements = 4 }; ///< Number of scalar elements
1780  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
1781  enum { bits = elements*32 }; ///< Total number of bits
1782  typedef simd_raw_t<float,4>::type simd_t; ///< the native SIMD type used
1783  typedef vfloat4 vfloat_t; ///< SIMD float type
1784  typedef vint4 vint_t; ///< SIMD int type
1785  typedef vbool4 vbool_t; ///< SIMD bool type
1786  typedef vint4 int_t; // old name (deprecated 1.8)
1787  typedef vbool4 bool_t; // old name (deprecated 1.8)
1788 
1789  /// Default constructor (contents undefined)
1790  vfloat4 () { }
1791 
1792  /// Construct from a single value (store it in all slots)
1793  vfloat4 (float a) { load(a); }
1794 
1795  /// Construct from 3 or 4 values (if only 3 are given, the 4th is 0)
1796  vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }
1797 
1798  /// Construct from a pointer to 4 values
1799  vfloat4 (const float *f) { load (f); }
1800 
1801  /// Copy construct from another vfloat4
1802  vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
1803 
1804  /// Construct from a vint4 (promoting all components to float)
1805  explicit vfloat4 (const vint4& ival);
1806 
1807  /// Construct from the underlying SIMD type
1808  vfloat4 (const simd_t& m) : m_simd(m) { }
1809 
1810  /// Return the raw SIMD type
1811  operator simd_t () const { return m_simd; }
1812  simd_t simd () const { return m_simd; }
1813 
1814  /// Return a pointer to the underlying scalar type
1815  const value_t* data () const { return (const value_t*)this; }
1816  value_t* data () { return (value_t*)this; }
1817 
1818  /// Construct from an Imath::V3f (only 3 values are passed to load())
1819  vfloat4 (const Imath::V3f &v) { load (v[0], v[1], v[2]); }
1820 
1821  /// View as an Imath::V3f (reinterprets this object's memory, no copy)
1822  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
1823 
1824  /// Construct from an Imath::V4f
1825  vfloat4 (const Imath::V4f &v) { load ((const float *)&v); }
1826 
1827  /// View as an Imath::V4f (reinterprets this object's memory, no copy)
1828  const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }
1829 
1830  /// Construct from a pointer to 4 unsigned short values
1831  explicit vfloat4 (const unsigned short *vals) { load(vals); }
1832 
1833  /// Construct from a pointer to 4 short values
1834  explicit vfloat4 (const short *vals) { load(vals); }
1835 
1836  /// Construct from a pointer to 4 unsigned char values
1837  explicit vfloat4 (const unsigned char *vals) { load(vals); }
1838 
1839  /// Construct from a pointer to 4 char values
1840  explicit vfloat4 (const char *vals) { load(vals); }
1841 
1842 #ifdef _HALF_H_
1843  /// Construct from a pointer to 4 half (16 bit float) values
1844  explicit vfloat4 (const half *vals) { load(vals); }
1845 #endif
1846 
1847  /// Assign a single value to all components
1848  const vfloat4 & operator= (float a) { load(a); return *this; }
1849 
1850  /// Assign a vfloat4
1851  const vfloat4 & operator= (vfloat4 other) {
1852  m_simd = other.m_simd;
1853  return *this;
1854  }
1855 
1856  /// Return a vfloat4 with all components set to 0.0
1857  static const vfloat4 Zero ();
1858 
1859  /// Return a vfloat4 with all components set to 1.0
1860  static const vfloat4 One ();
1861 
1862  /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
1863  /// Optional argument can give a non-zero starting point and non-1 step.
1864  static const vfloat4 Iota (float start=0.0f, float step=1.0f);
1865 
1866  /// Set all components to 0.0
1867  void clear ();
1868 
1869  /// Assign from an Imath::V4f
1870  const vfloat4 & operator= (const Imath::V4f &v);
1871 
1872  /// Assign from an Imath::V3f
1873  const vfloat4 & operator= (const Imath::V3f &v);
1874 
1875  /// Component access (get)
1876  float operator[] (int i) const;
1877  /// Component access (set)
1878  float& operator[] (int i);
1879 
1880  /// Component access (set).
1881  void setcomp (int i, float value);
1882 
1883  value_t x () const;
1884  value_t y () const;
1885  value_t z () const;
1886  value_t w () const;
1887  void set_x (value_t val);
1888  void set_y (value_t val);
1889  void set_z (value_t val);
1890  void set_w (value_t val);
1891 
1892  /// Helper: load a single value into all components
1893  void load (float val);
1894 
1895  /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
1896  void load (float a, float b, float c, float d=0.0f);
1897 
1898  /// Load from an array of 4 values
1899  void load (const float *values);
1900 
1901  /// Load from a partial array of <=4 values. Unassigned values are
1902  /// undefined.
1903  void load (const float *values, int n);
1904 
1905  /// Load from an array of 4 unsigned short values, convert to float
1906  void load (const unsigned short *values);
1907 
1908  /// Load from an array of 4 short values, convert to float
1909  void load (const short *values);
1910 
1911  /// Load from an array of 4 unsigned char values, convert to float
1912  void load (const unsigned char *values);
1913 
1914  /// Load from an array of 4 char values, convert to float
1915  void load (const char *values);
1916 
1917 #ifdef _HALF_H_
1918  /// Load from an array of 4 half values, convert to float
1919  void load (const half *values);
1920 #endif /* _HALF_H_ */
1921 
1922  void store (float *values) const; ///< Store the values into memory
1923 
1924  /// Store the first n values into memory
1925  void store (float *values, int n) const;
1926 
1927 #ifdef _HALF_H_
1928  void store (half *values) const;
1929 #endif
1930 
1931  /// Masked load -- read from values[] where mask is 1, load zero where
1932  /// mask is 0.
1933  void load_mask (int mask, const value_t *values);
1934  void load_mask (const vbool_t& mask, const value_t *values);
1935 
1936  /// Masked store -- write to values[] where mask is enabled, don't
1937  /// touch values[] where it's not.
1938  void store_mask (int mask, value_t *values) const;
1939  void store_mask (const vbool_t& mask, value_t *values) const;
1940 
1941  /// Load values from addresses (char*)baseptr + vindex[i]*scale
1942  template<int scale=4>
1943  void gather (const value_t *baseptr, const vint_t& vindex);
1944  /// Gather elements defined by the mask, leave others unchanged.
1945  template<int scale=4>
1946  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
1947  template<int scale=4>
1948  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1949 
1950  /// Store values at addresses (char*)baseptr + vindex[i]*scale
1951  template<int scale=4>
1952  void scatter (value_t *baseptr, const vint_t& vindex) const;
1953  /// Scatter elements defined by the mask
1954  template<int scale=4>
1955  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1956  template<int scale=4>
1957  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1958 
1959  // Arithmetic operators
1960  friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
1961  const vfloat4 & operator+= (const vfloat4& a);
1962  vfloat4 operator- () const;
1963  friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
1964  const vfloat4 & operator-= (const vfloat4& a);
1965  friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
1966  const vfloat4 & operator*= (const vfloat4& a);
1967  const vfloat4 & operator*= (float val);
1968  friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
1969  const vfloat4 & operator/= (const vfloat4& a);
1970  const vfloat4 & operator/= (float val);
1971 
1972  // Comparison operations (each returns a vbool4 mask)
1973  friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
1974  friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
1975  friend vbool4 operator< (const vfloat4& a, const vfloat4& b);
1976  friend vbool4 operator> (const vfloat4& a, const vfloat4& b);
1977  friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
1978  friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);
1979 
1980  // Some oddball items that are handy
1981 
1982  /// Combine the first two components of A with the first two components
1983  /// of B.
1984  friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);
1985 
1986  /// Combine the first two components of A with the first two components
1987  /// of B, but interleaved.
1988  friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);
1989 
1990  /// Return xyz components, plus 0 for w
1991  vfloat4 xyz0 () const;
1992 
1993  /// Return xyz components, plus 1 for w
1994  vfloat4 xyz1 () const;
1995 
1996  /// Stream output
1997  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);
1998 
1999 protected:
2000  // The actual data representation
2001  union { // NOTE(review): the union members (listing lines 2002-2003) are absent from this listing
2004  };
2005 };
2006 
2007 
2008 /// Helper: shuffle/swizzle with constant (templated) indices.
2009 /// Example: shuffle<1,1,2,2>(vfloat4(a,b,c,d)) returns (b,b,c,c)
2010 template<int i0, int i1, int i2, int i3> vfloat4 shuffle (const vfloat4& a);
2011 
2012 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2013 template<int i> vfloat4 shuffle (const vfloat4& a);
2014 
2015 /// Helper: as rapid as possible extraction of one component, when the
2016 /// index is fixed.
2017 template<int i> float extract (const vfloat4& a);
2018 
2019 /// Helper: substitute val for a[i]
2020 template<int i> vfloat4 insert (const vfloat4& a, float val);
2021 
2022 /// The sum of all components, returned in all components.
2023 vfloat4 vreduce_add (const vfloat4& v);
2024 
2025 /// The sum of all components, returned as a scalar.
2026 float reduce_add (const vfloat4& v);
2027 
2028 /// Return the float dot (inner) product of a and b in every component.
2029 vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);
2030 
2031 /// Return the float dot (inner) product of a and b.
2032 float dot (const vfloat4 &a, const vfloat4 &b);
2033 
2034 /// Return the float 3-component dot (inner) product of a and b in
2035 /// all components.
2036 vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);
2037 
2038 /// Return the float 3-component dot (inner) product of a and b.
2039 float dot3 (const vfloat4 &a, const vfloat4 &b);
2040 
2041 /// Use a bool mask to select between components of a (if mask[i] is false)
2042 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2043 vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);
2044 
2045 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
2046 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2047 /// blend(0,a,mask).
2048 vfloat4 blend0 (const vfloat4& a, const vbool4& mask);
2049 
2050 /// Use a bool mask to select between components of a (if mask[i] is false)
2051 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2052 /// blend(0,a,!mask), or blend(a,0,mask).
2053 vfloat4 blend0not (const vfloat4& a, const vbool4& mask);
2054 
2055 /// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
2056 /// that is 0, return 0 rather than Inf.
2057 vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);
2058 
2059 /// Homogeneous divide to turn a vfloat4 into a vfloat3.
2060 vfloat3 hdiv (const vfloat4 &a);
2061 
2062 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2063 /// synonym for blend with arguments rearranged, but this is more clear
2064 /// because the arguments are symmetric to scalar (cond ? a : b).
2065 vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);
2066 
2067 // Per-element math
2068 vfloat4 abs (const vfloat4& a); ///< absolute value (float)
2069 vfloat4 sign (const vfloat4& a); ///< 1.0 when value >= 0, -1 when negative
2070 vfloat4 ceil (const vfloat4& a);
2071 vfloat4 floor (const vfloat4& a);
2072 vint4 ifloor (const vfloat4& a); ///< (int)floor
2073 inline vint4 floori (const vfloat4& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2074 
2075 /// Per-element round to nearest integer (rounding away from 0 in cases
2076 /// that are exactly half way).
2077 vfloat4 round (const vfloat4& a);
2078 
2079 /// Per-element round to nearest integer (rounding away from 0 in cases
2080 /// that are exactly half way), returned as a vint4.
2081 vint4 rint (const vfloat4& a);
2082 
2083 vfloat4 rcp_fast (const vfloat4 &a); ///< Fast, approximate 1/a
2084 vfloat4 sqrt (const vfloat4 &a);
2085 vfloat4 rsqrt (const vfloat4 &a); ///< Fully accurate 1/sqrt
2086 vfloat4 rsqrt_fast (const vfloat4 &a); ///< Fast, approximate 1/sqrt
2087 vfloat4 min (const vfloat4& a, const vfloat4& b); ///< Per-element min
2088 vfloat4 max (const vfloat4& a, const vfloat4& b); ///< Per-element max
2089 template <typename T> T exp (const T& v); // template for all SIMD variants
2090 template <typename T> T log (const T& v);
2091 
2092 /// andnot(a,b) returns ((~a) & b)
2093 vfloat4 andnot (const vfloat4& a, const vfloat4& b);
2094 
2095 // Fused multiply and add (or subtract):
2096 vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
2097 vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
2098 vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
2099 vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c
2100 
2101 /// Transpose the rows and columns of the 4x4 matrix [a b c d].
2102 /// In the end, a will have the original (a[0], b[0], c[0], d[0]),
2103 /// b will have the original (a[1], b[1], c[1], d[1]), and so on.
2104 void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
2105 void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
2106  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2107 
2108 /// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
2109 vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
2110  const vfloat4& c, const vfloat4& d);
2111 
2112 
2113 
2114 /// Floating point 3-vector, aligned to be internally identical to a vfloat4.
2115 /// The way it differs from vfloat4 is that all of the load functions only
2116 /// load three values, and all the stores only store 3 values. The vast
2117 /// majority of ops just fall back to the vfloat4 version, and so will
2118 /// operate on the 4th component, but we won't care about those results.
2119 class vfloat3 : public vfloat4 {
2120 public:
2121  static const char* type_name() { return "vfloat3"; }
2122  enum { elements = 3 }; ///< Number of scalar elements
2123  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
2124 
2125  /// Default constructor (contents undefined)
2126  vfloat3 () { }
2127 
2128  /// Construct from a single value (store it in all slots)
2129  vfloat3 (float a) { load(a); }
2130 
2131  /// Construct from 3 values
2132  vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }
2133 
2134  /// Construct from a pointer to 3 values
2135  vfloat3 (const float *f) { load (f); }
2136 
2137  /// Copy construct from another vfloat3
2138  vfloat3 (const vfloat3 &other);
2139 
2140  explicit vfloat3 (const vfloat4 &other); ///< Explicit conversion from a vfloat4
2141 
2142 #if OIIO_SIMD
2143  /// Construct from the underlying SIMD type
2144  explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
2145 #endif
2146 
2147  /// Construct from an Imath::V3f
2148  vfloat3 (const Imath::V3f &v) : vfloat4(v) { }
2149 
2150  /// View as an Imath::V3f (reinterprets this object's memory, no copy)
2151  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
2152 
2153  /// Construct from a pointer to 3 unsigned short values
2154  explicit vfloat3 (const unsigned short *vals) { load(vals); }
2155 
2156  /// Construct from a pointer to 3 short values
2157  explicit vfloat3 (const short *vals) { load(vals); }
2158 
2159  /// Construct from a pointer to 3 unsigned char values
2160  explicit vfloat3 (const unsigned char *vals) { load(vals); }
2161 
2162  /// Construct from a pointer to 3 char values
2163  explicit vfloat3 (const char *vals) { load(vals); }
2164 
2165 #ifdef _HALF_H_
2166  /// Construct from a pointer to 3 half (16 bit float) values
2167  explicit vfloat3 (const half *vals) { load(vals); }
2168 #endif
2169 
2170  /// Assign a single value to all components
2171  const vfloat3 & operator= (float a) { load(a); return *this; }
2172 
2173  /// Return a vfloat3 with all components set to 0.0
2174  static const vfloat3 Zero ();
2175 
2176  /// Return a vfloat3 with all components set to 1.0
2177  static const vfloat3 One ();
2178 
2179  /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
2180  /// Optional argument can give a non-zero starting point and non-1 step.
2181  static const vfloat3 Iota (float start=0.0f, float step=1.0f);
2182 
2183  /// Helper: load a single value into all components
2184  void load (float val);
2185 
2186  /// Load from an array of 3 values
2187  void load (const float *values);
2188 
2189  /// Load from a partial array of n values (presumably n <= 3 -- confirm)
2190  void load (const float *values, int n);
2191 
2192  /// Load from an array of 3 unsigned short values, convert to float
2193  void load (const unsigned short *values);
2194 
2195  /// Load from an array of 3 short values, convert to float
2196  void load (const short *values);
2197 
2198  /// Load from an array of 3 unsigned char values, convert to float
2199  void load (const unsigned char *values);
2200 
2201  /// Load from an array of 3 char values, convert to float
2202  void load (const char *values);
2203 
2204 #ifdef _HALF_H_
2205  /// Load from an array of 3 half values, convert to float
2206  void load (const half *values);
2207 #endif /* _HALF_H_ */
2208 
2209  void store (float *values) const; ///< Store the 3 values into memory
2210 
2211  void store (float *values, int n) const; ///< Store the first n values into memory
2212 
2213 #ifdef _HALF_H_
2214  void store (half *values) const;
2215 #endif
2216 
2217  /// Store into an Imath::V3f reference.
2218  void store (Imath::V3f &vec) const;
2219 
2220  // Math operators -- defined in terms of vfloat3 (these take and return
2221  // vfloat3 rather than the vfloat4 base type).
2221  friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
2222  const vfloat3 & operator+= (const vfloat3& a);
2223  vfloat3 operator- () const;
2224  friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
2225  const vfloat3 & operator-= (const vfloat3& a);
2226  friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
2227  const vfloat3 & operator*= (const vfloat3& a);
2228  const vfloat3 & operator*= (float a);
2229  friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
2230  const vfloat3 & operator/= (const vfloat3& a);
2231  const vfloat3 & operator/= (float a);
2232 
2233  vfloat3 normalized () const;
2234  vfloat3 normalized_fast () const;
2235 
2236  /// Stream output
2237  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
2238 };
2239 
2240 
2241 
2242 
2243 /// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
2244 /// not in registers) isomorphic to Imath::M44f.
2245 class matrix44 {
2246 public:
2247  // Uninitialized. NOTE(review): the constructor's signature line (2248) is absent from this listing.
2249 #ifndef OIIO_SIMD_SSE
2250  : m_mat(Imath::UNINITIALIZED)
2251 #endif
2252  { }
2253 
2254  /// Construct from a reference to an Imath::M44f. NOTE(review): signature line (2255) is absent from this listing.
2256 #if OIIO_SIMD_SSE
2257  m_row[0].load (M[0]);
2258  m_row[1].load (M[1]);
2259  m_row[2].load (M[2]);
2260  m_row[3].load (M[3]);
2261 #else
2262  m_mat = M;
2263 #endif
2264  }
2265 
2266  /// Construct from a float array (read as 4 consecutive rows of 4 floats)
2267  OIIO_FORCEINLINE explicit matrix44 (const float *f) {
2268 #if OIIO_SIMD_SSE
2269  m_row[0].load (f+0);
2270  m_row[1].load (f+4);
2271  m_row[2].load (f+8);
2272  m_row[3].load (f+12);
2273 #else
2274  m_mat = *(const Imath::M44f*)f;
2275 #endif
2276  }
2277 
2278  /// Construct from 4 vfloat4 rows
2279  OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
2280  const vfloat4& c, const vfloat4& d) {
2281 #if OIIO_SIMD_SSE
2282  m_row[0] = a; m_row[1] = b; m_row[2] = c; m_row[3] = d;
2283 #else
2284  a.store (m_mat[0]);
2285  b.store (m_mat[1]);
2286  c.store (m_mat[2]);
2287  d.store (m_mat[3]);
2288 #endif
2289  }
2290  /// Construct from 4 float[4] rows
2291  OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
2292  const float *c, const float *d) {
2293 #if OIIO_SIMD_SSE
2294  m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2295 #else
2296  memcpy (m_mat[0], a, 4*sizeof(float));
2297  memcpy (m_mat[1], b, 4*sizeof(float));
2298  memcpy (m_mat[2], c, 4*sizeof(float));
2299  memcpy (m_mat[3], d, 4*sizeof(float));
2300 #endif
2301  }
2302 
2303  /// Construct from 16 floats, given row by row (fRC is row R, column C)
2304  OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
2305  float f10, float f11, float f12, float f13,
2306  float f20, float f21, float f22, float f23,
2307  float f30, float f31, float f32, float f33)
2308  {
2309 #if OIIO_SIMD_SSE
2310  m_row[0].load (f00, f01, f02, f03);
2311  m_row[1].load (f10, f11, f12, f13);
2312  m_row[2].load (f20, f21, f22, f23);
2313  m_row[3].load (f30, f31, f32, f33);
2314 #else
2315  m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2316  m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2317  m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2318  m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2319 #endif
2320  }
2321 
2322  /// Present as an Imath::M44f
2323  const Imath::M44f& M44f() const;
2324 
2325  /// Return one row
2326  vfloat4 operator[] (int i) const;
2327 
2328  /// Return the transposed matrix
2329  matrix44 transposed () const;
2330 
2331  /// Transform 3-point V by 4x4 matrix M.
2332  vfloat3 transformp (const vfloat3 &V) const;
2333 
2334  /// Transform 3-vector V by 4x4 matrix M.
2335  vfloat3 transformv (const vfloat3 &V) const;
2336 
2337  /// Transform 3-vector V by the transpose of 4x4 matrix M.
2338  vfloat3 transformvT (const vfloat3 &V) const;
2339 
2340  friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
2341  friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);
2342 
2343  bool operator== (const matrix44& m) const;
2344 
2345  bool operator== (const Imath::M44f& m) const ;
2346  friend bool operator== (const Imath::M44f& a, const matrix44 &b);
2347 
2348  bool operator!= (const matrix44& m) const;
2349 
2350  bool operator!= (const Imath::M44f& m) const;
2351  friend bool operator!= (const Imath::M44f& a, const matrix44 &b);
2352 
2353  /// Return the inverse of the matrix.
2354  matrix44 inverse() const;
2355 
2356  /// Stream output
2357  friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);
2358 
2359 private:
2360 #if OIIO_SIMD_SSE
2361  vfloat4 m_row[4]; ///< The four rows as vfloat4's (when OIIO_SIMD_SSE is set)
2362 #else
2363  Imath::M44f m_mat; ///< Imath matrix storage (when OIIO_SIMD_SSE is not set)
2364 #endif
2365 };
2366 
2367 /// Transform 3-point V by 4x4 matrix M.
2368 vfloat3 transformp (const matrix44 &M, const vfloat3 &V);
2369 vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V);
2370 
2371 /// Transform 3-vector V by 4x4 matrix M.
2372 vfloat3 transformv (const matrix44 &M, const vfloat3 &V);
2373 vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V);
2374 
2375 /// Transform 3-vector V by the transpose of 4x4 matrix M.
2376 vfloat3 transformvT (const matrix44 &M, const vfloat3 &V);
2377 vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V);
2378 
2379 
2380 
2381 
2382 /// Floating point 8-vector, accelerated by SIMD instructions when
2383 /// available.
2384 class vfloat8 {
2385 public:
2386  static const char* type_name() { return "vfloat8"; }
2387  typedef float value_t; ///< Underlying equivalent scalar value type
2388  enum { elements = 8 }; ///< Number of scalar elements
2389  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
2390  enum { bits = elements*32 }; ///< Total number of bits
2391  typedef simd_raw_t<float,8>::type simd_t; ///< the native SIMD type used
2392  typedef vfloat8 vfloat_t; ///< SIMD float type
2393  typedef vint8 vint_t; ///< SIMD int type
2394  typedef vbool8 vbool_t; ///< SIMD bool type
2395  typedef vint8 int_t; // old name (deprecated 1.8)
2396  typedef vbool8 bool_t; // old name (deprecated 1.8)
2397 
2398  /// Default constructor (contents undefined)
2399  vfloat8 () { }
2400 
2401  /// Construct from a single value (store it in all slots)
2402  vfloat8 (float a) { load(a); }
2403 
2404  /// Construct from 8 values
2405  vfloat8 (float a, float b, float c, float d,
2406  float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); }
2407 
2408  /// Construct from a pointer to 8 values
2409  vfloat8 (const float *f) { load (f); }
2410 
2411  /// Copy construct from another vfloat8
2412  vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }
2413 
2414  /// Construct from an int vector (promoting all components to float)
2415  explicit vfloat8 (const vint8& ival);
2416 
2417  /// Construct from two vfloat4's
2418  vfloat8 (const vfloat4 &lo, const vfloat4 &hi);
2419 
2420  /// Construct from the underlying SIMD type
2421  vfloat8 (const simd_t& m) : m_simd(m) { }
2422 
2423  /// Return the raw SIMD type
2424  operator simd_t () const { return m_simd; }
2425  simd_t simd () const { return m_simd; }
2426 
2427  /// Return a pointer to the underlying scalar type
2428  const value_t* data () const { return (const value_t*)this; }
2429  value_t* data () { return (value_t*)this; }
2430 
2431  /// Construct from a pointer to unsigned short values
2432  explicit vfloat8 (const unsigned short *vals) { load(vals); }
2433 
2434  /// Construct from a pointer to short values
2435  explicit vfloat8 (const short *vals) { load(vals); }
2436 
2437  /// Construct from a pointer to unsigned char values
2438  explicit vfloat8 (const unsigned char *vals) { load(vals); }
2439 
2440  /// Construct from a pointer to char values
2441  explicit vfloat8 (const char *vals) { load(vals); }
2442 
2443 #ifdef _HALF_H_
2444  /// Construct from a pointer to half (16 bit float) values
2445  explicit vfloat8 (const half *vals) { load(vals); }
2446 #endif
2447 
2448  /// Assign a single value to all components
2449  const vfloat8& operator= (float a) { load(a); return *this; }
2450 
2451  /// Assign a vfloat8
2452  const vfloat8& operator= (vfloat8 other) {
2453  m_simd = other.m_simd;
2454  return *this;
2455  }
2456 
2457  /// Return a vfloat8 with all components set to 0.0
2458  static const vfloat8 Zero ();
2459 
2460  /// Return a vfloat8 with all components set to 1.0
2461  static const vfloat8 One ();
2462 
2463  /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...)
2464  /// Optional argument can give a non-zero starting point and non-1 step.
2465  static const vfloat8 Iota (float start=0.0f, float step=1.0f);
2466 
2467  /// Set all components to 0.0
2468  void clear ();
2469 
2470  /// Component access (get)
2471  float operator[] (int i) const;
2472  /// Component access (set)
2473  float& operator[] (int i);
2474 
2475  /// Component access (set).
2476  void setcomp (int i, float value);
2477 
2478  value_t x () const;
2479  value_t y () const;
2480  value_t z () const;
2481  value_t w () const;
2482  void set_x (value_t val);
2483  void set_y (value_t val);
2484  void set_z (value_t val);
2485  void set_w (value_t val);
2486 
2487  /// Extract the lower precision vfloat4
2488  vfloat4 lo () const;
2489 
2490  /// Extract the higher precision vfloat4
2491  vfloat4 hi () const;
2492 
2493  /// Helper: load a single value into all components
2494  void load (float val);
2495 
2496  /// Helper: load 8 values
2497  void load (float a, float b, float c, float d,
2498  float e, float f, float g, float h);
2499 
2500  /// Load from an array of values
2501  void load (const float *values);
2502 
2503  /// Load from a partial array of <=8 values. Unassigned values are
2504  /// undefined.
2505  void load (const float *values, int n);
2506 
2507  /// Load from an array of 8 unsigned short values, convert to float
2508  void load (const unsigned short *values);
2509 
2510  /// Load from an array of 8 short values, convert to float
2511  void load (const short *values);
2512 
2513  /// Load from an array of 8 unsigned char values, convert to float
2514  void load (const unsigned char *values);
2515 
2516  /// Load from an array of 8 char values, convert to float
2517  void load (const char *values);
2518 
2519 #ifdef _HALF_H_
2520  /// Load from an array of 8 half values, convert to float
2521  void load (const half *values);
2522 #endif /* _HALF_H_ */
2523 
2524  void store (float *values) const;
2525 
2526  /// Store the first n values into memory
2527  void store (float *values, int n) const;
2528 
2529 #ifdef _HALF_H_
2530  void store (half *values) const;
2531 #endif
2532 
2533  /// Masked load -- read from values[] where mask is 1, load zero where
2534  /// mask is 0.
2535  void load_mask (int mask, const value_t *values);
2536  void load_mask (const vbool_t& mask, const value_t *values);
2537 
2538  /// Masked store -- write to values[] where mask is enabled, don't
2539  /// touch values[] where it's not.
2540  void store_mask (int mask, value_t *values) const;
2541  void store_mask (const vbool_t& mask, value_t *values) const;
2542 
2543  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2544  template<int scale=4>
2545  void gather (const value_t *baseptr, const vint_t& vindex);
2546  template<int scale=4>
2547  /// Gather elements defined by the mask, leave others unchanged.
2548  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
2549  template<int scale=4>
2550  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
2551 
2552  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2553  template<int scale=4>
2554  void scatter (value_t *baseptr, const vint_t& vindex) const;
2555  /// Scatter elements defined by the mask
2556  template<int scale=4>
2557  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2558  template<int scale=4>
2559  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
2560 
2561  // Arithmetic operators (component-by-component)
2562  friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
2563  friend vfloat8 operator- (const vfloat8& a);
2564  friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
2565  friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
2566  friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
2567  friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
2568  friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
2569  friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
2570  friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
2571  friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);
2572 
2573  // Comparison operations
2574  friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
2575  friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
2576  friend vbool8 operator< (const vfloat8& a, const vfloat8& b);
2577  friend vbool8 operator> (const vfloat8& a, const vfloat8& b);
2578  friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
2579  friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);
2580 
2581  // Some oddball items that are handy
2582 
2583  /// Stream output
2584  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);
2585 
2586 protected:
2587  // The actual data representation
2588  union {
2592  };
2593 };
2594 
2595 
2596 /// Helper: shuffle/swizzle with constant (templated) indices.
2597 /// Example (4-wide analogue): shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
2598 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
2599 vfloat8 shuffle (const vfloat8& a);
2600 
2601 /// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
2602 template<int i> vfloat8 shuffle (const vfloat8& a);
2603 
2604 /// Helper: as rapid as possible extraction of one component, when the
2605 /// index is fixed.
2606 template<int i> float extract (const vfloat8& a);
2607 
2608 /// Helper: substitute val for a[i]
2609 template<int i> vfloat8 insert (const vfloat8& a, float val);
2610 
2611 /// The sum of all components, returned in all components.
2612 vfloat8 vreduce_add (const vfloat8& v);
2613 
2614 /// The sum of all components, returned as a scalar.
2615 float reduce_add (const vfloat8& v);
2616 
2617 /// Return the float dot (inner) product of a and b in every component.
2618 vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);
2619 
2620 /// Return the float dot (inner) product of a and b.
2621 float dot (const vfloat8 &a, const vfloat8 &b);
2622 
2623 /// Return the float 3-component dot (inner) product of a and b in
2624 /// all components.
2625 vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);
2626 
2627 /// Return the float 3-component dot (inner) product of a and b.
2628 float dot3 (const vfloat8 &a, const vfloat8 &b);
2629 
2630 /// Use a bool mask to select between components of a (if mask[i] is false)
2631 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2632 vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);
2633 
2634 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
2635 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2636 /// blend(0,a,mask).
2637 vfloat8 blend0 (const vfloat8& a, const vbool8& mask);
2638 
2639 /// Use a bool mask to select between components of a (if mask[i] is false)
2640 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2641 /// blend(0,a,!mask), or blend(a,0,mask).
2642 vfloat8 blend0not (const vfloat8& a, const vbool8& mask);
2643 
2644 /// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
2645 /// that is 0, return 0 rather than Inf.
2646 vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);
2647 
2648 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2649 /// synonym for blend with arguments rearranged, but this is more clear
2650 /// because the arguments are symmetric to scalar (cond ? a : b).
2651 vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);
2652 
2653 // Per-element math
2654 vfloat8 abs (const vfloat8& a); ///< absolute value (float)
2655 vfloat8 sign (const vfloat8& a); ///< 1.0 when value >= 0, -1 when negative
2656 vfloat8 ceil (const vfloat8& a);
2657 vfloat8 floor (const vfloat8& a);
2658 vint8 ifloor (const vfloat8& a); ///< (int)floor
2659 inline vint8 floori (const vfloat8& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2660 
2661 /// Per-element round to nearest integer (rounding away from 0 in cases
2662 /// that are exactly half way).
2663 vfloat8 round (const vfloat8& a);
2664 
2665 /// Per-element round to nearest integer (rounding away from 0 in cases
2666 /// that are exactly half way).
2667 vint8 rint (const vfloat8& a);
2668 
2669 vfloat8 rcp_fast (const vfloat8 &a); ///< Fast, approximate 1/a
2670 vfloat8 sqrt (const vfloat8 &a);
2671 vfloat8 rsqrt (const vfloat8 &a); ///< Fully accurate 1/sqrt
2672 vfloat8 rsqrt_fast (const vfloat8 &a); ///< Fast, approximate 1/sqrt
2673 vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min
2674 vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max
2675 // vfloat8 exp (const vfloat8& v); // See template with vfloat4
2676 // vfloat8 log (const vfloat8& v); // See template with vfloat4
2677 
2678 /// andnot(a,b) returns ((~a) & b)
2679 vfloat8 andnot (const vfloat8& a, const vfloat8& b);
2680 
2681 // Fused multiply and add (or subtract):
2682 vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b + c
2683 vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // a*b - c
2684 vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c
2685 vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c
2686 
2687 
2688 
2689 /// Floating point 16-vector, accelerated by SIMD instructions when
2690 /// available.
2691 class vfloat16 {
2692 public:
2693  static const char* type_name() { return "vfloat16"; }
2694  typedef float value_t; ///< Underlying equivalent scalar value type
2695  enum { elements = 16 }; ///< Number of scalar elements
2696  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
2697  enum { bits = elements*32 }; ///< Total number of bits
2698  typedef simd_raw_t<float,16>::type simd_t; ///< the native SIMD type used
2699  typedef vfloat16 vfloat_t; ///< SIMD float type
2700  typedef vint16 vint_t; ///< SIMD int type
2701  typedef vbool16 vbool_t; ///< SIMD bool type
2702  typedef vint16 int_t; // old name (deprecated 1.8)
2703  typedef vbool16 bool_t; // old name (deprecated 1.8)
2704 
2705  /// Default constructor (contents undefined)
2706  vfloat16 () { }
2707 
2708  /// Construct from a single value (store it in all slots)
2709  vfloat16 (float a) { load(a); }
2710 
2711  /// Construct from 16 values
2712  vfloat16 (float v0, float v1, float v2, float v3,
2713  float v4, float v5, float v6, float v7,
2714  float v8, float v9, float v10, float v11,
2715  float v12, float v13, float v14, float v15);
2716 
2717  /// Construct from a pointer to 16 values
2718  vfloat16 (const float *f) { load (f); }
2719 
2720  /// Copy construct from another vfloat16
2721  vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }
2722 
2723  /// Construct from an int vector (promoting all components to float)
2724  explicit vfloat16 (const vint16& ival);
2725 
2726  /// Construct from two vfloat8's
2727  vfloat16 (const vfloat8 &lo, const vfloat8 &hi);
2728 
2729  /// Construct from four vfloat4's
2730  vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d);
2731 
2732  /// Construct from the underlying SIMD type
2733  vfloat16 (const simd_t& m) : m_simd(m) { }
2734 
2735  /// Return the raw SIMD type
2736  operator simd_t () const { return m_simd; }
2737  simd_t simd () const { return m_simd; }
2738 
2739  /// Return a pointer to the underlying scalar type
2740  const value_t* data () const { return (const value_t*)this; }
2741  value_t* data () { return (value_t*)this; }
2742 
2743  /// Construct from a pointer to unsigned short values
2744  explicit vfloat16 (const unsigned short *vals) { load(vals); }
2745 
2746  /// Construct from a pointer to short values
2747  explicit vfloat16 (const short *vals) { load(vals); }
2748 
2749  /// Construct from a pointer to unsigned char values
2750  explicit vfloat16 (const unsigned char *vals) { load(vals); }
2751 
2752  /// Construct from a pointer to char values
2753  explicit vfloat16 (const char *vals) { load(vals); }
2754 
2755 #ifdef _HALF_H_
2756  /// Construct from a pointer to half (16 bit float) values
2757  explicit vfloat16 (const half *vals) { load(vals); }
2758 #endif
2759 
2760  /// Assign a single value to all components
2761  const vfloat16& operator= (float a) { load(a); return *this; }
2762 
2763  /// Assign a vfloat16
2764  const vfloat16& operator= (vfloat16 other) {
2765  m_simd = other.m_simd;
2766  return *this;
2767  }
2768 
2769  /// Return a vfloat16 with all components set to 0.0
2770  static const vfloat16 Zero ();
2771 
2772  /// Return a vfloat16 with all components set to 1.0
2773  static const vfloat16 One ();
2774 
2775  /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...)
2776  /// Optional argument can give a non-zero starting point and non-1 step.
2777  static const vfloat16 Iota (float start=0.0f, float step=1.0f);
2778 
2779  /// Set all components to 0.0
2780  void clear ();
2781 
2782  /// Component access (get)
2783  float operator[] (int i) const;
2784  /// Component access (set)
2785  float& operator[] (int i);
2786 
2787  /// Component access (set).
2788  void setcomp (int i, float value);
2789 
2790  value_t x () const;
2791  value_t y () const;
2792  value_t z () const;
2793  value_t w () const;
2794  void set_x (value_t val);
2795  void set_y (value_t val);
2796  void set_z (value_t val);
2797  void set_w (value_t val);
2798 
2799  /// Extract the lower precision vfloat8
2800  vfloat8 lo () const;
2801 
2802  /// Extract the higher precision vfloat8
2803  vfloat8 hi () const;
2804 
2805  /// Helper: load a single value into all components
2806  void load (float val);
2807 
2808  /// Load separate values into each component.
2809  void load (float v0, float v1, float v2, float v3,
2810  float v4, float v5, float v6, float v7,
2811  float v8, float v9, float v10, float v11,
2812  float v12, float v13, float v14, float v15);
2813 
2814  /// Load from an array of values
2815  void load (const float *values);
2816 
2817  /// Load from a partial array of <=16 values. Unassigned values are
2818  /// undefined.
2819  void load (const float *values, int n);
2820 
2821  /// Load from an array of 16 unsigned short values, convert to float
2822  void load (const unsigned short *values);
2823 
2824  /// Load from an array of 16 short values, convert to float
2825  void load (const short *values);
2826 
2827  /// Load from an array of 16 unsigned char values, convert to float
2828  void load (const unsigned char *values);
2829 
2830  /// Load from an array of 16 char values, convert to float
2831  void load (const char *values);
2832 
2833 #ifdef _HALF_H_
2834  /// Load from an array of 16 half values, convert to float
2835  void load (const half *values);
2836 #endif /* _HALF_H_ */
2837 
2838  void store (float *values) const;
2839 
2840  /// Store the first n values into memory
2841  void store (float *values, int n) const;
2842 
2843 #ifdef _HALF_H_
2844  void store (half *values) const;
2845 #endif
2846 
2847  /// Masked load -- read from values[] where mask is 1, load zero where
2848  /// mask is 0.
2849  void load_mask (const vbool_t &mask, const value_t *values);
2850  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
2851 
2852  /// Masked store -- write to values[] where mask is enabled, don't
2853  /// touch values[] where it's not.
2854  void store_mask (const vbool_t &mask, value_t *values) const;
2855  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
2856 
2857  /// Load values from addresses (char*)baseptr + vindex[i]*scale
2858  template<int scale=4>
2859  void gather (const value_t *baseptr, const vint_t& vindex);
2860  /// Gather elements defined by the mask, leave others unchanged.
2861  template<int scale=4>
2862  void gather_mask (const bool_t& mask, const value_t *baseptr, const vint_t& vindex);
2863  template<int scale=4>
2864  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
2865  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
2866  }
2867 
2868  /// Store values at addresses (char*)baseptr + vindex[i]*scale
2869  template<int scale=4>
2870  void scatter (value_t *baseptr, const vint_t& vindex) const;
2871  /// Scatter elements defined by the mask
2872  template<int scale=4>
2873  void scatter_mask (const bool_t& mask, value_t *baseptr, const vint_t& vindex) const;
2874  template<int scale=4>
2875  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
2876  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
2877  }
2878 
2879  // Arithmetic operators (component-by-component)
2880  friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b);
2881  friend vfloat16 operator- (const vfloat16& a);
2882  friend vfloat16 operator- (const vfloat16& a, const vfloat16& b);
2883  friend vfloat16 operator* (const vfloat16& a, const vfloat16& b);
2884  friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b);
2885  friend vfloat16 operator% (const vfloat16& a, const vfloat16& b);
2886  friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b);
2887  friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b);
2888  friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b);
2889  friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b);
2890 
2891  // Comparison operations
2892  friend vbool16 operator== (const vfloat16& a, const vfloat16& b);
2893  friend vbool16 operator!= (const vfloat16& a, const vfloat16& b);
2894  friend vbool16 operator< (const vfloat16& a, const vfloat16& b);
2895  friend vbool16 operator> (const vfloat16& a, const vfloat16& b);
2896  friend vbool16 operator>= (const vfloat16& a, const vfloat16& b);
2897  friend vbool16 operator<= (const vfloat16& a, const vfloat16& b);
2898 
2899  // Some oddball items that are handy
2900 
2901  /// Stream output
2902  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val);
2903 
2904 protected:
2905  // The actual data representation
2906  union {
2910  };
2911 };
2912 
2913 
2914 /// Shuffle groups of 4
2915 template<int i0, int i1, int i2, int i3>
2916 vfloat16 shuffle4 (const vfloat16& a);
2917 
2918 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
2919 template<int i> vfloat16 shuffle4 (const vfloat16& a);
2920 
2921 /// Shuffle within each group of 4
2922 template<int i0, int i1, int i2, int i3>
2923 vfloat16 shuffle (const vfloat16& a);
2924 
2925 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2926 template<int i> vfloat16 shuffle (const vfloat16& a);
2927 
2928 /// Helper: as rapid as possible extraction of one component, when the
2929 /// index is fixed.
2930 template<int i> float extract (const vfloat16& a);
2931 
2932 /// Helper: substitute val for a[i]
2933 template<int i> vfloat16 insert (const vfloat16& a, float val);
2934 
2935 /// The sum of all components, returned in all components.
2936 vfloat16 vreduce_add (const vfloat16& v);
2937 
2938 /// The sum of all components, returned as a scalar.
2939 float reduce_add (const vfloat16& v);
2940 
2941 /// Use a bool mask to select between components of a (if mask[i] is false)
2942 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2943 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool16& mask);
2944 
2945 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 (if
2946 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2947 /// blend(0,a,mask).
2948 vfloat16 blend0 (const vfloat16& a, const vbool16& mask);
2949 
2950 /// Use a bool mask to select between components of a (if mask[i] is false)
2951 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2952 /// blend(0,a,!mask), or blend(a,0,mask).
2953 vfloat16 blend0not (const vfloat16& a, const vbool16& mask);
2954 
2955 /// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor
2956 /// that is 0, return 0 rather than Inf.
2957 vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b);
2958 
2959 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2960 /// synonym for blend with arguments rearranged, but this is more clear
2961 /// because the arguments are symmetric to scalar (cond ? a : b).
2962 vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);
2963 
2964 // Per-element math
2965 vfloat16 abs (const vfloat16& a); ///< absolute value (float)
2966 vfloat16 sign (const vfloat16& a); ///< 1.0 when value >= 0, -1 when negative
2967 vfloat16 ceil (const vfloat16& a);
2968 vfloat16 floor (const vfloat16& a);
2969 vint16 ifloor (const vfloat16& a); ///< (int)floor
2970 inline vint16 floori (const vfloat16& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2971 
2972 /// Per-element round to nearest integer (rounding away from 0 in cases
2973 /// that are exactly half way).
2974 vfloat16 round (const vfloat16& a);
2975 
2976 /// Per-element round to nearest integer (rounding away from 0 in cases
2977 /// that are exactly half way).
2978 vint16 rint (const vfloat16& a);
2979 
2980 vfloat16 rcp_fast (const vfloat16 &a); ///< Fast, approximate 1/a
2981 vfloat16 sqrt (const vfloat16 &a);
2982 vfloat16 rsqrt (const vfloat16 &a); ///< Fully accurate 1/sqrt
2983 vfloat16 rsqrt_fast (const vfloat16 &a); ///< Fast, approximate 1/sqrt
2984 vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
2985 vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
2986 // vfloat16 exp (const vfloat16& v); // See template with vfloat4
2987 // vfloat16 log (const vfloat16& v); // See template with vfloat4
2988 
2989 /// andnot(a,b) returns ((~a) & b)
2990 vfloat16 andnot (const vfloat16& a, const vfloat16& b);
2991 
2992 // Fused multiply and add (or subtract):
2993 vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b + c
2994 vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // a*b - c
2995 vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
2996 vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c
2997 
2998 
2999 
3000 // Odds and ends, other CPU hardware tricks
3001 
3002 // Request the x86 flush_zero_mode CPU flag: `on` selects _MM_FLUSH_ZERO_ON,
3003 // otherwise _MM_FLUSH_ZERO_OFF. Returns true when the request was issued,
3004 // false when the build is not x86 or OIIO_GNUC_VERSION is nonzero but <= 40900.
3005 inline bool set_flush_zero_mode (bool on) {
3006 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3007  _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
3008  return true;
3009 #endif
3010  return false;  // only reached when the guarded branch above is compiled out
3011 }
3012 
3013 // Request the x86 denorms_zero_mode CPU flag: `on` selects _MM_DENORMALS_ZERO_ON,
3014 // otherwise _MM_DENORMALS_ZERO_OFF. Returns true when the request was issued,
3015 // false when the build is not x86 or OIIO_GNUC_VERSION is nonzero but <= 40900.
3016 inline bool set_denorms_zero_mode (bool on) {
3017 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3018  _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
3019  return true;
3020 #endif
3021  return false;  // only reached when the guarded branch above is compiled out
3022 }
3023 
3024 // Query the x86 flush_zero_mode CPU flag; reports false when the guard below compiles the query out.
3025 inline bool get_flush_zero_mode () {
3026 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3027  return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
3028 #endif
3029  return false;  // fallback: mode cannot be queried in this build
3030 }
3031 
3032 // Query the x86 denorms_zero_mode CPU flag; reports false when the guard below compiles the query out.
3033 inline bool get_denorms_zero_mode () {
3034 #if (defined(__x86_64__) || defined(__i386__)) && (OIIO_GNUC_VERSION == 0 || OIIO_GNUC_VERSION > 40900)
3035  return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
3036 #endif
3037  return false;  // fallback: mode cannot be queried in this build
3038 }
3039 
3040 
3041 
3042 
3043 
3044 
3045 //////////////////////////////////////////////////////////////////////////
3046 //////////////////////////////////////////////////////////////////////////
3047 //
3048 // Gory implementation details follow.
3049 //
3050 // ^^^ All declarations and documentation are above ^^^
3051 //
3052 // vvv Below is the implementation, often considerably cluttered with
3053 // #if's for each architecture, and unapologetic use of intrinsics and
3054 // every manner of dirty trick we can think of to make things fast.
3055 // Some of this isn't pretty. We won't recapitulate comments or
3056 // documentation of what the functions are supposed to do, please
3057 // consult the declarations above for that.
3058 //
3059 // Here be dragons.
3060 //
3061 //////////////////////////////////////////////////////////////////////////
3062 //////////////////////////////////////////////////////////////////////////
3063 
3064 
3065 
3066 //////////////////////////////////////////////////////////////////////
3067 // vbool4 implementation
3068 
3069 
3071  DASSERT(i >= 0 && i < elements);
3072 #if OIIO_SIMD_SSE
3073  return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3074 #else
3075  return m_val[i];
3076 #endif
3077 }
3078 
3080  DASSERT(i >= 0 && i < elements);
3081  return m_val[i];
3082 }
3083 
3084 
3086  DASSERT(i >= 0 && i < elements);
3087  m_val[i] = value ? -1 : 0;
3088 }
3089 
3090 
3091 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) {  // stream every element of `a`
3092  cout << a[0];
3093  for (int i = 1; i < a.elements; ++i)
3094  cout << ' ' << a[i];  // a single space separates consecutive elements
3095  return cout;  // hand the stream back so insertions can be chained
3096 }
3097 
3098 
3100 #if OIIO_SIMD_SSE
3101  m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a)));
3102 #else
3103  int val = -int(a);
3104  SIMD_CONSTRUCT (val);
3105 #endif
3106 }
3107 
3108 
3109 OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
3110 #if OIIO_SIMD_SSE
3111  // N.B. -- _mm_set_epi32 is given d,c,b,a (reversed) so that, per our
3112  // convention, a,b,c,d end up stored in that same order in memory.
3113  m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
3114 #else
3115  m_val[0] = -int(a);  // -int(true) is -1, -int(false) is 0
3116  m_val[1] = -int(b);
3117  m_val[2] = -int(c);
3118  m_val[3] = -int(d);
3119 #endif
3120 }
3121 
3123  load (a[0], a[1], a[2], a[3]);
3124 }
3125 
3127  m_simd = other.m_simd;
3128  return *this;
3129 }
3130 
3131 
3133 #if OIIO_SIMD_SSE
3134  return _mm_movemask_ps(m_simd);
3135 #else
3136  int r = 0;
3137  for (int i = 0; i < elements; ++i)
3138  if (m_val[i])
3139  r |= 1<<i;
3140  return r;
3141 #endif
3142 }
3143 
3144 
3146 vbool4::from_bitmask (int bitmask) {
3147  // I think this is a fast conversion from int bitmask to vbool4
3148  return (vint4::Giota() & vint4(bitmask)) != vint4::Zero();
3149 }
3150 
3151 
3153 #if OIIO_SIMD_SSE
3154  m_simd = _mm_setzero_ps();
3155 #else
3156  *this = false;
3157 #endif
3158 }
3159 
3160 
3162 #if OIIO_SIMD_SSE
3163  return _mm_setzero_ps();
3164 #else
3165  return false;
3166 #endif
3167 }
3168 
3170  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3171 #if OIIO_SIMD_SSE
3172 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3173  __m128i anyval = _mm_undefined_si128();
3174 # else
3175  __m128i anyval = _mm_setzero_si128();
3176 # endif
3177  return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3178 #else
3179  return true;
3180 #endif
3181 }
3182 
3184  SIMD_DO (values[i] = m_val[i] ? true : false);
3185 }
3186 
3187 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const {  // write the first n elements to values[]
3188  DASSERT (n >= 0 && n <= elements);
3189  for (int i = 0; i < n; ++i)
3190  values[i] = m_val[i] ? true : false;  // any nonzero stored value is written as true
3191 }
3192 
3193 
3194 
3196 #if OIIO_SIMD_SSE
3197  return _mm_xor_ps (a.simd(), vbool4::True());
3198 #else
3199  SIMD_RETURN (vbool4, a[i] ^ (-1));
3200 #endif
3201 }
3202 
3204 #if OIIO_SIMD_SSE
3205  return _mm_and_ps (a.simd(), b.simd());
3206 #else
3207  SIMD_RETURN (vbool4, a[i] & b[i]);
3208 #endif
3209 }
3210 
3212 #if OIIO_SIMD_SSE
3213  return _mm_or_ps (a.simd(), b.simd());
3214 #else
3215  SIMD_RETURN (vbool4, a[i] | b[i]);
3216 #endif
3217 }
3218 
3220 #if OIIO_SIMD_SSE
3221  return _mm_xor_ps (a.simd(), b.simd());
3222 #else
3223  SIMD_RETURN (vbool4, a[i] ^ b[i]);
3224 #endif
3225 }
3226 
3227 
3229  return a = a & b;
3230 }
3231 
3233  return a = a | b;
3234 }
3235 
3237  return a = a ^ b;
3238 }
3239 
3241 #if OIIO_SIMD_SSE
3242  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3243  return _mm_xor_ps (a.simd(), vbool4::True());
3244 #else
3245  SIMD_RETURN (vbool4, ~a[i]);
3246 #endif
3247 }
3248 
3250 #if OIIO_SIMD_SSE
3251  return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3252 #else
3253  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
3254 #endif
3255 }
3256 
3258 #if OIIO_SIMD_SSE
3259  return _mm_xor_ps (a, b);
3260 #else
3261  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
3262 #endif
3263 }
3264 
3265 
3266 
3267 
3268 #if OIIO_SIMD_SSE
3269 // Shuffling. Use like this: x = shuffle<3,2,1,0>(b)
3270 template<int i0, int i1, int i2, int i3>
3271 OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) {
3272  return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
3273 }
3274 #endif
3275 
3276 #if OIIO_SIMD_SSE >= 3
3277 // SSE3 has intrinsics for a few special cases
3278 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) {
3279  return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a)));
3280 }
3281 template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) {
3282  return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a)));
3283 }
3284 template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) {
3285  return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(a)));
3286 }
3287 #endif
3288 
3289 #if OIIO_SIMD_SSE
3290 template<int i0, int i1, int i2, int i3>
3291 OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) {
3292  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
3293 }
3294 #endif
3295 
3296 #if OIIO_SIMD_SSE >= 3
3297 // SSE3 has intrinsics for a few special cases
3298 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 0, 2, 2> (__m128 a) {
3299  return _mm_moveldup_ps(a);
3300 }
3301 template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) {
3302  return _mm_movehdup_ps(a);
3303 }
3304 template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) {
3305  return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3306 }
3307 #endif
3308 
3309 
3310 /// Helper: shuffle/swizzle with constant (templated) indices.
3311 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
3312 template<int i0, int i1, int i2, int i3>
3314 #if OIIO_SIMD_SSE
3315  return shuffle_sse<i0,i1,i2,i3> (a.simd());
3316 #else
3317  return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3318 #endif
3319 }
3320 
3321 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
3322 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
3323  return shuffle<i,i,i,i>(a);
3324 }
3325 
3326 
3327 /// Helper: as rapid as possible extraction of one component, when the
3328 /// index is fixed.
3329 template<int i>
3331 #if OIIO_SIMD_SSE >= 4
3332  return _mm_extract_epi32(_mm_castps_si128(a.simd()), i); // SSE4.1 only
3333 #else
3334  return a[i];
3335 #endif
3336 }
3337 
3338 /// Helper: substitute val for a[i]
3339 template<int i>
3341 #if OIIO_SIMD_SSE >= 4
3342  int ival = -int(val);
3343  return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3344 #else
3345  vbool4 tmp = a;
3346  tmp[i] = -int(val);
3347  return tmp;
3348 #endif
3349 }
3350 
3352 #if OIIO_SIMD_AVX
3353  return _mm_testc_ps (v, vbool4(true)) != 0;
3354 #elif OIIO_SIMD_SSE
3355  return _mm_movemask_ps(v.simd()) == 0xf;
3356 #else
3357  SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0));
3358 #endif
3359 }
3360 
3362 #if OIIO_SIMD_AVX
3363  return ! _mm_testz_ps (v, v);
3364 #elif OIIO_SIMD_SSE
3365  return _mm_movemask_ps(v) != 0;
3366 #else
3367  SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0));
3368 #endif
3369 }
3370 
3371 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; }
3372 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; }
3373 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; }
3374 
3375 
3376 
3377 //////////////////////////////////////////////////////////////////////
3378 // vbool8 implementation
3379 
3380 
3382  DASSERT(i >= 0 && i < elements);
3383 #if OIIO_SIMD_AVX
3384  return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3385 #else
3386  return m_val[i];
3387 #endif
3388 }
3389 
3391  DASSERT(i >= 0 && i < elements);
3392  m_val[i] = value ? -1 : 0;
3393 }
3394 
3396  DASSERT(i >= 0 && i < elements);
3397  return m_val[i];
3398 }
3399 
3400 
3401 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) {
3402  cout << a[0];
3403  for (int i = 1; i < a.elements; ++i)
3404  cout << ' ' << a[i];
3405  return cout;
3406 }
3407 
3408 
3410 #if OIIO_SIMD_AVX
3411  m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a)));
3412 #else
3413  int val = -int(a);
3414  SIMD_CONSTRUCT (val);
3415 #endif
3416 }
3417 
3418 
3419 OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d,
3420  bool e, bool f, bool g, bool h) {
3421 #if OIIO_SIMD_AVX
3422  // N.B. -- we need to reverse the order because of our convention
3423  // of storing a,b,c,d in the same order in memory.
3424  m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e),
3425  -int(d), -int(c), -int(b), -int(a)));
3426 #else
3427  m_val[0] = -int(a);
3428  m_val[1] = -int(b);
3429  m_val[2] = -int(c);
3430  m_val[3] = -int(d);
3431  m_val[4] = -int(e);
3432  m_val[5] = -int(f);
3433  m_val[6] = -int(g);
3434  m_val[7] = -int(h);
3435 #endif
3436 }
3437 
3438 OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d,
3439  bool e, bool f, bool g, bool h) {
3440  load (a, b, c, d, e, f, g, h);
3441 }
3442 
3444  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3445 }
3446 
3447 
3449  load(a);
3450  return *this;
3451 }
3452 
3454  m_simd = other.m_simd;
3455  return *this;
3456 }
3457 
3459 #if OIIO_SIMD_AVX
3460  return _mm256_movemask_ps(m_simd);
3461 #else
3462  return lo().bitmask() | (hi().bitmask() << 4);
3463 #endif
3464 }
3465 
3466 
3468 vbool8::from_bitmask (int bitmask) {
3469  // I think this is a fast conversion from int bitmask to vbool8
3470  return (vint8::Giota() & vint8(bitmask)) != vint8::Zero();
3471 }
3472 
3473 
3475 #if OIIO_SIMD_AVX
3476  m_simd = _mm256_setzero_ps();
3477 #else
3478  *this = false;
3479 #endif
3480 }
3481 
3483 #if OIIO_SIMD_AVX
3484  return _mm256_setzero_ps();
3485 #else
3486  return false;
3487 #endif
3488 }
3489 
3490 
3492 #if OIIO_SIMD_AVX
3493 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3494  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3495  __m256i anyval = _mm256_undefined_si256();
3496  return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3497 # else
3498  return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3499 # endif
3500 #else
3501  return true;
3502 #endif
3503 }
3504 
3505 
3507  SIMD_DO (values[i] = m_val[i] ? true : false);
3508 }
3509 
3510 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const {
3511  DASSERT (n >= 0 && n <= elements);
3512  for (int i = 0; i < n; ++i)
3513  values[i] = m_val[i] ? true : false;
3514 }
3515 
3516 
3518 #if OIIO_SIMD_AVX
3519  return _mm256_castps256_ps128 (simd());
3520 #else
3521  return m_4[0];
3522 #endif
3523 }
3524 
3526 #if OIIO_SIMD_AVX
3527  return _mm256_extractf128_ps (simd(), 1);
3528 #else
3529  return m_4[1];
3530 #endif
3531 }
3532 
3533 
3535 #if OIIO_SIMD_AVX
3536  __m256 r = _mm256_castps128_ps256 (lo);
3537  m_simd = _mm256_insertf128_ps (r, hi, 1);
3538  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
3539 #else
3540  m_4[0] = lo;
3541  m_4[1] = hi;
3542 #endif
3543 }
3544 
3545 
3547 #if OIIO_SIMD_AVX
3548  return _mm256_xor_ps (a.simd(), vbool8::True());
3549 #else
3550  SIMD_RETURN (vbool8, a[i] ^ (-1));
3551 #endif
3552 }
3553 
3555 #if OIIO_SIMD_AVX
3556  return _mm256_and_ps (a.simd(), b.simd());
3557 #else
3558  SIMD_RETURN (vbool8, a[i] & b[i]);
3559 #endif
3560 }
3561 
3563 #if OIIO_SIMD_AVX
3564  return _mm256_or_ps (a.simd(), b.simd());
3565 #else
3566  SIMD_RETURN (vbool8, a[i] | b[i]);
3567 #endif
3568 }
3569 
3571 #if OIIO_SIMD_AVX
3572  return _mm256_xor_ps (a.simd(), b.simd());
3573 #else
3574  SIMD_RETURN (vbool8, a[i] ^ b[i]);
3575 #endif
3576 }
3577 
3578 
3580  return a = a & b;
3581 }
3582 
3584  return a = a | b;
3585 }
3586 
3588  return a = a ^ b;
3589 }
3590 
3591 
3593 #if OIIO_SIMD_AVX
3594  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3595  return _mm256_xor_ps (a.simd(), vbool8::True());
3596 #else
3597  SIMD_RETURN (vbool8, ~a[i]);
3598 #endif
3599 }
3600 
3601 
3603 #if OIIO_SIMD_AVX >= 2
3604  return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3605 #elif OIIO_SIMD_AVX
3606  return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3607 #else
3608  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
3609 #endif
3610 }
3611 
3613 #if OIIO_SIMD_AVX
3614  return _mm256_xor_ps (a, b);
3615 #else
3616  SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0);
3617 #endif
3618 }
3619 
3620 
3621 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
3623 #if OIIO_SIMD_AVX >= 2
3624  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3625  return _mm256_permutevar8x32_ps (a.simd(), index.simd());
3626 #else
3627  return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3628 #endif
3629 }
3630 
3631 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
3632  return shuffle<i,i,i,i,i,i,i,i>(a);
3633 }
3634 
3635 
3636 template<int i>
3638 #if OIIO_SIMD_AVX && !_WIN32
3639  return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i); // SSE4.1 only
3640 #else
3641  return a[i];
3642 #endif
3643 }
3644 
3645 template<int i>
3647 #if OIIO_SIMD_AVX && !_WIN32
3648  int ival = -int(val);
3649  return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i));
3650 #else
3651  vbool8 tmp = a;
3652  tmp[i] = -int(val);
3653  return tmp;
3654 #endif
3655 }
3656 
3657 
3659 #if OIIO_SIMD_AVX
3660  return _mm256_testc_ps (v, vbool8(true)) != 0;
3661  // return _mm256_movemask_ps(v.simd()) == 0xff;
3662 #else
3663  SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i]));
3664 #endif
3665 }
3666 
3668 #if OIIO_SIMD_AVX
3669  return ! _mm256_testz_ps (v, v); // FIXME? Not in all immintrin.h !
3670  // return _mm256_movemask_ps(v) != 0;
3671 #else
3672  SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i]));
3673 #endif
3674 }
3675 
3676 
3677 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; }
3678 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; }
3679 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; }
3680 
3681 
3682 
3683 //////////////////////////////////////////////////////////////////////
3684 // vbool16 implementation
3685 
3686 
3688  DASSERT(i >= 0 && i < elements);
3689 #if OIIO_SIMD_AVX >= 512
3690  return (int(m_simd) >> i) & 1;
3691 #else
3692  return (m_bits >> i) & 1;
3693 #endif
3694 }
3695 
3697  DASSERT(i >= 0 && i < elements);
3698  int bits = m_bits;
3699  bits &= (0xffff ^ (1<<i));
3700  bits |= (int(value)<<i);
3701  m_bits = bits;
3702 }
3703 
3704 
3705 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) {
3706  cout << a[0];
3707  for (int i = 1; i < a.elements; ++i)
3708  cout << ' ' << a[i];
3709  return cout;
3710 }
3711 
3712 
3714  m_simd = a ? 0xffff : 0;
3715 }
3716 
3717 
3719  m_simd = simd_t(a);
3720 }
3721 
3722 
3723 OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3,
3724  bool v4, bool v5, bool v6, bool v7,
3725  bool v8, bool v9, bool v10, bool v11,
3726  bool v12, bool v13, bool v14, bool v15) {
3727  m_simd = simd_t((int(v0) << 0) |
3728  (int(v1) << 1) |
3729  (int(v2) << 2) |
3730  (int(v3) << 3) |
3731  (int(v4) << 4) |
3732  (int(v5) << 5) |
3733  (int(v6) << 6) |
3734  (int(v7) << 7) |
3735  (int(v8) << 8) |
3736  (int(v9) << 9) |
3737  (int(v10) << 10) |
3738  (int(v11) << 11) |
3739  (int(v12) << 12) |
3740  (int(v13) << 13) |
3741  (int(v14) << 14) |
3742  (int(v15) << 15));
3743 }
3744 
3745 OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3,
3746  bool v4, bool v5, bool v6, bool v7,
3747  bool v8, bool v9, bool v10, bool v11,
3748  bool v12, bool v13, bool v14, bool v15) {
3749  load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3750 }
3751 
3753  load_bitmask (a.bitmask() | (b.bitmask() << 8));
3754 }
3755 
3757  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3758  a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3759 }
3760 
3761 
3763  load(a);
3764  return *this;
3765 }
3766 
3768  m_simd = other.m_simd;
3769  return *this;
3770 }
3771 
3772 
3774 #if OIIO_SIMD_AVX >= 512
3775  return int(m_simd);
3776 #else
3777  return int(m_bits);
3778 #endif
3779 }
3780 
3781 
3783  m_simd = simd_t(0);
3784 }
3785 
3787  return simd_t(0);
3788 }
3789 
3790 
3792  return simd_t(0xffff);
3793 }
3794 
3795 
3797  SIMD_DO (values[i] = m_bits & (1<<i));
3798 }
3799 
3800 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const {
3801  DASSERT (n >= 0 && n <= elements);
3802  for (int i = 0; i < n; ++i)
3803  values[i] = m_bits & (1<<i);
3804 }
3805 
3806 
3807 
3809 #if OIIO_SIMD_AVX >= 512
3810  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1));
3811 #else
3812  SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0);
3813 #endif
3814 }
3815 
3817 #if OIIO_SIMD_AVX >= 512
3818  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1));
3819 #else
3820  SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0);
3821 #endif
3822 }
3823 
3824 
3826 #if OIIO_SIMD_AVX >= 512
3827  return _mm512_knot (a.simd());
3828 #else
3829  return vbool16 (a.m_bits ^ 0xffff);
3830 #endif
3831 }
3832 
3834 #if OIIO_SIMD_AVX >= 512
3835  return _mm512_kand (a.simd(), b.simd());
3836 #else
3837  return vbool16 (a.m_bits & b.m_bits);
3838 #endif
3839 }
3840 
3842 #if OIIO_SIMD_AVX >= 512
3843  return _mm512_kor (a.simd(), b.simd());
3844 #else
3845  return vbool16 (a.m_bits | b.m_bits);
3846 #endif
3847 }
3848 
3850 #if OIIO_SIMD_AVX >= 512
3851  return _mm512_kxor (a.simd(), b.simd());
3852 #else
3853  return vbool16 (a.m_bits ^ b.m_bits);
3854 #endif
3855 }
3856 
3857 
3859  return a = a & b;
3860 }
3861 
3863  return a = a | b;
3864 }
3865 
3867  return a = a ^ b;
3868 }
3869 
3870 
3872  return a ^ vbool16::True();
3873 }
3874 
3875 
3877 #if OIIO_SIMD_AVX >= 512
3878  return _mm512_kxnor (a.simd(), b.simd());
3879 #else
3880  return vbool16 (!(a.m_bits ^ a.m_bits));
3881 #endif
3882 }
3883 
3885 #if OIIO_SIMD_AVX >= 512
3886  return _mm512_kxor (a.simd(), b.simd());
3887 #else
3888  return vbool16 (a.m_bits ^ a.m_bits);
3889 #endif
3890 }
3891 
3892 
3893 template<int i>
3895  return a[i];
3896 }
3897 
3898 template<int i>
3900  vbool16 tmp = a;
3901  tmp.setcomp (i, val);
3902  return tmp;
3903 }
3904 
3905 
3907  return v.bitmask() == 0xffff;
3908 }
3909 
3911  return v.bitmask() != 0;
3912 }
3913 
3914 
3915 OIIO_FORCEINLINE bool all (const vbool16& v) { return reduce_and(v) == true; }
3916 OIIO_FORCEINLINE bool any (const vbool16& v) { return reduce_or(v) == true; }
3917 OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; }
3918 
3919 
3920 
3921 
3922 
3923 
3924 //////////////////////////////////////////////////////////////////////
3925 // vint4 implementation
3926 
3928  m_simd = other.m_simd;
3929  return *this;
3930 }
3931 
3933  DASSERT(i<elements);
3934  return m_val[i];
3935 }
3936 
3938  DASSERT(i<elements);
3939  return m_val[i];
3940 }
3941 
3943  DASSERT(i<elements);
3944  m_val[i] = val;
3945 }
3946 
3947 
3949 #if OIIO_SIMD_SSE
3950  m_simd = _mm_set1_epi32 (a);
3951 #else
3952  SIMD_CONSTRUCT (a);
3953 #endif
3954 }
3955 
3956 
3957 
3958 OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d) {
3959 #if OIIO_SIMD_SSE
3960  m_simd = _mm_set_epi32 (d, c, b, a);
3961 #else
3962  m_val[0] = a;
3963  m_val[1] = b;
3964  m_val[2] = c;
3965  m_val[3] = d;
3966 #endif
3967 }
3968 
3969 
3970 // OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d,
3971 // int e, int f, int g, int h) {
3972 // load (a, b, c, d);
3973 // }
3974 
3975 
3976 
3978 #if OIIO_SIMD_SSE
3979  m_simd = _mm_loadu_si128 ((const simd_t *)values);
3980 #else
3981  SIMD_CONSTRUCT (values[i]);
3982 #endif
3983 }
3984 
3985 
3986 OIIO_FORCEINLINE void vint4::load (const int *values, int n)
3987 {
3988  DASSERT (n >= 0 && n <= elements);
3989 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
3990  m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values);
3991 #elif OIIO_SIMD_SSE
3992  switch (n) {
3993  case 1:
3994  m_simd = _mm_castps_si128 (_mm_load_ss ((const float *)values));
3995  break;
3996  case 2:
3997  // Trickery: load one double worth of bits!
3998  m_simd = _mm_castpd_si128 (_mm_load_sd ((const double*)values));
3999  break;
4000  case 3:
4001  // Trickery: load one double worth of bits, then a float,
4002  // and combine, casting to ints.
4003  m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((const double*)values)),
4004  _mm_load_ss ((const float *)values + 2)));
4005  break;
4006  case 4:
4007  m_simd = _mm_loadu_si128 ((const simd_t *)values);
4008  break;
4009  default:
4010  clear ();
4011  break;
4012  }
4013 #else
4014  for (int i = 0; i < n; ++i)
4015  m_val[i] = values[i];
4016  for (int i = n; i < elements; ++i)
4017  m_val[i] = 0;
4018 #endif
4019 }
4020 
4021 
4022 OIIO_FORCEINLINE void vint4::load (const unsigned short *values) {
4023 #if OIIO_SIMD_SSE >= 4
4024  // Trickery: load one double worth of bits = 4 ushorts!
4025  simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
4026  m_simd = _mm_cvtepu16_epi32 (a);
4027 #else
4028  SIMD_CONSTRUCT (values[i]);
4029 #endif
4030 }
4031 
4032 
4033 OIIO_FORCEINLINE void vint4::load (const short *values) {
4034 #if OIIO_SIMD_SSE >= 4
4035  // Trickery: load one double worth of bits = 4 shorts!
4036  simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
4037  m_simd = _mm_cvtepi16_epi32 (a);
4038 #else
4039  SIMD_CONSTRUCT (values[i]);
4040 #endif
4041 }
4042 
4043 
4044 OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {</