HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fmath.h
Go to the documentation of this file.
1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md
4 
5 /*
6  A few bits here are based upon code from NVIDIA that was also released
7  under the same 3-Clause BSD license, and marked as:
8  Copyright 2004 NVIDIA Corporation. All Rights Reserved.
9 
10  Some parts of this file were first open-sourced in Open Shading Language,
11  also 3-Clause BSD license, then later moved here. The original copyright
12  notice was:
13  Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
14 
15  Many of the math functions were copied from or inspired by other
16  public domain sources or open source packages with compatible licenses.
 17  The individual functions give references where applicable.
18 */
19 
20 
21 // clang-format off
22 
23 /// \file
24 ///
25 /// A variety of floating-point math helper routines (and, slight
26 /// misnomer, some int stuff as well).
27 ///
28 
29 
30 #pragma once
31 #define OIIO_FMATH_H 1
32 
33 #include <algorithm>
34 #include <cmath>
35 #include <cstring>
36 #include <limits>
37 #include <typeinfo>
38 #include <type_traits>
39 
40 #include <OpenImageIO/Imath.h>
41 #include <OpenImageIO/span.h>
42 #include <OpenImageIO/dassert.h>
44 #include <OpenImageIO/platform.h>
45 #include <OpenImageIO/simd.h>
46 
47 
49 
50 /// Occasionally there may be a tradeoff where the best/fastest
51 /// implementation of a math function in an ordinary scalar context does
52 /// things that prevent autovectorization (or OMP pragma vectorization) of a
53 /// loop containing that operation, and the only way to make it SIMD
54 /// friendly slows it down as a scalar op.
55 ///
56 /// By default, we always prefer doing it as correctly and quickly as
57 /// possible in scalar mode, but if OIIO_FMATH_SIMD_FRIENDLY is defined to
58 /// be nonzero, we prefer ensuring that the SIMD loops are as efficient as
59 /// possible, even at the expense of scalar performance.
60 ///
61 /// Because everything here consists of inline functions, and it doesn't
62 /// really matter for OIIO internals themselves, this is primarily intended
63 /// for the sake of 3rd party apps that happen to have OIIO as a dependency
64 /// and use the fmath.h functions. Such a downstream dependency just needs
65 /// to ensure that OIIO_FMATH_SIMD_FRIENDLY is defined to 1 prior to
66 /// including fmath.h.
67 ///
68 #ifndef OIIO_FMATH_SIMD_FRIENDLY
69 # define OIIO_FMATH_SIMD_FRIENDLY 0
70 #endif
71 
72 
73 // Helper template to let us tell if two types are the same.
74 // C++11 defines this, keep in OIIO namespace for back compat.
75 // DEPRECATED(2.0) -- clients should switch OIIO::is_same -> std::is_same.
76 using std::is_same;
77 
78 
79 // For back compatibility: expose these in the OIIO namespace.
80 // DEPRECATED(2.0) -- clients should switch OIIO:: -> std:: for these.
81 using std::isfinite;
82 using std::isinf;
83 using std::isnan;
84 
85 
86 // Define math constants just in case they aren't included (Windows is a
87 // little finicky about this, only defining these if _USE_MATH_DEFINES is
88 // defined before <cmath> is included, which is hard to control).
89 #ifndef M_PI
90 # define M_PI 3.14159265358979323846264338327950288
91 #endif
92 #ifndef M_PI_2
93 # define M_PI_2 1.57079632679489661923132169163975144
94 #endif
95 #ifndef M_PI_4
96 # define M_PI_4 0.785398163397448309615660845819875721
97 #endif
98 #ifndef M_TWO_PI
99 # define M_TWO_PI (M_PI * 2.0)
100 #endif
101 #ifndef M_1_PI
102 # define M_1_PI 0.318309886183790671537767526745028724
103 #endif
104 #ifndef M_2_PI
105 # define M_2_PI 0.636619772367581343075535053490057448
106 #endif
107 #ifndef M_SQRT2
108 # define M_SQRT2 1.41421356237309504880168872420969808
109 #endif
110 #ifndef M_SQRT1_2
111 # define M_SQRT1_2 0.707106781186547524400844362104849039
112 #endif
113 #ifndef M_LN2
114 # define M_LN2 0.69314718055994530941723212145817656
115 #endif
116 #ifndef M_LN10
117 # define M_LN10 2.30258509299404568401799145468436421
118 #endif
119 #ifndef M_E
120 # define M_E 2.71828182845904523536028747135266250
121 #endif
122 #ifndef M_LOG2E
123 # define M_LOG2E 1.44269504088896340735992468100189214
124 #endif
125 
126 
127 
128 ////////////////////////////////////////////////////////////////////////////
129 ////////////////////////////////////////////////////////////////////////////
130 //
131 // INTEGER HELPER FUNCTIONS
132 //
133 // A variety of handy functions that operate on integers.
134 //
135 
136 /// Quick test for whether an integer is a power of 2.
137 ///
138 template<typename T>
139 inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool
140 ispow2(T x) noexcept
141 {
142  // Numerous references for this bit trick are on the web. The
143  // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
144  // all 1 bits for bits < b.
145  return (x & (x - 1)) == 0 && (x >= 0);
146 }
147 
148 
149 
150 /// Round up to next higher power of 2 (return x if it's already a power
151 /// of 2).
152 inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
153 ceil2(int x) noexcept
154 {
155  // Here's a version with no loops.
156  if (x < 0)
157  return 0;
158  // Subtract 1, then round up to a power of 2, that way if we are
159  // already a power of 2, we end up with the same number.
160  --x;
161  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
162  x |= x >> 1;
163  x |= x >> 2;
164  x |= x >> 4;
165  x |= x >> 8;
166  x |= x >> 16;
167  // Now we have 2^n-1, by adding 1, we make it a power of 2 again
168  return x + 1;
169 }
170 
171 
172 
173 /// Round down to next lower power of 2 (return x if it's already a power
174 /// of 2).
175 inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
176 floor2(int x) noexcept
177 {
178  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
179  x |= x >> 1;
180  x |= x >> 2;
181  x |= x >> 4;
182  x |= x >> 8;
183  x |= x >> 16;
184  // Strip off all but the high bit, i.e. 00011111 -> 00010000
185  // That's the power of two <= the original x
186  return x & ~(x >> 1);
187 }
188 
189 
// Old names -- DEPRECATED(2.1)
// Thin forwarding wrappers kept only for back compatibility; new code
// should call ceil2()/floor2() directly.
inline OIIO_HOSTDEVICE int pow2roundup(int x) { return ceil2(x); }
inline OIIO_HOSTDEVICE int pow2rounddown(int x) { return floor2(x); }
193 
194 
195 
196 /// Round value up to the next whole multiple.
197 /// For example, round_to_multiple(7,10) returns 10.
198 template <typename V, typename M>
199 inline OIIO_HOSTDEVICE V round_to_multiple (V value, M multiple)
200 {
201  return V (((value + V(multiple) - 1) / V(multiple)) * V(multiple));
202 }
203 
204 
205 
206 /// Round up to the next whole multiple of m, for the special case where
207 /// m is definitely a power of 2 (somewhat simpler than the more general
208 /// round_to_multiple). This is a template that should work for any
209 /// integer type.
210 template<typename T>
211 inline OIIO_HOSTDEVICE T
213 {
214  OIIO_DASSERT(ispow2(m));
215  return (x + m - 1) & (~(m - 1));
216 }
217 
218 
219 
220 /// Multiply two unsigned 32-bit ints safely, carefully checking for
221 /// overflow, and clamping to uint32_t's maximum value.
222 inline OIIO_HOSTDEVICE uint32_t
223 clamped_mult32(uint32_t a, uint32_t b)
224 {
225  const uint32_t Err = std::numeric_limits<uint32_t>::max();
226  uint64_t r = (uint64_t)a * (uint64_t)b; // Multiply into a bigger int
227  return r < Err ? (uint32_t)r : Err;
228 }
229 
230 
231 
232 /// Multiply two unsigned 64-bit ints safely, carefully checking for
233 /// overflow, and clamping to uint64_t's maximum value.
234 inline OIIO_HOSTDEVICE uint64_t
235 clamped_mult64(uint64_t a, uint64_t b)
236 {
237  uint64_t ab = a * b;
238  if (b && ab / b != a)
240  else
241  return ab;
242 }
243 
244 
245 
246 /// Bitwise circular rotation left by `s` bits (for any unsigned integer
247 /// type). For info on the C++20 std::rotl(), see
248 /// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0553r4.html
249 // FIXME: this should be constexpr, but we're leaving that out for now
250 // because the Cuda specialization uses an intrinsic that isn't constexpr.
251 // Come back to this later when more of the Cuda library is properly
252 // constexpr.
253 template<class T>
255 // constexpr
256 T rotl(T x, int s) noexcept
257 {
259  "rotl only works for unsigned integer types");
260  return (x << s) | (x >> ((sizeof(T) * 8) - s));
261 }
262 
263 
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
// Cuda has an intrinsic for 32 bit unsigned int rotation
// FIXME: This should be constexpr, but __funnelshift_lc seems not to be
// marked as such.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE
// constexpr
uint32_t rotl(uint32_t x, int s) noexcept
{
    // A funnel shift of (x:x) left by s is exactly a 32-bit rotate left.
    return __funnelshift_lc(x, x, s);
}
#endif
276 
277 
278 
279 // Old names -- DEPRECATED(2.1)
281 rotl32(uint32_t x, int k)
282 {
283 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
284  return __funnelshift_lc(x, x, k);
285 #else
286  return (x << k) | (x >> (32 - k));
287 #endif
288 }
289 
291 rotl64(uint64_t x, int k)
292 {
293  return (x << k) | (x >> (64 - k));
294 }
295 
296 
297 
298 // safe_mod is like integer a%b, but safely returns 0 when b==0.
299 template <class T>
301 safe_mod(T a, T b)
302 {
303  return b ? (a % b) : T(0);
304 }
305 
306 // (end of integer helper functions)
307 ////////////////////////////////////////////////////////////////////////////
308 
309 
310 
311 ////////////////////////////////////////////////////////////////////////////
312 ////////////////////////////////////////////////////////////////////////////
313 //
314 // FLOAT UTILITY FUNCTIONS
315 //
316 // Helper/utility functions: clamps, blends, interpolations...
317 //
318 
319 
320 /// clamp a to bounds [low,high].
321 template <class T>
323 clamp (const T& a, const T& low, const T& high)
324 {
325 #if 1
326  // This looks clunky, but it generates minimal code. For float, it
327  // should result in just a max and min instruction, thats it.
328  // This implementation is courtesy of Alex Wells, Intel, via OSL.
329  T val = a;
330  if (!(low <= val)) // Forces clamp(NaN,low,high) to return low
331  val = low;
332  if (val > high)
333  val = high;
334  return val;
335 #else
336  // The naive implementation we originally had, below, only does the 2nd
337  // comparison in the else block, which will generate extra code in a
338  // SIMD masking scenario. I (LG) can confirm that benchmarks show that
339  // the above implementation is indeed faster than the one below, even
340  // for non-SIMD use.
341  return (a >= low) ? ((a <= high) ? a : high) : low;
342 #endif
343 }
344 
345 
346 #ifndef __CUDA_ARCH__
347 // Specialization of clamp for vfloat4
349 clamp (const simd::vfloat4& a, const simd::vfloat4& low, const simd::vfloat4& high)
350 {
351  return simd::min (high, simd::max (low, a));
352 }
353 
355 clamp (const simd::vfloat8& a, const simd::vfloat8& low, const simd::vfloat8& high)
356 {
357  return simd::min (high, simd::max (low, a));
358 }
359 
361 clamp (const simd::vfloat16& a, const simd::vfloat16& low, const simd::vfloat16& high)
362 {
363  return simd::min (high, simd::max (low, a));
364 }
365 
366 // Specialization of clamp for vint4
367 template<> OIIO_FORCEINLINE simd::vint4
368 clamp (const simd::vint4& a, const simd::vint4& low, const simd::vint4& high)
369 {
370  return simd::min (high, simd::max (low, a));
371 }
372 
373 template<> OIIO_FORCEINLINE simd::vint8
374 clamp (const simd::vint8& a, const simd::vint8& low, const simd::vint8& high)
375 {
376  return simd::min (high, simd::max (low, a));
377 }
378 
380 clamp (const simd::vint16& a, const simd::vint16& low, const simd::vint16& high)
381 {
382  return simd::min (high, simd::max (low, a));
383 }
384 #endif
385 
386 
387 
 388 // For the multiply+add (or sub) operations below, note that the results may
389 // differ slightly on different hardware, depending on whether true fused
390 // multiply and add is available or if the code generated just does an old
391 // fashioned multiply followed by a separate add. So please interpret these
392 // as "do a multiply and add as fast as possible for this hardware, with at
393 // least as much precision as a multiply followed by a separate add."
394 
/// Fused multiply and add: (a*b + c)
/// Kept as a single expression so the compiler is free to contract the
/// multiply and add into one FMA instruction where the target supports it.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd (float a, float b, float c) {
    // NOTE: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang will do so if -ffp-contract=fast.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b + c;
}
402 
403 
/// Fused multiply and subtract: (a*b - c)
/// Single-expression form so the compiler may emit a fused instruction.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub (float a, float b, float c) {
    // Permit clang to contract the multiply and subtract into one op.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b - c;
}
409 
410 
411 
/// Fused negative multiply and add: -(a*b) + c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd (float a, float b, float c) {
    // Written as c - (a*b), which is algebraically -(a*b) + c and lets
    // the compiler emit an FNMA-style fused instruction where available.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return c - (a * b);
}
417 
418 
419 
/// Negative fused multiply and subtract: -(a*b) - c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub (float a, float b, float c) {
    // Allow contraction into a fused negative multiply-subtract.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return -(a * b) - c;
}
425 
426 
427 
428 /// Linearly interpolate values v0-v1 at x: v0*(1-x) + v1*x.
429 /// This is a template, and so should work for any types.
430 template <class T, class Q>
432 lerp (const T& v0, const T& v1, const Q& x)
433 {
434  // NOTE: a*(1-x) + b*x is much more numerically stable than a+x*(b-a)
435  return v0*(Q(1)-x) + v1*x;
436 }
437 
438 
439 
440 /// Bilinearly interoplate values v0-v3 (v0 upper left, v1 upper right,
441 /// v2 lower left, v3 lower right) at coordinates (s,t) and return the
442 /// result. This is a template, and so should work for any types.
443 template <class T, class Q>
445 bilerp(const T& v0, const T& v1, const T& v2, const T& v3, const Q& s, const Q& t)
446 {
447  // NOTE: a*(t-1) + b*t is much more numerically stable than a+t*(b-a)
448  Q s1 = Q(1) - s;
449  return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
450 }
451 
452 
453 
454 /// Bilinearly interoplate arrays of values v0-v3 (v0 upper left, v1
455 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
456 /// storing the results in 'result'. These are all vectors, so do it
457 /// for each of 'n' contiguous values (using the same s,t interpolants).
458 template <class T, class Q>
459 inline OIIO_HOSTDEVICE void
460 bilerp (const T *v0, const T *v1,
461  const T *v2, const T *v3,
462  Q s, Q t, int n, T *result)
463 {
464  Q s1 = Q(1) - s;
465  Q t1 = Q(1) - t;
466  for (int i = 0; i < n; ++i)
467  result[i] = T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
468 }
469 
470 
471 
472 /// Bilinearly interoplate arrays of values v0-v3 (v0 upper left, v1
473 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
474 /// SCALING the interpolated value by 'scale' and then ADDING to
475 /// 'result'. These are all vectors, so do it for each of 'n'
476 /// contiguous values (using the same s,t interpolants).
477 template <class T, class Q>
478 inline OIIO_HOSTDEVICE void
479 bilerp_mad (const T *v0, const T *v1,
480  const T *v2, const T *v3,
481  Q s, Q t, Q scale, int n, T *result)
482 {
483  Q s1 = Q(1) - s;
484  Q t1 = Q(1) - t;
485  for (int i = 0; i < n; ++i)
486  result[i] += T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
487  t*(v2[i]*s1 + v3[i]*s)));
488 }
489 
490 
491 
492 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
493 /// upper right top, ...) at coordinates (s,t,r), and return the
494 /// result. This is a template, and so should work for any types.
495 template <class T, class Q>
497 trilerp (T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
498 {
499  // NOTE: a*(t-1) + b*t is much more numerically stable than a+t*(b-a)
500  Q s1 = Q(1) - s;
501  Q t1 = Q(1) - t;
502  Q r1 = Q(1) - r;
503  return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
504  r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
505 }
506 
507 
508 
509 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
510 /// upper right top, ...) at coordinates (s,t,r),
511 /// storing the results in 'result'. These are all vectors, so do it
512 /// for each of 'n' contiguous values (using the same s,t,r interpolants).
513 template <class T, class Q>
514 inline OIIO_HOSTDEVICE void
515 trilerp (const T *v0, const T *v1, const T *v2, const T *v3,
516  const T *v4, const T *v5, const T *v6, const T *v7,
517  Q s, Q t, Q r, int n, T *result)
518 {
519  Q s1 = Q(1) - s;
520  Q t1 = Q(1) - t;
521  Q r1 = Q(1) - r;
522  for (int i = 0; i < n; ++i)
523  result[i] = T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
524  r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
525 }
526 
527 
528 
529 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
530 /// upper right top, ...) at coordinates (s,t,r),
531 /// SCALING the interpolated value by 'scale' and then ADDING to
532 /// 'result'. These are all vectors, so do it for each of 'n'
533 /// contiguous values (using the same s,t,r interpolants).
534 template <class T, class Q>
535 inline OIIO_HOSTDEVICE void
536 trilerp_mad (const T *v0, const T *v1, const T *v2, const T *v3,
537  const T *v4, const T *v5, const T *v6, const T *v7,
538  Q s, Q t, Q r, Q scale, int n, T *result)
539 {
540  Q r1 = Q(1) - r;
541  bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
542  bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
543 }
544 
545 
546 
547 /// Evaluate B-spline weights in w[0..3] for the given fraction. This
548 /// is an important component of performing a cubic interpolation.
549 template <typename T>
550 inline OIIO_HOSTDEVICE void evalBSplineWeights (T w[4], T fraction)
551 {
552  T one_frac = 1 - fraction;
553  w[0] = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
554  w[1] = T(2.0 / 3.0) - T(0.5) * fraction * fraction * (2 - fraction);
555  w[2] = T(2.0 / 3.0) - T(0.5) * one_frac * one_frac * (2 - one_frac);
556  w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
557 }
558 
559 
560 /// Evaluate B-spline derivative weights in w[0..3] for the given
561 /// fraction. This is an important component of performing a cubic
562 /// interpolation with derivatives.
563 template <typename T>
564 inline OIIO_HOSTDEVICE void evalBSplineWeightDerivs (T dw[4], T fraction)
565 {
566  T one_frac = 1 - fraction;
567  dw[0] = -T(0.5) * one_frac * one_frac;
568  dw[1] = T(0.5) * fraction * (3 * fraction - 4);
569  dw[2] = -T(0.5) * one_frac * (3 * one_frac - 4);
570  dw[3] = T(0.5) * fraction * fraction;
571 }
572 
573 
574 
575 /// Bicubically interoplate arrays of pointers arranged in a 4x4 pattern
576 /// with val[0] pointing to the data in the upper left corner, val[15]
577 /// pointing to the lower right) at coordinates (s,t), storing the
578 /// results in 'result'. These are all vectors, so do it for each of
579 /// 'n' contiguous values (using the same s,t interpolants).
580 template <class T>
581 inline OIIO_HOSTDEVICE void
582 bicubic_interp (const T **val, T s, T t, int n, T *result)
583 {
584  for (int c = 0; c < n; ++c)
585  result[c] = T(0);
586  T wx[4]; evalBSplineWeights (wx, s);
587  T wy[4]; evalBSplineWeights (wy, t);
588  for (int j = 0; j < 4; ++j) {
589  for (int i = 0; i < 4; ++i) {
590  T w = wx[i] * wy[j];
591  for (int c = 0; c < n; ++c)
592  result[c] += w * val[j*4+i][c];
593  }
594  }
595 }
596 
597 
598 
599 /// Return floor(x) cast to an int.
601 ifloor (float x)
602 {
603  return (int)floorf(x);
604 }
605 
606 
607 
/// Return (x-floor(x)) and put (int)floor(x) in *xint. This is similar
/// to the built-in modf, but returns a true int, always rounds down
/// (compared to modf which rounds toward 0), and always returns
/// frac >= 0 (compared to modf which can return <0 if x<0).
inline OIIO_HOSTDEVICE float
floorfrac (float x, int *xint)
{
#if 1
    // Compute the floor once and derive both outputs from it.
    float f = std::floor(x);
    *xint = int(f);
    return x - f;    // fractional part, always in [0,1)
#else /* vvv This idiom is slower */
    int i = ifloor(x);
    *xint = i;
    return x - static_cast<float>(i); // Return the fraction left over
#endif
}
625 
626 
627 #ifndef __CUDA_ARCH__
630  *xint = simd::vint4(f);
631  return x - f;
632 }
633 
636  *xint = simd::vint8(f);
637  return x - f;
638 }
639 
642  *xint = simd::vint16(f);
643  return x - f;
644 }
645 #endif
646 
647 
648 
649 
/// Convert degrees to radians.
/// The factor pi/180 is computed in double precision, then converted to T.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians (T deg) { return deg * T(M_PI / 180.0); }
653 
/// Convert radians to degrees
/// The factor 180/pi is computed in double precision, then converted to T.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees (T rad) { return rad * T(180.0 / M_PI); }
657 
658 
659 /// Faster floating point negation, in cases where you aren't too picky
660 /// about the difference between +0 and -0. (Courtesy of Alex Wells, Intel,
661 /// via code in OSL.)
662 ///
663 /// Beware: fast_neg(0.0f) returns 0.0f, NOT -0.0f. All other values,
664 /// including -0.0 and +/- Inf, work identically to `-x`. For many use
665 /// cases, that's fine. (When was the last time you wanted a -0.0f anyway?)
666 ///
667 /// The reason it's faster is that `-x` (for float x) is implemented by
668 /// compilers by xor'ing x against a bitmask that flips the sign bit, and
669 /// that bitmask had to come from somewhere -- memory! -- which can be
670 /// expensive, depending on whether/where it is in cache. But `0 - x`
671 /// generates a zero in a register (with xor, no memory access needed) and
672 /// then subtracts, so that's sometimes faster because there is no memory
673 /// read. This works for SIMD types as well!
674 ///
675 /// It's also safe (though pointless) to use fast_neg for integer types,
676 /// where both `-x` and `0-x` are implemented as a `neg` instruction.
677 template <typename T>
679 fast_neg(const T &x) {
680  return T(0) - x;
681 }
682 
683 
684 
/// Compute sin(x) and cos(x) together, choosing the fastest available
/// platform-specific path.
inline OIIO_HOSTDEVICE void
sincos (float x, float* sine, float* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    // GCC on Linux exposes a combined sincos builtin.
    __builtin_sincosf(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Explicitly select the single-precision CUDA library function
    sincosf(x, sine, cosine);
#else
    // Portable fallback: two separate standard-library calls.
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}
698 
/// Double-precision overload: compute sin(x) and cos(x) together.
inline OIIO_HOSTDEVICE void
sincos (double x, double* sine, double* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    // GCC on Linux exposes a combined sincos builtin.
    __builtin_sincos(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Use of anonymous namespace resolves to the CUDA library function and
    // avoids infinite recursion
    ::sincos(x, sine, cosine);
#else
    // Portable fallback: two separate standard-library calls.
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}
713 
714 
715 inline OIIO_HOSTDEVICE float sign (float x)
716 {
717  return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
718 }
719 
720 
721 
722 
723 // (end of float helper functions)
724 ////////////////////////////////////////////////////////////////////////////
725 
726 
727 
728 ////////////////////////////////////////////////////////////////////////////
729 ////////////////////////////////////////////////////////////////////////////
730 //
731 // CONVERSION
732 //
733 // Type and range conversion helper functions and classes.
734 
735 
736 template <typename IN_TYPE, typename OUT_TYPE>
737 OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE& in) {
738  // NOTE: this is the only standards compliant way of doing this type of casting,
739  // luckily the compilers we care about know how to optimize away this idiom.
740  static_assert(sizeof(IN_TYPE) == sizeof(OUT_TYPE),
741  "bit_cast must be between objects of the same size");
742  OUT_TYPE out;
743  memcpy ((void *)&out, &in, sizeof(IN_TYPE));
744  return out;
745 }
746 
#if defined(__INTEL_COMPILER)
    // On x86/x86_64 for certain compilers we can use Intel CPU intrinsics
    // for some common bit_cast cases that might be even more understandable
    // to the compiler and generate better code without its getting confused
    // about the memcpy in the general case.
    // FIXME: The intrinsics are not in clang <= 9 nor gcc <= 9.1. Check
    // future releases.
    // float <-> 32-bit integer reinterpretation (_castf32_u32/_castu32_f32).
    template<> OIIO_FORCEINLINE uint32_t bit_cast<float, uint32_t> (const float val) {
        return static_cast<uint32_t>(_castf32_u32(val));
    }
    template<> OIIO_FORCEINLINE int32_t bit_cast<float, int32_t> (const float val) {
        return static_cast<int32_t>(_castf32_u32(val));
    }
    template<> OIIO_FORCEINLINE float bit_cast<uint32_t, float> (const uint32_t val) {
        return _castu32_f32(val);
    }
    template<> OIIO_FORCEINLINE float bit_cast<int32_t, float> (const int32_t val) {
        return _castu32_f32(val);
    }
    // double <-> 64-bit integer reinterpretation (_castf64_u64/_castu64_f64).
    template<> OIIO_FORCEINLINE uint64_t bit_cast<double, uint64_t> (const double val) {
        return static_cast<uint64_t>(_castf64_u64(val));
    }
    template<> OIIO_FORCEINLINE int64_t bit_cast<double, int64_t> (const double val) {
        return static_cast<int64_t>(_castf64_u64(val));
    }
    template<> OIIO_FORCEINLINE double bit_cast<uint64_t, double> (const uint64_t val) {
        return _castu64_f64(val);
    }
    template<> OIIO_FORCEINLINE double bit_cast<int64_t, double> (const int64_t val) {
        return _castu64_f64(val);
    }
#endif
779 
780 
782  return bit_cast<float,int>(x);
783 }
785  return bit_cast<int,float>(x);
786 }
787 
788 
789 
790 /// Change endian-ness of one or more data items that are each 2, 4,
791 /// or 8 bytes. This should work for any of short, unsigned short, int,
792 /// unsigned int, float, long long, pointers.
793 template<class T>
794 inline OIIO_HOSTDEVICE void
795 swap_endian (T *f, int len=1)
796 {
797  for (char *c = (char *) f; len--; c += sizeof(T)) {
798  if (sizeof(T) == 2) {
799  std::swap (c[0], c[1]);
800  } else if (sizeof(T) == 4) {
801  std::swap (c[0], c[3]);
802  std::swap (c[1], c[2]);
803  } else if (sizeof(T) == 8) {
804  std::swap (c[0], c[7]);
805  std::swap (c[1], c[6]);
806  std::swap (c[2], c[5]);
807  std::swap (c[3], c[4]);
808  }
809  }
810 }
811 
812 
813 
// big_enough_float<T>::float_t is a floating-point type big enough to
// handle the range and precision of a <T>. It's a float, unless T is big.
// The specializations below promote 32/64-bit integer types and double
// to double; every other type defaults to float.
template <typename T> struct big_enough_float { typedef float float_t; };
template<> struct big_enough_float<int> { typedef double float_t; };
template<> struct big_enough_float<unsigned int> { typedef double float_t; };
template<> struct big_enough_float<int64_t> { typedef double float_t; };
template<> struct big_enough_float<uint64_t> { typedef double float_t; };
template<> struct big_enough_float<double> { typedef double float_t; };
822 
823 
824 /// Multiply src by scale, clamp to [min,max], and round to the nearest D
825 /// (presumed to be integer). This is just a helper for the convert_type
826 /// templates, it probably has no other use.
827 template<typename S, typename D, typename F>
828 inline OIIO_HOSTDEVICE D
829 scaled_conversion(const S& src, F scale, F min, F max)
830 {
832  F s = src * scale;
833  s += (s < 0 ? (F)-0.5 : (F)0.5);
834  return (D)clamp(s, min, max);
835  } else {
836  return (D)clamp((F)src * scale + (F)0.5, min, max);
837  }
838 }
839 
840 
841 
842 /// Convert n consecutive values from the type of S to the type of D.
843 /// The conversion is not a simple cast, but correctly remaps the
844 /// 0.0->1.0 range from and to the full positive range of integral
845 /// types. Take a memcpy shortcut if both types are the same and no
846 /// conversion is necessary. Optional arguments can give nonstandard
847 /// quantizations.
848 //
849 // FIXME: make table-based specializations for common types with only a
850 // few possible src values (like unsigned char -> float).
851 template<typename S, typename D>
852 void convert_type (const S *src, D *dst, size_t n, D _min, D _max)
853 {
855  // They must be the same type. Just memcpy.
856  memcpy (dst, src, n*sizeof(D));
857  return;
858  }
859  typedef typename big_enough_float<D>::float_t F;
861  (F(1)) / F(std::numeric_limits<S>::max()) : F(1);
863  // Converting to an integer-like type.
864  F min = (F)_min; // std::numeric_limits<D>::min();
865  F max = (F)_max; // std::numeric_limits<D>::max();
866  scale *= _max;
867  // Unroll loop for speed
868  for ( ; n >= 16; n -= 16) {
869  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
870  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
871  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
872  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
873  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
874  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
875  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
876  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
877  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
878  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
879  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
880  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
881  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
882  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
883  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
884  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
885  }
886  while (n--)
887  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
888  } else {
889  // Converting to a float-like type, so we don't need to remap
890  // the range
891  // Unroll loop for speed
892  for ( ; n >= 16; n -= 16) {
893  *dst++ = (D)((*src++) * scale);
894  *dst++ = (D)((*src++) * scale);
895  *dst++ = (D)((*src++) * scale);
896  *dst++ = (D)((*src++) * scale);
897  *dst++ = (D)((*src++) * scale);
898  *dst++ = (D)((*src++) * scale);
899  *dst++ = (D)((*src++) * scale);
900  *dst++ = (D)((*src++) * scale);
901  *dst++ = (D)((*src++) * scale);
902  *dst++ = (D)((*src++) * scale);
903  *dst++ = (D)((*src++) * scale);
904  *dst++ = (D)((*src++) * scale);
905  *dst++ = (D)((*src++) * scale);
906  *dst++ = (D)((*src++) * scale);
907  *dst++ = (D)((*src++) * scale);
908  *dst++ = (D)((*src++) * scale);
909  }
910  while (n--)
911  *dst++ = (D)((*src++) * scale);
912  }
913 }
914 
915 
916 #ifndef __CUDA_ARCH__
917 template<>
918 inline void convert_type<uint8_t,float> (const uint8_t *src,
919  float *dst, size_t n,
920  float /*_min*/, float /*_max*/)
921 {
923  simd::vfloat4 scale_simd (scale);
924  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
925  simd::vfloat4 s_simd (src);
926  simd::vfloat4 d_simd = s_simd * scale_simd;
927  d_simd.store (dst);
928  }
929  while (n--)
930  *dst++ = (*src++) * scale;
931 }
932 
933 
934 
935 template<>
936 inline void convert_type<uint16_t,float> (const uint16_t *src,
937  float *dst, size_t n,
938  float /*_min*/, float /*_max*/)
939 {
941  simd::vfloat4 scale_simd (scale);
942  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
943  simd::vfloat4 s_simd (src);
944  simd::vfloat4 d_simd = s_simd * scale_simd;
945  d_simd.store (dst);
946  }
947  while (n--)
948  *dst++ = (*src++) * scale;
949 }
950 
951 
#ifdef _HALF_H_
/// Specialization: half -> float needs no range remapping; widen with
/// SIMD loads (f16c by 8's when available, else by 4's).
template<>
inline void convert_type<half,float> (const half *src,
                                      float *dst, size_t n,
                                      float /*_min*/, float /*_max*/)
{
#if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
    // If f16c ops are enabled, it's worth doing this by 8's
    for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
        simd::vfloat8 s_simd (src);
        s_simd.store (dst);
    }
#endif
#if OIIO_SIMD >= 4
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s_simd (src);
        s_simd.store (dst);
    }
#endif
    // Scalar cleanup for any remaining elements.
    while (n--)
        *dst++ = (*src++);
}
#endif
975 
976 
977 
template<>
inline void
convert_type<float,uint16_t> (const float *src, uint16_t *dst, size_t n,
                              uint16_t /*_min*/, uint16_t /*_max*/)
{
    // Scale [0,1] floats to the full uint16 range, round, clamp, store --
    // 4 at a time in SIMD, then a scalar tail.
    // NOTE(review): the declarations of 'min' and 'max' (presumably the
    // numeric_limits of uint16_t cast to float) were dropped from this
    // listing -- confirm against the original header.
    float scale = max;
    simd::vfloat4 max_simd (max);
    simd::vfloat4 zero_simd (0.0f);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
        simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
        simd::vint4 i (clamped);
        i.store (dst);
    }
    while (n--)
        *dst++ = scaled_conversion<float,uint16_t,float> (*src++, scale, min, max);
}


// Same pattern as above, targeting uint8.
template<>
inline void
convert_type<float,uint8_t> (const float *src, uint8_t *dst, size_t n,
                             uint8_t /*_min*/, uint8_t /*_max*/)
{
    // NOTE(review): 'min'/'max' declarations are also missing from this
    // listing -- confirm against the original header.
    float scale = max;
    simd::vfloat4 max_simd (max);
    simd::vfloat4 zero_simd (0.0f);
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
        simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
        simd::vint4 i (clamped);
        i.store (dst);
    }
    while (n--)
        *dst++ = scaled_conversion<float,uint8_t,float> (*src++, scale, min, max);
}
1018 
1019 
#ifdef _HALF_H_
// float -> half narrowing; no range remapping needed.
template<>
inline void
convert_type<float,half> (const float *src, half *dst, size_t n,
                          half /*_min*/, half /*_max*/)
{
#if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
    // If f16c ops are enabled, it's worth doing this by 8's
    for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
        simd::vfloat8 s (src);
        s.store (dst);
    }
#endif
#if OIIO_SIMD >= 4
    for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
        simd::vfloat4 s (src);
        s.store (dst);
    }
#endif
    // Scalar tail: implicit float -> half conversion.
    while (n--)
        *dst++ = *src++;
}
#endif
#endif
1044 
1045 
1046 
// Convenience 3-argument overload: convert an array using the full
// positive range of the destination type D.
template<typename S, typename D>
inline void convert_type (const S *src, D *dst, size_t n)
{
    // NOTE(review): the trailing min/max arguments (presumably
    // std::numeric_limits<D>::min()/max()) and the closing paren were
    // dropped from this listing -- confirm against the original header.
    convert_type<S,D> (src, dst, n,
}
1054 
1055 
1056 
1057 
/// Convert a single value from the type of S to the type of D.
/// The conversion is not a simple cast, but correctly remaps the
/// 0.0->1.0 range from and to the full positive range of integral
/// types. Take a copy shortcut if both types are the same and no
/// conversion is necessary.
template<typename S, typename D>
inline D
convert_type (const S &src)
{
    // NOTE(review): the conditional lines opening the branches below
    // (same-type test, the 'scale' declaration header, and the
    // is-destination-integer test) were dropped from this listing --
    // confirm against the original header.
        // They must be the same type. Just return it.
        return (D)src;
    }
    typedef typename big_enough_float<D>::float_t F;
        F(1) / F(std::numeric_limits<S>::max()) : F(1);
        // Converting to an integer-like type.
        F min = (F) std::numeric_limits<D>::min();
        F max = (F) std::numeric_limits<D>::max();
        scale *= max;
        return scaled_conversion<S,D,F> (src, scale, min, max);
    } else {
        // Converting to a float-like type, so we don't need to remap
        // the range
        return (D)((F)src * scale);
    }
}
1086 
1087 
1088 
1089 /// Helper function to convert channel values between different bit depths.
1090 /// Roughly equivalent to:
1091 ///
1092 /// out = round (in * (pow (2, TO_BITS) - 1) / (pow (2, FROM_BITS) - 1));
1093 ///
1094 /// but utilizes an integer math trick for speed. It can be proven that the
1095 /// absolute error of this method is less or equal to 1, with an average error
1096 /// (with respect to the entire domain) below 0.2.
1097 ///
1098 /// It is assumed that the original value is a valid FROM_BITS integer, i.e.
1099 /// shifted fully to the right.
1100 template<unsigned int FROM_BITS, unsigned int TO_BITS>
1101 inline OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in) {
1102  unsigned int out = 0;
1103  int shift = TO_BITS - FROM_BITS;
1104  for (; shift > 0; shift -= FROM_BITS)
1105  out |= in << shift;
1106  out |= in >> -shift;
1107  return out;
1108 }
1109 
1110 
1111 
1112 // non-templated version. Slow but general
1113 inline OIIO_HOSTDEVICE unsigned int
1114 bit_range_convert(unsigned int in, unsigned int FROM_BITS, unsigned int TO_BITS)
1115 {
1116  unsigned int out = 0;
1117  int shift = TO_BITS - FROM_BITS;
1118  for (; shift > 0; shift -= FROM_BITS)
1119  out |= in << shift;
1120  out |= in >> -shift;
1121  return out;
1122 }
1123 
1124 
1125 
/// Append the `n` LSB bits of `val` into a bit sting `T out[]`, where the
/// `filled` MSB bits of `*out` are already filled in. Increment `out` and
/// adjust `filled` as required. Type `T` should be uint8_t, uint16_t, or
/// uint32_t.
template<typename T>
inline void
bitstring_add_n_bits (T* &out, int& filled, uint32_t val, int n)
{
    // NOTE(review): the middle disjuncts of this static_assert (the
    // uint16_t/uint32_t is_same tests) were dropped from this listing --
    // confirm against the original header.
    static_assert(std::is_same<T,uint8_t>::value ||
                  "bitstring_add_n_bits must be unsigned int 8/16/32");
    const int Tbits = sizeof(T) * 8;
    //   val:         | don't care     | copy me    |
    //                                 <- n bits  ->
    //
    //   *out:        | don't touch    | fill in here       |
    //                <- filled     -> <- (Tbits - filled) ->
    while (n > 0) {
        // Make sure val doesn't have any cruft in bits > n
        val &= ~(0xffffffff << n);
        // Initialize any new byte we're filling in
        if (filled == 0)
            *out = 0;
        // How many bits can we fill in `*out` without having to increment
        // to the next byte?
        int bits_left_in_out = Tbits - filled;
        int b = 0;    // bit pattern to 'or' with *out
        int nb = 0;   // number of bits to add
        if (n <= bits_left_in_out) { // can fit completely in this byte
            b = val << (bits_left_in_out - n);
            nb = n;
        } else { // n > bits_left_in_out, will spill to next byte
            b = val >> (n - bits_left_in_out);
            nb = bits_left_in_out;
        }
        *out |= b;           // deposit the bits
        filled += nb;
        OIIO_DASSERT (filled <= Tbits);
        n -= nb;             // bits of val still unwritten
        if (filled == Tbits) {   // word full: advance to the next one
            ++out;
            filled = 0;
        }
    }
}
1172 
1173 
1174 
/// Pack values from `T in[0..n-1]` (where `T` is expected to be a uint8,
/// uint16, or uint32, into successive raw outbits-bit pieces of `out[]`,
/// where outbits is expected to be less than the number of bits in a `T`.
template<typename T>
inline void
bit_pack(cspan<T> data, void* out, int outbits)
{
    // NOTE(review): the middle disjuncts of this static_assert (the
    // uint16_t/uint32_t is_same tests) were dropped from this listing --
    // confirm against the original header.
    static_assert(std::is_same<T,uint8_t>::value ||
                  "bit_pack must be unsigned int 8/16/32");
    unsigned char* outbuffer = (unsigned char*)out;
    int filled = 0;
    // Append each value's low `outbits` bits to the output bit string.
    for (size_t i = 0, e = data.size(); i < e; ++i)
        bitstring_add_n_bits (outbuffer, filled, data[i], outbits);
}
1191 
1192 
1193 
/// Decode n packed inbits-bits values from in[...] into normal uint8,
/// uint16, or uint32 representation of `T out[0..n-1]`. In other words,
/// each successive `inbits` of `in` (allowing spanning of byte boundaries)
/// will be stored in a successive out[i].
template<typename T>
inline void
bit_unpack(int n, const unsigned char* in, int inbits, T* out)
{
    // NOTE(review): the middle disjuncts of this static_assert (the
    // uint16_t/uint32_t is_same tests) were dropped from this listing --
    // confirm against the original header.
    static_assert(std::is_same<T,uint8_t>::value ||
                  "bit_unpack must be unsigned int 8/16/32");
    OIIO_DASSERT(inbits >= 1 && inbits < 32);  // surely bugs if not
    // int highest = (1 << inbits) - 1;
    int B = 0, b = 0;
    // Invariant:
    // So far, we have used in[0..B-1] and the high b bits of in[B].
    for (int i = 0; i < n; ++i) {
        long long val = 0;
        int valbits = 0;  // bits so far we've accumulated in val
        while (valbits < inbits) {
            // Invariant: we have already accumulated valbits of the next
            // needed value (of a total of inbits), living in the valbits
            // low bits of val.
            int out_left = inbits - valbits;  // How much more we still need
            int in_left = 8 - b;              // Bits still available in in[B].
            if (in_left <= out_left) {
                // Eat the rest of this byte:
                //   |---------|--------|
                //        b      in_left
                val <<= in_left;
                val |= in[B] & ~(0xffffffff << in_left);
                ++B;
                b = 0;
                valbits += in_left;
            } else {
                // Eat just the bits we need:
                //   |--|---------|-----|
                //     b  out_left  extra
                val <<= out_left;
                int extra = 8 - b - out_left;
                val |= (in[B] >> extra) & ~(0xffffffff << out_left);
                b += out_left;
                valbits = inbits;
            }
        }
        out[i] = val; //T((val * 0xff) / highest);
    }
}
1243 
1244 
1245 
1246 /// A DataProxy<I,E> looks like an (E &), but it really holds an (I &)
1247 /// and does conversions (via convert_type) as it reads in and out.
1248 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1249 template<typename I, typename E>
1250 struct DataProxy {
1251  DataProxy (I &data) : m_data(data) { }
1252  E operator= (E newval) { m_data = convert_type<E,I>(newval); return newval; }
1253  operator E () const { return convert_type<I,E>(m_data); }
1254 private:
1255  DataProxy& operator = (const DataProxy&); // Do not implement
1256  I &m_data;
1257 };
1258 
1259 
1260 
1261 /// A ConstDataProxy<I,E> looks like a (const E &), but it really holds
1262 /// a (const I &) and does conversions (via convert_type) as it reads.
1263 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1264 template<typename I, typename E>
1266  ConstDataProxy (const I &data) : m_data(data) { }
1267  operator E () const { return convert_type<E,I>(*m_data); }
1268 private:
1269  const I &m_data;
1270 };
1271 
1272 
1273 
/// A DataArrayProxy<I,E> looks like an (E *), but it really holds an (I *)
/// and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
    // NOTE(review): the "struct DataArrayProxy {" header line and the
    // signature line of operator+= were dropped from this listing --
    // confirm against the original header.
    DataArrayProxy (I *data=NULL) : m_data(data) { }
    // Read through the pointer, converting I -> E.
    E operator* () const { return convert_type<I,E>(*m_data); }
    E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
    // Indexing a non-const proxy yields a writable DataProxy element.
    DataProxy<I,E> operator[] (int i) { return DataProxy<I,E> (m_data[i]); }
    void set (I *data) { m_data = data; }
    I * get () const { return m_data; }
        m_data += i; return *this;
    }
private:
    I *m_data;
};
1291 
1292 
1293 
/// A ConstDataArrayProxy<I,E> looks like an (E *), but it really holds an
/// (I *) and does conversions (via convert_type) as it reads in and out.
/// (I and E are for INTERNAL and EXTERNAL data types, respectively).
template<typename I, typename E>
    // NOTE(review): the "struct ConstDataArrayProxy {" header line and the
    // signature line of operator+= were dropped from this listing --
    // confirm against the original header.
    ConstDataArrayProxy (const I *data=NULL) : m_data(data) { }
    // Read-only access, converting I -> E.
    E operator* () const { return convert_type<I,E>(*m_data); }
    E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
    void set (const I *data) { m_data = data; }
    const I * get () const { return m_data; }
        m_data += i; return *this;
    }
private:
    const I *m_data;
};
1310 
1311 
1312 
/// Fast table-based conversion of 8-bit to other types. Declare this
/// as static to avoid the expensive ctr being called all the time.
template <class T=float>
    // NOTE(review): the "class EightBitConverter {" header line and the
    // integer-type test guarding the extra scaling (presumably
    // "if (std::numeric_limits<T>::is_integer)") were dropped from this
    // listing -- confirm against the original header.
public:
    EightBitConverter () noexcept { init(); }
    // Lookup: O(1) conversion of one 8-bit value.
    T operator() (unsigned char c) const noexcept { return val[c]; }
private:
    T val[256];   // precomputed conversion of every possible uint8 value
    void init () noexcept {
        // Map [0,255] to [0,1], then (for integer T) up to T's full range.
        float scale = 1.0f / 255.0f;
            scale *= (float)std::numeric_limits<T>::max();
        for (int i = 0; i < 256; ++i)
            val[i] = (T)(i * scale);
    }
};
1330 
1331 
1332 
1333 /// Simple conversion of a (presumably non-negative) float into a
1334 /// rational. This does not attempt to find the simplest fraction
1335 /// that approximates the float, for example 52.83 will simply
1336 /// return 5283/100. This does not attempt to gracefully handle
1337 /// floats that are out of range that could be easily int/int.
1338 inline OIIO_HOSTDEVICE void
1339 float_to_rational (float f, unsigned int &num, unsigned int &den)
1340 {
1341  if (f <= 0) { // Trivial case of zero, and handle all negative values
1342  num = 0;
1343  den = 1;
1344  } else if ((int)(1.0/f) == (1.0/f)) { // Exact results for perfect inverses
1345  num = 1;
1346  den = (int)f;
1347  } else {
1348  num = (int)f;
1349  den = 1;
1350  while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1351  den *= 10;
1352  f *= 10;
1353  num = (int)f;
1354  }
1355  }
1356 }
1357 
1358 
1359 
1360 /// Simple conversion of a float into a rational. This does not attempt
1361 /// to find the simplest fraction that approximates the float, for
1362 /// example 52.83 will simply return 5283/100. This does not attempt to
1363 /// gracefully handle floats that are out of range that could be easily
1364 /// int/int.
1365 inline OIIO_HOSTDEVICE void
1366 float_to_rational (float f, int &num, int &den)
1367 {
1368  unsigned int n, d;
1369  float_to_rational (fabsf(f), n, d);
1370  num = (f >= 0) ? (int)n : -(int)n;
1371  den = (int) d;
1372 }
1373 
1374 
1375 // (end of conversion helpers)
1376 ////////////////////////////////////////////////////////////////////////////
1377 
1378 
1379 
1380 
1381 ////////////////////////////////////////////////////////////////////////////
1382 ////////////////////////////////////////////////////////////////////////////
1383 //
1384 // SAFE MATH
1385 //
1386 // The functions named "safe_*" are versions with various internal clamps
1387 // or other deviations from IEEE standards with the specific intent of
1388 // never producing NaN or Inf values or throwing exceptions. But within the
1389 // valid range, they should be full precision and match IEEE standards.
1390 //
1391 
1392 
// NOTE(review): the signature line of every safe_* function in this group
// (e.g. "inline OIIO_HOSTDEVICE T safe_sqrt (T x) {") was dropped from
// this listing, as were the clamp lines in safe_log2/safe_log/safe_log10.
// The visible bodies are kept verbatim -- confirm against the original
// header.

/// Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
template <typename T>
    return x >= T(0) ? std::sqrt(x) : T(0);
}

/// Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
template <typename T>
    return x > T(0) ? T(1) / std::sqrt(x) : T(0);
}


/// Safe (clamping) arcsine: clamp to the valid domain [-1,1].
template <typename T>
    if (x <= T(-1)) return T(-M_PI_2);
    if (x >= T(+1)) return T(+M_PI_2);
    return std::asin(x);
}

/// Safe (clamping) arccosine: clamp to the valid domain [-1,1].
template <typename T>
    if (x <= T(-1)) return T(M_PI);
    if (x >= T(+1)) return T(0);
    return std::acos(x);
}


/// Safe log2: clamp to valid domain.
template <typename T>
    // match clamping from fast version
    return std::log2(x);
}

/// Safe log: clamp to valid domain.
template <typename T>
    // slightly different than fast version since clamping happens before scaling
    return std::log(x);
}

/// Safe log10: clamp to valid domain.
template <typename T>
    // slightly different than fast version since clamping happens before scaling
    return log10f(x);
}

/// Safe logb: clamp to valid domain (logb(0) would otherwise be -Inf).
template <typename T>
    return (x != T(0)) ? std::logb(x) : -std::numeric_limits<T>::max();
}

/// Safe pow: clamp the domain so it never returns Inf or NaN or has divide
/// by zero error.
template <typename T>
    if (y == T(0)) return T(1);
    if (x == T(0)) return T(0);
    // if x is negative, only deal with integer powers
    if ((x < T(0)) && (y != floor(y))) return T(0);
    // FIXME: this does not match "fast" variant because clamping limits are different
    T r = std::pow(x, y);
    // Clamp to avoid returning Inf.
    const T big = std::numeric_limits<T>::max();
    return clamp (r, -big, big);
}
1470 
1471 
/// Safe fmod: guard against b==0.0 (in which case, return 0.0). Also, it
/// seems that this is much faster than std::fmod!
// NOTE(review): the signature line (presumably "inline OIIO_HOSTDEVICE
// float safe_fmod (float a, float b)") was dropped from this listing --
// confirm against the original header.
{
    if (OIIO_LIKELY(b != 0.0f)) {
#if 0
        return std::fmod (a,b);
        // std::fmod was getting called serially instead of vectorizing, so
        // we will just do the calculation ourselves.
#else
        // This formulation is equivalent but much faster in our benchmarks,
        // also vectorizes better.
        // The floating-point remainder of the division operation
        // a/b is a - N*b, where N = a/b with its fractional part truncated.
        int N = static_cast<int>(a/b);
        return a - N*b;
#endif
    }
    return 0.0f;   // b == 0: avoid the division instead of returning NaN
}

#define OIIO_FMATH_HAS_SAFE_FMOD 1
1494 
1495 // (end of safe_* functions)
1496 ////////////////////////////////////////////////////////////////////////////
1497 
1498 
1499 
1500 ////////////////////////////////////////////////////////////////////////////
1501 ////////////////////////////////////////////////////////////////////////////
1502 //
1503 // FAST & APPROXIMATE MATH
1504 //
1505 // The functions named "fast_*" provide a set of replacements to libm that
1506 // are much faster at the expense of some accuracy and robust handling of
1507 // extreme values. One design goal for these approximation was to avoid
1508 // branches as much as possible and operate on single precision values only
1509 // so that SIMD versions should be straightforward ports. We also try to
1510 // implement "safe" semantics (ie: clamp to valid range where possible)
1511 // natively since wrapping these inline calls in another layer would be
1512 // wasteful.
1513 //
1514 // The "fast_*" functions are not only possibly differing in value
1515 // (slightly) from the std versions of these functions, but also we do not
1516 // guarantee that the results of "fast" will exactly match from platform to
1517 // platform. This is because if a particular platform (HW, library, or
1518 // compiler) provides an intrinsic that is even faster than our usual "fast"
1519 // implementation, we may substitute it.
1520 //
1521 // Some functions are fast_safe_*, which is both a faster approximation as
1522 // well as clamped input domain to ensure no NaN, Inf, or divide by zero.
1523 //
1524 // NB: When compiling for CUDA devices, selection of the 'fast' intrinsics
1525 // is influenced by compiler options (-ffast-math for clang/gcc,
1526 // --use-fast-math for NVCC).
1527 //
1528 // TODO: Quantify the performance and accuracy of the CUDA Math functions
1529 // relative to the definitions in this file. It may be better in some
1530 // cases to use the approximate versions defined below.
1531 
1532 
/// Round to nearest integer, returning as an int.
/// Note that this differs from std::rint, which returns a float; it's more
/// akin to std::lrint, which returns a long (int). Sorry for the naming
/// confusion.
// NOTE(review): the signature lines of both fast_rint overloads below were
// dropped from this listing -- confirm against the original header.
    // used by sin/cos/tan range reduction
#if OIIO_SIMD_SSE >= 4
    // single roundps instruction on SSE4.1+ (for gcc/clang at least)
    return static_cast<int>(std::rint(x));
#else
    // emulate rounding by adding/subtracting 0.5
    return static_cast<int>(x + copysignf(0.5f, x));
#endif
}

#ifndef __CUDA_ARCH__
    // SIMD overload: delegate to the simd library's round-to-nearest.
    return simd::rint (x);
}
#endif
1553 
1554 
1556 #ifndef __CUDA_ARCH__
1557  // very accurate argument reduction from SLEEF
1558  // starts failing around x=262000
1559  // Results on: [-2pi,2pi]
1560  // Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp, 1.19209e-07 max error
1561  int q = fast_rint (x * float(M_1_PI));
1562  float qf = float(q);
1563  x = madd(qf, -0.78515625f*4, x);
1564  x = madd(qf, -0.00024187564849853515625f*4, x);
1565  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1566  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1567  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1568  float s = x * x;
1569  if ((q & 1) != 0) x = -x;
1570  // this polynomial approximation has very low error on [-pi/2,+pi/2]
1571  // 1.19209e-07 max error in total over [-2pi,+2pi]
1572  float u = 2.6083159809786593541503e-06f;
1573  u = madd(u, s, -0.0001981069071916863322258f);
1574  u = madd(u, s, +0.00833307858556509017944336f);
1575  u = madd(u, s, -0.166666597127914428710938f);
1576  u = madd(s, u * x, x);
1577  // For large x, the argument reduction can fail and the polynomial can
1578  // be evaluated with arguments outside the valid internal. Just clamp
1579  // the bad values away.
1580  return clamp(u, -1.0f, 1.0f);
1581 #else
1582  return __sinf(x);
1583 #endif
1584 }
1585 
1586 
1588 #ifndef __CUDA_ARCH__
1589  // same argument reduction as fast_sin
1590  int q = fast_rint (x * float(M_1_PI));
1591  float qf = float(q);
1592  x = madd(qf, -0.78515625f*4, x);
1593  x = madd(qf, -0.00024187564849853515625f*4, x);
1594  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1595  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1596  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1597  float s = x * x;
1598  // polynomial from SLEEF's sincosf, max error is
1599  // 4.33127e-07 over [-2pi,2pi] (98% of values are "exact")
1600  float u = -2.71811842367242206819355e-07f;
1601  u = madd(u, s, +2.47990446951007470488548e-05f);
1602  u = madd(u, s, -0.00138888787478208541870117f);
1603  u = madd(u, s, +0.0416666641831398010253906f);
1604  u = madd(u, s, -0.5f);
1605  u = madd(u, s, +1.0f);
1606  if ((q & 1) != 0) u = -u;
1607  return clamp(u, -1.0f, 1.0f);
1608 #else
1609  return __cosf(x);
1610 #endif
1611 }
1612 
1613 
1614 OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos (float x, float* sine, float* cosine) {
1615 #ifndef __CUDA_ARCH__
1616  // same argument reduction as fast_sin
1617  int q = fast_rint (x * float(M_1_PI));
1618  float qf = float(q);
1619  x = madd(qf, -0.78515625f*4, x);
1620  x = madd(qf, -0.00024187564849853515625f*4, x);
1621  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1622  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1623  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1624  float s = x * x;
1625  // NOTE: same exact polynomials as fast_sin and fast_cos above
1626  if ((q & 1) != 0) x = -x;
1627  float su = 2.6083159809786593541503e-06f;
1628  su = madd(su, s, -0.0001981069071916863322258f);
1629  su = madd(su, s, +0.00833307858556509017944336f);
1630  su = madd(su, s, -0.166666597127914428710938f);
1631  su = madd(s, su * x, x);
1632  float cu = -2.71811842367242206819355e-07f;
1633  cu = madd(cu, s, +2.47990446951007470488548e-05f);
1634  cu = madd(cu, s, -0.00138888787478208541870117f);
1635  cu = madd(cu, s, +0.0416666641831398010253906f);
1636  cu = madd(cu, s, -0.5f);
1637  cu = madd(cu, s, +1.0f);
1638  if ((q & 1) != 0) cu = -cu;
1639  *sine = clamp(su, -1.0f, 1.0f);;
1640  *cosine = clamp(cu, -1.0f, 1.0f);;
1641 #else
1642  __sincosf(x, sine, cosine);
1643 #endif
1644 }
1645 
// NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts becoming
// really poor outside of this range because the reciprocal amplifies errors
// NOTE(review): the fast_tan signature line was dropped from this listing
// -- confirm against the original header.
#ifndef __CUDA_ARCH__
    // derived from SLEEF implementation
    // note that we cannot apply the "denormal crush" trick everywhere because
    // we sometimes need to take the reciprocal of the polynomial
    int q = fast_rint (x * float(2 * M_1_PI));
    float qf = float(q);
    // pi/2 split into four pieces for accurate reduction
    x = madd(qf, -0.78515625f*2, x);
    x = madd(qf, -0.00024187564849853515625f*2, x);
    x = madd(qf, -3.7747668102383613586e-08f*2, x);
    x = madd(qf, -1.2816720341285448015e-12f*2, x);
    if ((q & 1) == 0)
        x = float(M_PI_4) - (float(M_PI_4) - x); // crush denormals (only if we aren't inverting the result later)
    float s = x * x;
    float u = 0.00927245803177356719970703f;
    u = madd(u, s, 0.00331984995864331722259521f);
    u = madd(u, s, 0.0242998078465461730957031f);
    u = madd(u, s, 0.0534495301544666290283203f);
    u = madd(u, s, 0.133383005857467651367188f);
    u = madd(u, s, 0.333331853151321411132812f);
    u = madd(s, u * x, x);
    if ((q & 1) != 0)
        u = -1.0f / u;   // odd quadrant: tan = -cot of the reduced angle
    return u;
#else
    return __tanf(x);
#endif
}
1676 
/// Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
/// Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
/// Note that this is MUCH faster, but much less accurate than fast_sin.
// NOTE(review): the fast_sinpi signature line was dropped from this listing
// -- confirm against the original header.
{
#ifndef __CUDA_ARCH__
    // Fast trick to strip the integral part off, so our domain is [-1,1]
    const float z = x - ((x + 25165824.0f) - 25165824.0f);
    const float y = z - z * fabsf(z);
    const float Q = 3.10396624f;
    const float P = 3.584135056f; // P = 16-4*Q
    return y * (Q + P * fabsf(y));
    /* The original article used inferior constants for Q and P and
     * so had max error 1.091e-3.
     *
     * The optimal value for Q was determined by exhaustive search, minimizing
     * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
     * over the interval [0,2] (which is where most of the invocations happen).
     *
     * The basic idea of this approximation starts with the coarse approximation:
     *      sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
     *
     * This approximation always _over_ estimates the target. On the other hand,
     * the curve:
     *      sin(pi*x) ~= f(x) * abs(f(x)) / 4
     *
     * always lies _under_ the target. Thus we can simply numerically search for the
     * optimal constant to LERP these curves into a more precise approximation.
     * After folding the constants together and simplifying the resulting math, we
     * end up with the compact implementation below.
     *
     * NOTE: this function actually computes sin(x * pi) which avoids one or two
     * mults in many cases and guarantees exact values at integer periods.
     */
#else
    return sinpif(x);
#endif
}

/// Fast approximate cos(x*M_PI) with ~0.1% absolute error.
/// Note that this is MUCH faster, but much less accurate than fast_cos.
// NOTE(review): the fast_cospi signature line was dropped from this listing
// -- confirm against the original header.
{
#ifndef __CUDA_ARCH__
    // cos(pi*x) == sin(pi*(x + 1/2))
    return fast_sinpi (x+0.5f);
#else
    return cospif(x);
#endif
}
1726 
// NOTE(review): the signature lines of fast_acos and fast_asin were
// dropped from this listing -- confirm against the original header.
#ifndef __CUDA_ARCH__
    const float f = fabsf(x);
    const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
    // based on http://www.pouet.net/topic.php?which=9132&page=2
    // 85% accurate (ulp 0)
    // Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
    // Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
    const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
    // Mirror for negative inputs: acos(-x) == pi - acos(x).
    return x < 0 ? float(M_PI) - a : a;
#else
    return acosf(x);
#endif
}

#ifndef __CUDA_ARCH__
    // based on acosf approximation above
    // max error is 4.51133e-05 (ulps are higher because we are consistently off by a little amount)
    const float f = fabsf(x);
    const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
    // asin(x) == pi/2 - acos(x), with the sign restored at the end.
    const float a = float(M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
    return copysignf(a, x);
#else
    return asinf(x);
#endif
}
1754 
// NOTE(review): the signature lines of fast_atan and fast_atan2 were
// dropped from this listing -- confirm against the original header.
#ifndef __CUDA_ARCH__
    const float a = fabsf(x);
    // Reduce to [0,1] using atan(x) == pi/2 - atan(1/x) for |x| > 1.
    const float k = a > 1.0f ? 1 / a : a;
    const float s = 1.0f - (1.0f - k); // crush denormals
    const float t = s * s;
    // http://mathforum.org/library/drmath/view/62672.html
    // the coefficients were tuned in mathematica with the assumption that we want atan(1)=pi/4
    // (slightly higher error but no discontinuities)
    // Examined 4278190080 values of atan: 2.53989068 avg ulp diff, 315 max ulp, 9.17912e-06 max error // (with denormals)
    // Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 9.17912e-06 max error // (crush denormals)
    float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
    if (a > 1.0f) r = 1.570796326794896557998982f - r;   // undo the reduction
    return copysignf(r, x);
#else
    return atanf(x);
#endif
}

#ifndef __CUDA_ARCH__
    // based on atan approximation above
    // the special cases around 0 and infinity were tested explicitly
    // the only case not handled correctly is x=NaN,y=0 which returns 0 instead of nan
    const float a = fabsf(x);
    const float b = fabsf(y);
    bool b_is_greater_than_a = b > a;

#if OIIO_FMATH_SIMD_FRIENDLY
    // When applying to all lanes in SIMD, we end up doing extra masking and
    // 2 divides. So lets just do 1 divide and swap the parameters instead.
    // And if we are going to do a doing a divide anyway, when a == b it
    // should be 1.0f anyway so lets not bother special casing it.
    float sa = b_is_greater_than_a ? b : a;
    float sb = b_is_greater_than_a ? a : b;
    const float k = (b == 0) ? 0.0f : sb/sa;
#else
    const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
#endif

    const float s = 1.0f - (1.0f - k); // crush denormals
    const float t = s * s;

    // Same rational polynomial as fast_atan above.
    float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);

    if (b_is_greater_than_a)
        r = 1.570796326794896557998982f - r; // account for arg reduction
    // TODO: investigate if testing x < 0.0f is more efficient
    if (bit_cast<float, unsigned>(x) & 0x80000000u) // test sign bit of x
        r = float(M_PI) - r;
    return copysignf(r, y);
#else
    return atan2f(y, x);
#endif
}
1810 
1811 
// SIMD-vector fast_log2: same algorithm as the float specialization below,
// applied lane-wise.
// NOTE(review): the signature line and the initial clamp of xval were
// dropped from this listing -- confirm against the original header.
template<typename T>
    using namespace simd;
    typedef typename T::int_t intN;
    // See float fast_log2 for explanations
    intN bits = bitcast_to_int(x);
    intN exponent = srl (bits, 23) - intN(127);
    T f = bitcast_to_float ((bits & intN(0x007FFFFF)) | intN(0x3f800000)) - T(1.0f);
    T f2 = f * f;
    T f4 = f2 * f2;
    T hi = madd(f, T(-0.00931049621349f), T( 0.05206469089414f));
    T lo = madd(f, T( 0.47868480909345f), T(-0.72116591947498f));
    hi = madd(f, hi, T(-0.13753123777116f));
    hi = madd(f, hi, T( 0.24187369696082f));
    hi = madd(f, hi, T(-0.34730547155299f));
    lo = madd(f, lo, T( 1.442689881667200f));
    return ((f4 * hi) + (f * lo)) + T(exponent);
}


template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log2 (const float& xval) {
#ifndef __CUDA_ARCH__
    // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
    // NOTE(review): the clamp line declaring 'x' was dropped from this
    // listing -- confirm against the original header.
    // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
    // Decompose x into exponent and mantissa, then evaluate a polynomial
    // of log2(1+f) on the mantissa f in [0,1).
    unsigned bits = bit_cast<float, unsigned>(x);
    int exponent = int(bits >> 23) - 127;
    float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
    // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
    // ulp histogram:
    //  0  = 97.46%
    //  1  =  2.29%
    //  2  =  0.11%
    float f2 = f * f;
    float f4 = f2 * f2;
    float hi = madd(f, -0.00931049621349f,  0.05206469089414f);
    float lo = madd(f,  0.47868480909345f, -0.72116591947498f);
    hi = madd(f, hi, -0.13753123777116f);
    hi = madd(f, hi,  0.24187369696082f);
    hi = madd(f, hi, -0.34730547155299f);
    lo = madd(f, lo,  1.442689881667200f);
    return ((f4 * hi) + (f * lo)) + exponent;
#else
    return __log2f(xval);
#endif
}
1860 
1861 
1862 
// Natural log via fast_log2: ln(x) == log2(x) * ln(2).
// NOTE(review): the signature line was dropped from this listing --
// confirm against the original header.
template<typename T>
    // Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]: 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error
    return fast_log2(x) * T(M_LN2);
}

#ifdef __CUDA_ARCH__
// On CUDA devices, defer to the hardware intrinsic.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log(const float& x)
{
    return __logf(x);
}
#endif
1876 
1877 
1878 template<typename T>
1880  // Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]: 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error
1881  return fast_log2(x) * T(M_LN2 / M_LN10);
1882 }
1883 
#ifdef __CUDA_ARCH__
// CUDA device-code specialization: defer to the fast hardware intrinsic.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log10(const float& x)
{
    return __log10f(x);
}
#endif
1891 
1893 #ifndef __CUDA_ARCH__
1894  // don't bother with denormals
1895  x = fabsf(x);
1898  unsigned bits = bit_cast<float, unsigned>(x);
1899  return float (int(bits >> 23) - 127);
1900 #else
1901  return logbf(x);
1902 #endif
1903 }
1904 
1906 #ifndef __CUDA_ARCH__
1907  if (fabsf(x) < 0.01f) {
1908  float y = 1.0f - (1.0f - x); // crush denormals
1909  return copysignf(madd(-0.5f, y * y, y), x);
1910  } else {
1911  return fast_log(x + 1);
1912  }
1913 #else
1914  return log1pf(x);
1915 #endif
1916 }
1917 
1918 
1919 
1920 template<typename T>
1922  using namespace simd;
1923  typedef typename T::int_t intN;
1924 #if OIIO_SIMD_SSE
1925  // See float specialization for explanations
1926  T x = clamp (xval, T(-126.0f), T(126.0f));
1927  intN m (x); x -= T(m);
1928  T one (1.0f);
1929  x = one - (one - x); // crush denormals (does not affect max ulps!)
1930  const T kA (1.33336498402e-3f);
1931  const T kB (9.810352697968e-3f);
1932  const T kC (5.551834031939e-2f);
1933  const T kD (0.2401793301105f);
1934  const T kE (0.693144857883f);
1935  T r (kA);
1936  r = madd(x, r, kB);
1937  r = madd(x, r, kC);
1938  r = madd(x, r, kD);
1939  r = madd(x, r, kE);
1940  r = madd(x, r, one);
1941  return bitcast_to_float (bitcast_to_int(r) + (m << 23));
1942 #else
1943  T r;
1944  for (int i = 0; i < r.elements; ++i)
1945  r[i] = fast_exp2(xval[i]);
1946  for (int i = r.elements; i < r.paddedelements; ++i)
1947  r[i] = 0.0f;
1948  return r;
1949 #endif
1950 }
1951 
1952 
/// Scalar float specialization of fast_exp2: range-reduce to [0,1),
/// evaluate a degree-5 polynomial, then scale by 2^m via exponent-bit
/// arithmetic. Input is clamped to [-126,126] so the final bit addition
/// cannot overflow the exponent field.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp2 (const float& xval) {
#if OIIO_NON_INTEL_CLANG && OIIO_FMATH_SIMD_FRIENDLY
    // Clang was unhappy using the bitcast/memcpy/reinter_cast/union inside
    // an explicit SIMD loop, so revert to calling the standard version.
    return std::exp2(xval);
#elif !defined(__CUDA_ARCH__)
    // clamp to safe range for final addition
    float x = clamp (xval, -126.0f, 126.0f);
    // range reduction
    int m = int(x); x -= m;
    x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
    // 5th degree polynomial generated with sollya
    // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
    // ulp histogram:
    //  0  = 87.81%
    //  1  =  4.18%
    float r = 1.33336498402e-3f;
    r = madd(x, r, 9.810352697968e-3f);
    r = madd(x, r, 5.551834031939e-2f);
    r = madd(x, r, 0.2401793301105f);
    r = madd(x, r, 0.693144857883f);
    r = madd(x, r, 1.0f);
    // multiply by 2 ^ m by adding in the exponent
    // NOTE: left-shift of negative number is undefined behavior
    return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
    // Clang: loop not vectorized: unsafe dependent memory operations in loop.
    // This is why we special case the OIIO_FMATH_SIMD_FRIENDLY above.
    // FIXME: as clang releases continue to improve, periodically check if
    // this is still the case.
#else
    return exp2f(xval);
#endif
}
1987 
1988 
1989 
1990 
1991 template <typename T>
1993  // Examined 2237485550 values of exp on [-87.3300018,87.3300018]: 2.6666452 avg ulp diff, 230 max ulp
1994  return fast_exp2(x * T(1 / M_LN2));
1995 }
1996 
#ifdef __CUDA_ARCH__
// CUDA device-code specialization: defer to the fast hardware intrinsic.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp (const float& x) {
    return __expf(x);
}
#endif
2003 
2004 
2005 
2006 /// Faster float exp than is in libm, but still 100% accurate
2008 {
2009 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2010  // On x86_64, versions of glibc < 2.16 have an issue where expf is
2011  // much slower than the double version. This was fixed in glibc 2.16.
2012  return static_cast<float>(std::exp(static_cast<double>(x)));
2013 #else
2014  return std::exp(x);
2015 #endif
2016 }
2017 
2018 
2020 #ifndef __CUDA_ARCH__
2021  // Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: 2.71732409 avg ulp diff, 232 max ulp
2022  return fast_exp2(x * float(M_LN10 / M_LN2));
2023 #else
2024  return __exp10f(x);
2025 #endif
2026 }
2027 
2029 #ifndef __CUDA_ARCH__
2030  if (fabsf(x) < 0.03f) {
2031  float y = 1.0f - (1.0f - x); // crush denormals
2032  return copysignf(madd(0.5f, y * y, y), x);
2033  } else
2034  return fast_exp(x) - 1.0f;
2035 #else
2036  return expm1f(x);
2037 #endif
2038 }
2039 
2041 #ifndef __CUDA_ARCH__
2042  float a = fabsf(x);
2043  if (a > 1.0f) {
2044  // Examined 53389559 values of sinh on [1,87.3300018]: 33.6886442 avg ulp diff, 178 max ulp
2045  float e = fast_exp(a);
2046  return copysignf(0.5f * e - 0.5f / e, x);
2047  } else {
2048  a = 1.0f - (1.0f - a); // crush denorms
2049  float a2 = a * a;
2050  // degree 7 polynomial generated with sollya
2051  // Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error
2052  float r = 2.03945513931e-4f;
2053  r = madd(r, a2, 8.32990277558e-3f);
2054  r = madd(r, a2, 0.1666673421859f);
2055  r = madd(r * a, a2, a);
2056  return copysignf(r, x);
2057  }
2058 #else
2059  return sinhf(x);
2060 #endif
2061 }
2062 
2064 #ifndef __CUDA_ARCH__
2065  // Examined 2237485550 values of cosh on [-87.3300018,87.3300018]: 1.78256726 avg ulp diff, 178 max ulp
2066  float e = fast_exp(fabsf(x));
2067  return 0.5f * e + 0.5f / e;
2068 #else
2069  return coshf(x);
2070 #endif
2071 }
2072 
2074 #ifndef __CUDA_ARCH__
2075  // Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]: 3.12924e-06 max error
2076  // NOTE: ulp error is high because of sub-optimal handling around the origin
2077  float e = fast_exp(2.0f * fabsf(x));
2078  return copysignf(1 - 2 / (1 + e), x);
2079 #else
2080  return tanhf(x);
2081 #endif
2082 }
2083 
2085  if (y == 0) return 1.0f; // x^0=1
2086  if (x == 0) return 0.0f; // 0^y=0
2087  // be cheap & exact for special case of squaring and identity
2088  if (y == 1.0f)
2089  return x;
2090  if (y == 2.0f) {
2091 #ifndef __CUDA_ARCH__
2092  return std::min (x*x, std::numeric_limits<float>::max());
2093 #else
2094  return fminf (x*x, std::numeric_limits<float>::max());
2095 #endif
2096  }
2097  float sign = 1.0f;
2098  if (OIIO_UNLIKELY(x < 0.0f)) {
2099  // if x is negative, only deal with integer powers
2100  // powf returns NaN for non-integers, we will return 0 instead
2101  int ybits = bit_cast<float, int>(y) & 0x7fffffff;
2102  if (ybits >= 0x4b800000) {
2103  // always even int, keep positive
2104  } else if (ybits >= 0x3f800000) {
2105  // bigger than 1, check
2106  int k = (ybits >> 23) - 127; // get exponent
2107  int j = ybits >> (23 - k); // shift out possible fractional bits
2108  if ((j << (23 - k)) == ybits) // rebuild number and check for a match
2109  sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
2110  else
2111  return 0.0f; // not integer
2112  } else {
2113  return 0.0f; // not integer
2114  }
2115  }
2116  return sign * fast_exp2(y * fast_log2(fabsf(x)));
2117 }
2118 
2119 
2120 // Fast simd pow that only needs to work for positive x
2121 template<typename T, typename U>
2123  return fast_exp2(y * fast_log2(x));
2124 }
2125 
2126 
2127 // Fast cube root (performs better that using fast_pow's above with y=1/3)
2129 #ifndef __CUDA_ARCH__
2130  float x0 = fabsf(x);
2131  // from hacker's delight
2132  float a = bit_cast<int, float>(0x2a5137a0 + bit_cast<float, int>(x0) / 3); // Initial guess.
2133  // Examined 14272478 values of cbrt on [-9.99999935e-39,9.99999935e-39]: 8.14687e-14 max error
2134  // Examined 2131958802 values of cbrt on [9.99999935e-39,3.40282347e+38]: 2.46930719 avg ulp diff, 12 max ulp
2135  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step.
2136  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step again.
2137  a = (x0 == 0) ? 0 : a; // Fix 0 case
2138  return copysignf(a, x);
2139 #else
2140  return cbrtf(x);
2141 #endif
2142 }
2143 
2144 
2146 {
2147 #ifndef __CUDA_ARCH__
2148  // Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error
2149  // Abramowitz and Stegun, 7.1.28
2150  const float a1 = 0.0705230784f;
2151  const float a2 = 0.0422820123f;
2152  const float a3 = 0.0092705272f;
2153  const float a4 = 0.0001520143f;
2154  const float a5 = 0.0002765672f;
2155  const float a6 = 0.0000430638f;
2156  const float a = fabsf(x);
2157  const float b = 1.0f - (1.0f - a); // crush denormals
2158  const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
2159  const float s = r * r; // ^2
2160  const float t = s * s; // ^4
2161  const float u = t * t; // ^8
2162  const float v = u * u; // ^16
2163  return copysignf(1.0f - 1.0f / v, x);
2164 #else
2165  return erff(x);
2166 #endif
2167 }
2168 
2170 {
2171 #ifndef __CUDA_ARCH__
2172  // Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error
2173  // ulp histogram:
2174  // 0 = 80.30%
2175  return 1.0f - fast_erf(x);
2176 #else
2177  return erfcf(x);
2178 #endif
2179 }
2180 
2182 {
2183  // from: Approximating the erfinv function by Mike Giles
2184  // to avoid trouble at the limit, clamp input to 1-eps
2185  float a = fabsf(x); if (a > 0.99999994f) a = 0.99999994f;
2186  float w = -fast_log((1.0f - a) * (1.0f + a)), p;
2187  if (w < 5.0f) {
2188  w = w - 2.5f;
2189  p = 2.81022636e-08f;
2190  p = madd(p, w, 3.43273939e-07f);
2191  p = madd(p, w, -3.5233877e-06f );
2192  p = madd(p, w, -4.39150654e-06f);
2193  p = madd(p, w, 0.00021858087f );
2194  p = madd(p, w, -0.00125372503f );
2195  p = madd(p, w, -0.00417768164f );
2196  p = madd(p, w, 0.246640727f );
2197  p = madd(p, w, 1.50140941f );
2198  } else {
2199  w = sqrtf(w) - 3.0f;
2200  p = -0.000200214257f;
2201  p = madd(p, w, 0.000100950558f);
2202  p = madd(p, w, 0.00134934322f );
2203  p = madd(p, w, -0.00367342844f );
2204  p = madd(p, w, 0.00573950773f );
2205  p = madd(p, w, -0.0076224613f );
2206  p = madd(p, w, 0.00943887047f );
2207  p = madd(p, w, 1.00167406f );
2208  p = madd(p, w, 2.83297682f );
2209  }
2210  return p * x;
2211 }
2212 
2213 // (end of fast* functions)
2214 ////////////////////////////////////////////////////////////////////////////
2215 
2216 
2217 
2218 
2219 ////////////////////////////////////////////////////////////////////////////
2220 ////////////////////////////////////////////////////////////////////////////
2221 //
2222 // MISCELLANEOUS NUMERICAL METHODS
2223 //
2224 
2225 
2226 /// Solve for the x for which func(x) == y on the interval [xmin,xmax].
2227 /// Use a maximum of maxiter iterations, and stop any time the remaining
2228 /// search interval or the function evaluations <= eps. If brack is
2229 /// non-NULL, set it to true if y is in [f(xmin), f(xmax)], otherwise
/// false (in which case the caller should know that the results may be
/// unreliable). Results are undefined if the function is not monotonic
2232 /// on that interval or if there are multiple roots in the interval (it
2233 /// may not converge, or may converge to any of the roots without
2234 /// telling you that there are more than one).
template<class T, class Func> OIIO_HOSTDEVICE
T invert (Func &func, T y, T xmin=0.0, T xmax=1.0,
          int maxiters=32, T eps=1.0e-6, bool *brack=0)
{
    // Use the Regula Falsi method, falling back to bisection if it
    // hasn't converged after 3/4 of the maximum number of iterations.
    // See, e.g., Numerical Recipes for the basic ideas behind both
    // methods.
    T v0 = func(xmin), v1 = func(xmax);    // function values at the endpoints
    T x = xmin, v = v0;
    bool increasing = (v0 < v1);           // monotone direction on [xmin,xmax]
    T vmin = increasing ? v0 : v1;
    T vmax = increasing ? v1 : v0;
    bool bracketed = (y >= vmin && y <= vmax);
    if (brack)
        *brack = bracketed;
    if (! bracketed) {
        // If our bounds don't bracket the zero, just give up, and
        // return the appropriate "edge" of the interval
        return ((y < vmin) == increasing) ? xmin : xmax;
    }
    if (fabs(v0-v1) < eps)     // already close enough
        return x;
    int rfiters = (3*maxiters)/4;   // how many times to try regula falsi
    for (int iters = 0; iters < maxiters; ++iters) {
        T t;  // interpolation factor
        if (iters < rfiters) {
            // Regula falsi: interpolate where the secant crosses y
            t = (y-v0)/(v1-v0);
            if (t <= T(0) || t >= T(1))
                t = T(0.5);  // RF convergence failure -- bisect instead
        } else {
            t = T(0.5);            // bisection
        }
        x = lerp (xmin, xmax, t);
        v = func(x);
        // Shrink the interval to the half that still brackets y.
        if ((v < y) == increasing) {
            xmin = x; v0 = v;
        } else {
            xmax = x; v1 = v;
        }
        if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
            return x;   // converged
    }
    return x;
}
2281 
2282 
2283 
2284 /// Linearly interpolate a list of evenly-spaced knots y[0..len-1] with
2285 /// y[0] corresponding to the value at x==0.0 and y[len-1] corresponding to
2286 /// x==1.0.
2287 inline OIIO_HOSTDEVICE float
2289 {
2290 #ifndef __CUDA_ARCH__
2291  DASSERT_MSG (y.size() >= 2, "interpolate_linear needs at least 2 knot values (%zd)", y.size());
2292 #endif
2293  x = clamp (x, float(0.0), float(1.0));
2294  int nsegs = int(y.size()) - 1;
2295  int segnum;
2296  x = floorfrac (x*nsegs, &segnum);
2297 #ifndef __CUDA_ARCH__
2298  int nextseg = std::min (segnum+1, nsegs);
2299 #else
2300  int nextseg = min (segnum+1, nsegs);
2301 #endif
2302  return lerp (y[segnum], y[nextseg], x);
2303 }
2304 
2305 // (end miscellaneous numerical methods)
2306 ////////////////////////////////////////////////////////////////////////////
2307 
2308 
2309 
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow(float x, float y)
Definition: fmath.h:2084
SYS_API float acosf(float x)
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:4822
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp(float x)
Faster float exp than is in libm, but still 100% accurate.
Definition: fmath.h:2007
#define OIIO_LIKELY(x)
Definition: platform.h:342
SYS_API double cos(double x)
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1221
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos(const T &x, const U &y)
Definition: fmath.h:2122
SYS_API float coshf(float x)
SYS_API double fmod(double x, double y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log(const T &x)
Definition: fmath.h:1864
bool_constant< is_integral< T >::value &&!std::is_same< T, bool >::value &&!std::is_same< T, char >::value &&!std::is_same< T, wchar_t >::value > is_integer
Definition: format.h:2392
EightBitConverter() noexcept
Definition: fmath.h:1318
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10(float x)
Definition: fmath.h:2019
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2(float y, float x)
Definition: fmath.h:1774
#define OIIO_FORCEINLINE
Definition: platform.h:358
GLenum GLenum GLenum GLenum GLenum scale
Definition: glew.h:14163
SYS_API float log1pf(float x)
vfloat4 bitcast_to_float(const vint4 &x)
Definition: simd.h:7210
#define M_LN2
Definition: fmath.h:114
OIIO_HOSTDEVICE void evalBSplineWeights(T w[4], T fraction)
Definition: fmath.h:550
vint4 srl(const vint4 &val, const unsigned int bits)
Definition: simd.h:4515
void bit_unpack(int n, const unsigned char *in, int inbits, T *out)
Definition: fmath.h:1200
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan(float x)
Definition: fmath.h:1755
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin(T x)
Safe (clamping) arcsine: clamp to the valid domain.
Definition: fmath.h:1408
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
Definition: UT_ArraySet.h:1629
#define M_PI
Definition: fmath.h:90
const DataArrayProxy< I, E > & operator+=(int i)
Definition: fmath.h:1285
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub(float a, float b, float c)
Negative fused multiply and subtract: -(a*b) - c.
Definition: fmath.h:421
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin(float x)
Definition: fmath.h:1555
#define M_PI_2
Definition: fmath.h:93
const GLfloat * c
Definition: glew.h:16631
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd(float a, float b, float c)
Fused negative multiply and add: -(a*b) + c.
Definition: fmath.h:413
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7458
Definition: span.h:74
OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod(float a, float b)
Definition: fmath.h:1474
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
Definition: fmath.h:686
SYS_API float atan2f(float y, float x)
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
Definition: glew.h:12900
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: (a*b - c)
Definition: fmath.h:405
OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in)
Definition: fmath.h:1101
OIIO_HOSTDEVICE V round_to_multiple(V value, M multiple)
Definition: fmath.h:199
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int floor2(int x) noexcept
Definition: fmath.h:176
Integer 8-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1169
std::integral_constant< bool, std::numeric_limits< T >::is_signed||std::is_same< T, int128_t >::value > is_signed
Definition: format.h:787
OIIO_FORCEINLINE OIIO_HOSTDEVICE int ifloor(float x)
Return floor(x) cast to an int.
Definition: fmath.h:601
GLenum src
Definition: glcorearb.h:1792
OIIO_HOSTDEVICE int pow2roundup(int x)
Definition: fmath.h:191
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf(float x)
Definition: fmath.h:2145
GLdouble GLdouble t
Definition: glew.h:1403
void convert_type< float, uint8_t >(const float *src, uint8_t *dst, size_t n, uint8_t, uint8_t)
Definition: fmath.h:1001
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos(float x)
Definition: fmath.h:1587
constexpr size_type size() const noexcept
Definition: span.h:322
OIIO_HOSTDEVICE float floorfrac(float x, int *xint)
Definition: fmath.h:613
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s1
Definition: glew.h:12900
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh(float x)
Definition: fmath.h:2063
GLuint in
Definition: glew.h:11552
void set(I *data)
Definition: fmath.h:1283
SYS_API float log10f(float x)
GLint GLenum GLint x
Definition: glcorearb.h:408
void set(const I *data)
Definition: fmath.h:1302
GLubyte GLubyte GLubyte GLubyte w
Definition: glcorearb.h:856
ImageBuf OIIO_API pow(const ImageBuf &A, cspan< float > B, ROI roi={}, int nthreads=0)
GLdouble GLdouble GLdouble GLdouble q
Definition: glew.h:1419
vfloat4 floor(const vfloat4 &a)
Definition: simd.h:7404
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2(T x)
Safe log2: clamp to valid domain.
Definition: fmath.h:1425
OIIO_HOSTDEVICE int pow2rounddown(int x)
Definition: fmath.h:192
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE T rotl(T x, int s) noexcept
Definition: fmath.h:256
void convert_type(const S *src, D *dst, size_t n, D _min, D _max)
Definition: fmath.h:852
GLuint64EXT * result
Definition: glew.h:14311
SYS_API double asin(double x)
DataArrayProxy(I *data=NULL)
Definition: fmath.h:1279
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos(float x)
Definition: fmath.h:1727
Integer 4-vector, accelerated by SIMD instructions when available.
Definition: simd.h:882
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int ceil2(int x) noexcept
Definition: fmath.h:153
ImageBuf OIIO_API min(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
GLuint ybits
Definition: glew.h:12768
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan(float x)
Definition: fmath.h:1648
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf(float x)
Definition: fmath.h:2181
GLfloat GLfloat GLfloat v2
Definition: glcorearb.h:817
SYS_API float atanf(float x)
OIIO_HOSTDEVICE void trilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, const T *v4, const T *v5, const T *v6, const T *v7, Q s, Q t, Q r, Q scale, int n, T *result)
Definition: fmath.h:536
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast(const IN_TYPE &in)
Definition: fmath.h:737
#define OIIO_DASSERT
Definition: dassert.h:55
GLenum GLsizei len
Definition: glew.h:7782
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7531
const GLdouble * v
Definition: glcorearb.h:836
DataProxy(I &data)
Definition: fmath.h:1251
OIIO_FORCEINLINE OIIO_HOSTDEVICE T clamp(const T &a, const T &low, const T &high)
clamp a to bounds [low,high].
Definition: fmath.h:323
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1221
GLuint num
Definition: glew.h:2695
void bit_pack(cspan< T > data, void *out, int outbits)
Definition: fmath.h:1180
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
SYS_API void sincosf(float x, float *s, float *c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh(float x)
Definition: fmath.h:2040
OIIO_HOSTDEVICE void bilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, Q s, Q t, Q scale, int n, T *result)
Definition: fmath.h:479
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:847
OIIO_HOSTDEVICE T invert(Func &func, T y, T xmin=0.0, T xmax=1.0, int maxiters=32, T eps=1.0e-6, bool *brack=0)
Definition: fmath.h:2236
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
Definition: fmath.h:396
#define OIIO_HOSTDEVICE
Definition: platform.h:471
T operator()(unsigned char c) const noexcept
Definition: fmath.h:1319
void store(float *values) const
Definition: simd.h:6717
GLfloat GLfloat p
Definition: glew.h:16656
typedef int(WINAPI *PFNWGLRELEASEPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer
OIIO_HOSTDEVICE T round_to_multiple_of_pow2(T x, T m)
Definition: fmath.h:212
constexpr size_type size() const noexcept
Definition: span.h:186
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10(T x)
Safe log10: clamp to valid domain.
Definition: fmath.h:1443
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t rotl32(uint32_t x, int k)
Definition: fmath.h:281
#define M_1_PI
Definition: fmath.h:102
GLfloat GLfloat GLfloat GLfloat v3
Definition: glcorearb.h:818
SYS_API double acos(double x)
ConstDataProxy(const I &data)
Definition: fmath.h:1266
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool ispow2(T x) noexcept
Definition: fmath.h:140
OIIO_HOSTDEVICE float sign(float x)
Definition: fmath.h:715
vfloat4 round(const vfloat4 &a)
Definition: simd.h:7413
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log(T x)
Safe log: clamp to valid domain.
Definition: fmath.h:1434
#define M_PI_4
Definition: fmath.h:96
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2(const T &xval)
Definition: fmath.h:1921
OIIO_HOSTDEVICE float interpolate_linear(float x, span_strided< const float > y)
Definition: fmath.h:2288
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow(T x, T y)
Definition: fmath.h:1459
GLfloat v0
Definition: glcorearb.h:815
int floor(T x)
Definition: ImathFun.h:150
OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos(float x, float *sine, float *cosine)
Definition: fmath.h:1614
E operator*() const
Definition: fmath.h:1300
#define OIIO_CLANG_PRAGMA(UnQuotedPragma)
Definition: platform.h:260
#define OIIO_NODISCARD
Definition: platform.h:440
GLdouble n
Definition: glcorearb.h:2007
ConstDataArrayProxy(const I *data=NULL)
Definition: fmath.h:1299
GLboolean * data
Definition: glcorearb.h:130
Integer 16-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1465
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi(float x)
Definition: fmath.h:1718
GLuint GLfloat * val
Definition: glcorearb.h:1607
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb(float x)
Definition: fmath.h:1892
GLenum func
Definition: glcorearb.h:782
#define DASSERT_MSG
Definition: dassert.h:110
#define OIIO_UNLIKELY(x)
Definition: platform.h:343
OIIO_FORCEINLINE OIIO_HOSTDEVICE T lerp(const T &v0, const T &v1, const Q &x)
Definition: fmath.h:432
OIIO_FORCEINLINE OIIO_HOSTDEVICE T bilerp(const T &v0, const T &v1, const T &v2, const T &v3, const Q &s, const Q &t)
Definition: fmath.h:445
T log(const T &v)
Definition: simd.h:7665
E operator*() const
Definition: fmath.h:1280
void store(int *values) const
Store the values into memory.
Definition: simd.h:4187
OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint(float x)
Definition: fmath.h:1537
SYS_API float sinhf(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt(T x)
Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
Definition: fmath.h:1401
GLuint GLfloat x0
Definition: glew.h:12900
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1(float x)
Definition: fmath.h:2028
SYS_API float copysignf(float x, float y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float(int x)
Definition: fmath.h:784
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_neg(const T &x)
Definition: fmath.h:679
OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int(float x)
Definition: fmath.h:781
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt(float x)
Definition: fmath.h:2128
OIIO_HOSTDEVICE void float_to_rational(float f, unsigned int &num, unsigned int &den)
Definition: fmath.h:1339
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_mod(T a, T b)
Definition: fmath.h:301
void convert_type< uint8_t, float >(const uint8_t *src, float *dst, size_t n, float, float)
Definition: fmath.h:918
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
Definition: simd.h:4688
GA_API const UT_StringHolder N
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10(const T &x)
Definition: fmath.h:1879
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos(T x)
Safe (clamping) arccosine: clamp to the valid domain.
Definition: fmath.h:1416
const GLdouble * m
Definition: glew.h:9166
GLsizei const GLfloat * value
Definition: glcorearb.h:823
GLfloat f
Definition: glcorearb.h:1925
SYS_API float asinf(float x)
SYS_API float expm1f(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb(T x)
Safe logb: clamp to valid domain.
Definition: fmath.h:1452
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p(float x)
Definition: fmath.h:1905
OIIO_HOSTDEVICE D scaled_conversion(const S &src, F scale, F min, F max)
Definition: fmath.h:829
E operator[](int i) const
Definition: fmath.h:1301
void convert_type< uint16_t, float >(const uint16_t *src, float *dst, size_t n, float, float)
Definition: fmath.h:936
OIIO_FORCEINLINE OIIO_HOSTDEVICE T trilerp(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
Definition: fmath.h:497
GLfloat GLfloat v1
Definition: glcorearb.h:816
OIIO_HOSTDEVICE uint32_t clamped_mult32(uint32_t a, uint32_t b)
Definition: fmath.h:223
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:94
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t rotl64(uint64_t x, int k)
Definition: fmath.h:291
const ConstDataArrayProxy< I, E > & operator+=(int i)
Definition: fmath.h:1304
vint4 min(const vint4 &a, const vint4 &b)
Definition: simd.h:4811
OIIO_HOSTDEVICE void swap_endian(T *f, int len=1)
Definition: fmath.h:795
void bitstring_add_n_bits(T *&out, int &filled, uint32_t val, int n)
Definition: fmath.h:1132
Classes for SIMD processing.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh(float x)
Definition: fmath.h:2073
GLenum GLenum dst
Definition: glcorearb.h:1792
GLboolean r
Definition: glcorearb.h:1221
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin(float x)
Definition: fmath.h:1742
vint4 rint(const vfloat4 &a)
Definition: simd.h:7433
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians(T deg)
Convert degrees to radians.
Definition: fmath.h:652
void convert_type< float, uint16_t >(const float *src, uint16_t *dst, size_t n, uint16_t, uint16_t)
Definition: fmath.h:980
GLdouble s
Definition: glew.h:1395
#define M_LN10
Definition: fmath.h:117
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp(const T &x)
Definition: fmath.h:1992
Definition: half.h:91
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc(float x)
Definition: fmath.h:2169
SYS_API float tanhf(float x)
OIIO_HOSTDEVICE void bicubic_interp(const T **val, T s, T t, int n, T *result)
Definition: fmath.h:582
GLint y
Definition: glcorearb.h:102
OIIO_HOSTDEVICE uint64_t clamped_mult64(uint64_t a, uint64_t b)
Definition: fmath.h:235
SYS_API double sin(double x)
E operator[](int i) const
Definition: fmath.h:1281
float float_t
Definition: fmath.h:816
OIIO_HOSTDEVICE void evalBSplineWeightDerivs(T dw[4], T fraction)
Definition: fmath.h:564
E operator=(E newval)
Definition: fmath.h:1252
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt(T x)
Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
Definition: fmath.h:1395
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi(float x)
Definition: fmath.h:1680
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2(const T &xval)
Definition: fmath.h:1813
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:93
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees(T rad)
Convert radians to degrees.
Definition: fmath.h:656