HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fmath.h
Go to the documentation of this file.
1 /*
2  Copyright 2008-2014 Larry Gritz and the other authors and contributors.
3  All Rights Reserved.
4 
5  Redistribution and use in source and binary forms, with or without
6  modification, are permitted provided that the following conditions are
7  met:
8  * Redistributions of source code must retain the above copyright
9  notice, this list of conditions and the following disclaimer.
10  * Redistributions in binary form must reproduce the above copyright
11  notice, this list of conditions and the following disclaimer in the
12  documentation and/or other materials provided with the distribution.
13  * Neither the name of the software's owners nor the names of its
14  contributors may be used to endorse or promote products derived from
15  this software without specific prior written permission.
16  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28  (This is the Modified BSD License)
29 
30  A few bits here are based upon code from NVIDIA that was also released
31  under the same modified BSD license, and marked as:
32  Copyright 2004 NVIDIA Corporation. All Rights Reserved.
33 
34  Some parts of this file were first open-sourced in Open Shading Language,
35  then later moved here. The original copyright notice was:
36  Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
37 
38  Many of the math functions were copied from or inspired by other
39  public domain sources or open source packages with compatible licenses.
40  The individual functions give references were applicable.
41 */
42 
43 
44 // clang-format off
45 
46 /// \file
47 ///
48 /// A variety of floating-point math helper routines (and, slight
49 /// misnomer, some int stuff as well).
50 ///
51 
52 
53 #pragma once
54 #define OIIO_FMATH_H 1
55 
56 #include <algorithm>
57 #include <cmath>
58 #include <cstring>
59 #include <limits>
60 #include <typeinfo>
61 
62 #include <OpenImageIO/span.h>
63 #include <OpenImageIO/dassert.h>
66 #include <OpenImageIO/platform.h>
67 #include <OpenImageIO/simd.h>
68 
69 
71 
72 
73 /// Helper template to let us tell if two types are the same.
74 template<typename T, typename U> struct is_same { static const bool value = false; };
75 template<typename T> struct is_same<T,T> { static const bool value = true; };
76 
77 
78 
79 
80 ////////////////////////////////////////////////////////////////////////////
81 ////////////////////////////////////////////////////////////////////////////
82 //
83 // INTEGER HELPER FUNCTIONS
84 //
85 // A variety of handy functions that operate on integers.
86 //
87 
88 /// Quick test for whether an integer is a power of 2.
89 ///
90 template<typename T>
91 inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool
92 ispow2(T x) noexcept
93 {
94  // Numerous references for this bit trick are on the web. The
95  // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
96  // all 1 bits for bits < b.
97  return (x & (x - 1)) == 0 && (x >= 0);
98 }
99 
100 
101 
102 /// Round up to next higher power of 2 (return x if it's already a power
103 /// of 2).
104 inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
105 ceil2(int x) noexcept
106 {
107  // Here's a version with no loops.
108  if (x < 0)
109  return 0;
110  // Subtract 1, then round up to a power of 2, that way if we are
111  // already a power of 2, we end up with the same number.
112  --x;
113  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
114  x |= x >> 1;
115  x |= x >> 2;
116  x |= x >> 4;
117  x |= x >> 8;
118  x |= x >> 16;
119  // Now we have 2^n-1, by adding 1, we make it a power of 2 again
120  return x + 1;
121 }
122 
123 
124 
125 /// Round down to next lower power of 2 (return x if it's already a power
126 /// of 2).
127 inline OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int
128 floor2(int x) noexcept
129 {
130  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
131  x |= x >> 1;
132  x |= x >> 2;
133  x |= x >> 4;
134  x |= x >> 8;
135  x |= x >> 16;
136  // Strip off all but the high bit, i.e. 00011111 -> 00010000
137  // That's the power of two <= the original x
138  return x & ~(x >> 1);
139 }
140 
141 
142 // Old names -- DEPRECATED(2.1)
143 inline OIIO_HOSTDEVICE int pow2roundup(int x) { return ceil2(x); }
144 inline OIIO_HOSTDEVICE int pow2rounddown(int x) { return floor2(x); }
145 
146 
147 
148 /// Round value up to the next whole multiple.
149 /// For example, round_to_multiple(7,10) returns 10.
150 template <typename V, typename M>
151 inline OIIO_HOSTDEVICE V round_to_multiple (V value, M multiple)
152 {
153  return V (((value + V(multiple) - 1) / V(multiple)) * V(multiple));
154 }
155 
156 
157 
158 /// Round up to the next whole multiple of m, for the special case where
159 /// m is definitely a power of 2 (somewhat simpler than the more general
160 /// round_to_multiple). This is a template that should work for any
161 // integer type.
162 template<typename T>
163 inline OIIO_HOSTDEVICE T
165 {
166  DASSERT(ispow2(m));
167  return (x + m - 1) & (~(m - 1));
168 }
169 
170 
171 
172 /// Multiply two unsigned 32-bit ints safely, carefully checking for
173 /// overflow, and clamping to uint32_t's maximum value.
174 inline OIIO_HOSTDEVICE uint32_t
175 clamped_mult32(uint32_t a, uint32_t b)
176 {
177  const uint32_t Err = std::numeric_limits<uint32_t>::max();
178  uint64_t r = (uint64_t)a * (uint64_t)b; // Multiply into a bigger int
179  return r < Err ? (uint32_t)r : Err;
180 }
181 
182 
183 
184 /// Multiply two unsigned 64-bit ints safely, carefully checking for
185 /// overflow, and clamping to uint64_t's maximum value.
186 inline OIIO_HOSTDEVICE uint64_t
187 clamped_mult64(uint64_t a, uint64_t b)
188 {
189  uint64_t ab = a * b;
190  if (b && ab / b != a)
192  else
193  return ab;
194 }
195 
196 
197 
198 /// Bitwise circular rotation left by k bits (for 32 bit unsigned integers)
200 rotl32(uint32_t x, int k)
201 {
202 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
203  return __funnelshift_lc(x, x, k);
204 #else
205  return (x << k) | (x >> (32 - k));
206 #endif
207 }
208 
209 /// Bitwise circular rotation left by k bits (for 64 bit unsigned integers)
211 rotl64(uint64_t x, int k)
212 {
213  return (x << k) | (x >> (64 - k));
214 }
215 
216 
217 
218 // safe_mod is like integer a%b, but safely returns 0 when b==0.
219 template <class T>
220 inline OIIO_HOSTDEVICE T
221 safe_mod(T a, T b)
222 {
223  return b ? (a % b) : T(0);
224 }
225 
226 // (end of integer helper functions)
227 ////////////////////////////////////////////////////////////////////////////
228 
229 
230 
231 ////////////////////////////////////////////////////////////////////////////
232 ////////////////////////////////////////////////////////////////////////////
233 //
234 // FLOAT UTILITY FUNCTIONS
235 //
236 // Helper/utility functions: clamps, blends, interpolations...
237 //
238 
239 
240 /// clamp a to bounds [low,high].
241 template <class T>
242 inline OIIO_HOSTDEVICE T
243 clamp (const T& a, const T& low, const T& high)
244 {
245  return (a >= low) ? ((a <= high) ? a : high) : low;
246 }
247 
248 
249 #ifndef __CUDA_ARCH__
250 // Specialization of clamp for vfloat4
251 template<> inline simd::vfloat4
252 clamp (const simd::vfloat4& a, const simd::vfloat4& low, const simd::vfloat4& high)
253 {
254  return simd::min (high, simd::max (low, a));
255 }
256 
257 template<> inline simd::vfloat8
258 clamp (const simd::vfloat8& a, const simd::vfloat8& low, const simd::vfloat8& high)
259 {
260  return simd::min (high, simd::max (low, a));
261 }
262 #endif
263 
264 
265 
266 /// Fused multiply and add: (a*b + c)
267 inline OIIO_HOSTDEVICE float madd (float a, float b, float c) {
268 #if OIIO_FMA_ENABLED
269  // C++11 defines std::fma, which we assume is implemented using an
270  // intrinsic.
271  return std::fma (a, b, c);
272 #else
273  // NOTE: GCC/ICC will turn this (for float) into a FMA unless
274  // explicitly asked not to, clang will do so if -ffp-contract=fast.
275  return a * b + c;
276 #endif
277 }
278 
279 
280 /// Fused multiply and subtract: -(a*b - c)
281 inline OIIO_HOSTDEVICE float msub (float a, float b, float c) {
282  return a * b - c; // Hope for the best
283 }
284 
285 
286 
287 /// Fused negative multiply and add: -(a*b) + c
288 inline OIIO_HOSTDEVICE float nmadd (float a, float b, float c) {
289  return c - (a * b); // Hope for the best
290 }
291 
292 
293 
294 /// Negative fused multiply and subtract: -(a*b) - c
295 inline OIIO_HOSTDEVICE float nmsub (float a, float b, float c) {
296  return -(a * b) - c; // Hope for the best
297 }
298 
299 
300 
301 /// Linearly interpolate values v0-v1 at x: v0*(1-x) + v1*x.
302 /// This is a template, and so should work for any types.
303 template <class T, class Q>
304 inline OIIO_HOSTDEVICE T
305 lerp (const T& v0, const T& v1, const Q& x)
306 {
307  // NOTE: a*(1-x) + b*x is much more numerically stable than a+x*(b-a)
308  return v0*(Q(1)-x) + v1*x;
309 }
310 
311 
312 
313 /// Bilinearly interoplate values v0-v3 (v0 upper left, v1 upper right,
314 /// v2 lower left, v3 lower right) at coordinates (s,t) and return the
315 /// result. This is a template, and so should work for any types.
316 template <class T, class Q>
317 inline OIIO_HOSTDEVICE T
318 bilerp(const T& v0, const T& v1, const T& v2, const T& v3, const Q& s, const Q& t)
319 {
320  // NOTE: a*(t-1) + b*t is much more numerically stable than a+t*(b-a)
321  Q s1 = Q(1) - s;
322  return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
323 }
324 
325 
326 
327 /// Bilinearly interoplate arrays of values v0-v3 (v0 upper left, v1
328 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
329 /// storing the results in 'result'. These are all vectors, so do it
330 /// for each of 'n' contiguous values (using the same s,t interpolants).
331 template <class T, class Q>
332 inline OIIO_HOSTDEVICE void
333 bilerp (const T *v0, const T *v1,
334  const T *v2, const T *v3,
335  Q s, Q t, int n, T *result)
336 {
337  Q s1 = Q(1) - s;
338  Q t1 = Q(1) - t;
339  for (int i = 0; i < n; ++i)
340  result[i] = T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
341 }
342 
343 
344 
345 /// Bilinearly interoplate arrays of values v0-v3 (v0 upper left, v1
346 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
347 /// SCALING the interpolated value by 'scale' and then ADDING to
348 /// 'result'. These are all vectors, so do it for each of 'n'
349 /// contiguous values (using the same s,t interpolants).
350 template <class T, class Q>
351 inline OIIO_HOSTDEVICE void
352 bilerp_mad (const T *v0, const T *v1,
353  const T *v2, const T *v3,
354  Q s, Q t, Q scale, int n, T *result)
355 {
356  Q s1 = Q(1) - s;
357  Q t1 = Q(1) - t;
358  for (int i = 0; i < n; ++i)
359  result[i] += T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
360  t*(v2[i]*s1 + v3[i]*s)));
361 }
362 
363 
364 
365 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
366 /// upper right top, ...) at coordinates (s,t,r), and return the
367 /// result. This is a template, and so should work for any types.
368 template <class T, class Q>
369 inline OIIO_HOSTDEVICE T
370 trilerp (T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
371 {
372  // NOTE: a*(t-1) + b*t is much more numerically stable than a+t*(b-a)
373  Q s1 = Q(1) - s;
374  Q t1 = Q(1) - t;
375  Q r1 = Q(1) - r;
376  return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
377  r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
378 }
379 
380 
381 
382 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
383 /// upper right top, ...) at coordinates (s,t,r),
384 /// storing the results in 'result'. These are all vectors, so do it
385 /// for each of 'n' contiguous values (using the same s,t,r interpolants).
386 template <class T, class Q>
387 inline OIIO_HOSTDEVICE void
388 trilerp (const T *v0, const T *v1, const T *v2, const T *v3,
389  const T *v4, const T *v5, const T *v6, const T *v7,
390  Q s, Q t, Q r, int n, T *result)
391 {
392  Q s1 = Q(1) - s;
393  Q t1 = Q(1) - t;
394  Q r1 = Q(1) - r;
395  for (int i = 0; i < n; ++i)
396  result[i] = T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
397  r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
398 }
399 
400 
401 
402 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
403 /// upper right top, ...) at coordinates (s,t,r),
404 /// SCALING the interpolated value by 'scale' and then ADDING to
405 /// 'result'. These are all vectors, so do it for each of 'n'
406 /// contiguous values (using the same s,t,r interpolants).
407 template <class T, class Q>
408 inline OIIO_HOSTDEVICE void
409 trilerp_mad (const T *v0, const T *v1, const T *v2, const T *v3,
410  const T *v4, const T *v5, const T *v6, const T *v7,
411  Q s, Q t, Q r, Q scale, int n, T *result)
412 {
413  Q r1 = Q(1) - r;
414  bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
415  bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
416 }
417 
418 
419 
420 /// Evaluate B-spline weights in w[0..3] for the given fraction. This
421 /// is an important component of performing a cubic interpolation.
422 template <typename T>
423 inline OIIO_HOSTDEVICE void evalBSplineWeights (T w[4], T fraction)
424 {
425  T one_frac = 1 - fraction;
426  w[0] = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
427  w[1] = T(2.0 / 3.0) - T(0.5) * fraction * fraction * (2 - fraction);
428  w[2] = T(2.0 / 3.0) - T(0.5) * one_frac * one_frac * (2 - one_frac);
429  w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
430 }
431 
432 
433 /// Evaluate B-spline derivative weights in w[0..3] for the given
434 /// fraction. This is an important component of performing a cubic
435 /// interpolation with derivatives.
436 template <typename T>
437 inline OIIO_HOSTDEVICE void evalBSplineWeightDerivs (T dw[4], T fraction)
438 {
439  T one_frac = 1 - fraction;
440  dw[0] = -T(0.5) * one_frac * one_frac;
441  dw[1] = T(0.5) * fraction * (3 * fraction - 4);
442  dw[2] = -T(0.5) * one_frac * (3 * one_frac - 4);
443  dw[3] = T(0.5) * fraction * fraction;
444 }
445 
446 
447 
448 /// Bicubically interoplate arrays of pointers arranged in a 4x4 pattern
449 /// with val[0] pointing to the data in the upper left corner, val[15]
450 /// pointing to the lower right) at coordinates (s,t), storing the
451 /// results in 'result'. These are all vectors, so do it for each of
452 /// 'n' contiguous values (using the same s,t interpolants).
453 template <class T>
454 inline OIIO_HOSTDEVICE void
455 bicubic_interp (const T **val, T s, T t, int n, T *result)
456 {
457  for (int c = 0; c < n; ++c)
458  result[c] = T(0);
459  T wx[4]; evalBSplineWeights (wx, s);
460  T wy[4]; evalBSplineWeights (wy, t);
461  for (int j = 0; j < 4; ++j) {
462  for (int i = 0; i < 4; ++i) {
463  T w = wx[i] * wy[j];
464  for (int c = 0; c < n; ++c)
465  result[c] += w * val[j*4+i][c];
466  }
467  }
468 }
469 
470 
471 
472 /// Return floor(x) cast to an int.
473 inline OIIO_HOSTDEVICE int
474 ifloor (float x)
475 {
476  return (int)floorf(x);
477 }
478 
479 
480 
481 /// Return (x-floor(x)) and put (int)floor(x) in *xint. This is similar
482 /// to the built-in modf, but returns a true int, always rounds down
483 /// (compared to modf which rounds toward 0), and always returns
484 /// frac >= 0 (comapred to modf which can return <0 if x<0).
485 inline OIIO_HOSTDEVICE float
486 floorfrac (float x, int *xint)
487 {
488 #if 1
489  float f = std::floor(x);
490  *xint = int(f);
491  return x - f;
492 #else /* vvv This idiom is slower */
493  int i = ifloor(x);
494  *xint = i;
495  return x - static_cast<float>(i); // Return the fraction left over
496 #endif
497 }
498 
499 
500 #ifndef __CUDA_ARCH__
503  *xint = simd::vint4(f);
504  return x - f;
505 }
506 
509  *xint = simd::vint8(f);
510  return x - f;
511 }
512 
515  *xint = simd::vint16(f);
516  return x - f;
517 }
518 #endif
519 
520 
521 
522 
523 /// Convert degrees to radians.
524 template <typename T>
525 inline OIIO_HOSTDEVICE T radians (T deg) { return deg * T(M_PI / 180.0); }
526 
527 /// Convert radians to degrees
528 template <typename T>
529 inline OIIO_HOSTDEVICE T degrees (T rad) { return rad * T(180.0 / M_PI); }
530 
531 
532 
533 inline OIIO_HOSTDEVICE void
534 sincos (float x, float* sine, float* cosine)
535 {
536 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
537  __builtin_sincosf(x, sine, cosine);
538 #elif defined(__CUDA_ARCH__)
539  // Explicitly select the single-precision CUDA library function
540  sincosf(x, sine, cosine);
541 #else
542  *sine = std::sin(x);
543  *cosine = std::cos(x);
544 #endif
545 }
546 
547 inline OIIO_HOSTDEVICE void
548 sincos (double x, double* sine, double* cosine)
549 {
550 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
551  __builtin_sincos(x, sine, cosine);
552 #elif defined(__CUDA_ARCH__)
553  // Use of anonymous namespace resolves to the CUDA library function and
554  // avoids infinite recursion
555  ::sincos(x, sine, cosine);
556 #else
557  *sine = std::sin(x);
558  *cosine = std::cos(x);
559 #endif
560 }
561 
562 
563 inline OIIO_HOSTDEVICE float sign (float x)
564 {
565  return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
566 }
567 
568 
569 // (end of float helper functions)
570 ////////////////////////////////////////////////////////////////////////////
571 
572 
573 
574 ////////////////////////////////////////////////////////////////////////////
575 ////////////////////////////////////////////////////////////////////////////
576 //
577 // CONVERSION
578 //
579 // Type and range conversion helper functions and classes.
580 
581 
582 template <typename IN_TYPE, typename OUT_TYPE>
583 inline OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE in) {
584  // NOTE: this is the only standards compliant way of doing this type of casting,
585  // luckily the compilers we care about know how to optimize away this idiom.
586  OUT_TYPE out;
587  memcpy ((void *)&out, &in, sizeof(IN_TYPE));
588  return out;
589 }
590 
591 
592 inline OIIO_HOSTDEVICE int bitcast_to_int (float x) { return bit_cast<float,int>(x); }
593 inline OIIO_HOSTDEVICE float bitcast_to_float (int x) { return bit_cast<int,float>(x); }
594 
595 
596 
597 /// Change endian-ness of one or more data items that are each 2, 4,
598 /// or 8 bytes. This should work for any of short, unsigned short, int,
599 /// unsigned int, float, long long, pointers.
600 template<class T>
601 inline OIIO_HOSTDEVICE void
602 swap_endian (T *f, int len=1)
603 {
604  for (char *c = (char *) f; len--; c += sizeof(T)) {
605  if (sizeof(T) == 2) {
606  std::swap (c[0], c[1]);
607  } else if (sizeof(T) == 4) {
608  std::swap (c[0], c[3]);
609  std::swap (c[1], c[2]);
610  } else if (sizeof(T) == 8) {
611  std::swap (c[0], c[7]);
612  std::swap (c[1], c[6]);
613  std::swap (c[2], c[5]);
614  std::swap (c[3], c[4]);
615  }
616  }
617 }
618 
619 
620 
621 // big_enough_float<T>::float_t is a floating-point type big enough to
622 // handle the range and precision of a <T>. It's a float, unless T is big.
623 template <typename T> struct big_enough_float { typedef float float_t; };
624 template<> struct big_enough_float<int> { typedef double float_t; };
625 template<> struct big_enough_float<unsigned int> { typedef double float_t; };
626 template<> struct big_enough_float<int64_t> { typedef double float_t; };
627 template<> struct big_enough_float<uint64_t> { typedef double float_t; };
628 template<> struct big_enough_float<double> { typedef double float_t; };
629 
630 
631 /// Multiply src by scale, clamp to [min,max], and round to the nearest D
632 /// (presumed to be integer). This is just a helper for the convert_type
633 /// templates, it probably has no other use.
634 template<typename S, typename D, typename F>
635 inline OIIO_HOSTDEVICE D
636 scaled_conversion(const S& src, F scale, F min, F max)
637 {
638  if (std::numeric_limits<S>::is_signed) {
639  F s = src * scale;
640  s += (s < 0 ? (F)-0.5 : (F)0.5);
641  return (D)clamp(s, min, max);
642  } else {
643  return (D)clamp((F)src * scale + (F)0.5, min, max);
644  }
645 }
646 
647 
648 
649 /// Convert n consecutive values from the type of S to the type of D.
650 /// The conversion is not a simple cast, but correctly remaps the
651 /// 0.0->1.0 range from and to the full positive range of integral
652 /// types. Take a memcpy shortcut if both types are the same and no
653 /// conversion is necessary. Optional arguments can give nonstandard
654 /// quantizations.
655 //
656 // FIXME: make table-based specializations for common types with only a
657 // few possible src values (like unsigned char -> float).
658 template<typename S, typename D>
659 void convert_type (const S *src, D *dst, size_t n, D _min, D _max)
660 {
661  if (is_same<S,D>::value) {
662  // They must be the same type. Just memcpy.
663  memcpy (dst, src, n*sizeof(D));
664  return;
665  }
666  typedef typename big_enough_float<D>::float_t F;
667  F scale = std::numeric_limits<S>::is_integer ?
668  ((F)1.0)/std::numeric_limits<S>::max() : (F)1.0;
669  if (std::numeric_limits<D>::is_integer) {
670  // Converting to an integer-like type.
671  F min = (F)_min; // std::numeric_limits<D>::min();
672  F max = (F)_max; // std::numeric_limits<D>::max();
673  scale *= _max;
674  // Unroll loop for speed
675  for ( ; n >= 16; n -= 16) {
676  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
677  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
678  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
679  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
680  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
681  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
682  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
683  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
684  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
685  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
686  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
687  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
688  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
689  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
690  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
691  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
692  }
693  while (n--)
694  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
695  } else {
696  // Converting to a float-like type, so we don't need to remap
697  // the range
698  // Unroll loop for speed
699  for ( ; n >= 16; n -= 16) {
700  *dst++ = (D)((*src++) * scale);
701  *dst++ = (D)((*src++) * scale);
702  *dst++ = (D)((*src++) * scale);
703  *dst++ = (D)((*src++) * scale);
704  *dst++ = (D)((*src++) * scale);
705  *dst++ = (D)((*src++) * scale);
706  *dst++ = (D)((*src++) * scale);
707  *dst++ = (D)((*src++) * scale);
708  *dst++ = (D)((*src++) * scale);
709  *dst++ = (D)((*src++) * scale);
710  *dst++ = (D)((*src++) * scale);
711  *dst++ = (D)((*src++) * scale);
712  *dst++ = (D)((*src++) * scale);
713  *dst++ = (D)((*src++) * scale);
714  *dst++ = (D)((*src++) * scale);
715  *dst++ = (D)((*src++) * scale);
716  }
717  while (n--)
718  *dst++ = (D)((*src++) * scale);
719  }
720 }
721 
722 
723 #ifndef __CUDA_ARCH__
724 template<>
725 inline void convert_type<uint8_t,float> (const uint8_t *src,
726  float *dst, size_t n,
727  float _min, float _max)
728 {
730  simd::vfloat4 scale_simd (scale);
731  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
732  simd::vfloat4 s_simd (src);
733  simd::vfloat4 d_simd = s_simd * scale_simd;
734  d_simd.store (dst);
735  }
736  while (n--)
737  *dst++ = (*src++) * scale;
738 }
739 
740 
741 
742 template<>
743 inline void convert_type<uint16_t,float> (const uint16_t *src,
744  float *dst, size_t n,
745  float _min, float _max)
746 {
748  simd::vfloat4 scale_simd (scale);
749  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
750  simd::vfloat4 s_simd (src);
751  simd::vfloat4 d_simd = s_simd * scale_simd;
752  d_simd.store (dst);
753  }
754  while (n--)
755  *dst++ = (*src++) * scale;
756 }
757 
758 
759 #ifdef _HALF_H_
760 template<>
761 inline void convert_type<half,float> (const half *src,
762  float *dst, size_t n,
763  float _min, float _max)
764 {
765 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
766  // If f16c ops are enabled, it's worth doing this by 8's
767  for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
768  simd::vfloat8 s_simd (src);
769  s_simd.store (dst);
770  }
771 #endif
772 #if OIIO_SIMD >= 4
773  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
774  simd::vfloat4 s_simd (src);
775  s_simd.store (dst);
776  }
777 #endif
778  while (n--)
779  *dst++ = (*src++);
780 }
781 #endif
782 
783 
784 
785 template<>
786 inline void
787 convert_type<float,uint16_t> (const float *src, uint16_t *dst, size_t n,
788  uint16_t _min, uint16_t _max)
789 {
792  float scale = max;
793  simd::vfloat4 max_simd (max);
794  simd::vfloat4 zero_simd (0.0f);
795  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
796  simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
797  simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
798  simd::vint4 i (clamped);
799  i.store (dst);
800  }
801  while (n--)
802  *dst++ = scaled_conversion<float,uint16_t,float> (*src++, scale, min, max);
803 }
804 
805 
806 template<>
807 inline void
808 convert_type<float,uint8_t> (const float *src, uint8_t *dst, size_t n,
809  uint8_t _min, uint8_t _max)
810 {
813  float scale = max;
814  simd::vfloat4 max_simd (max);
815  simd::vfloat4 zero_simd (0.0f);
816  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
817  simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
818  simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
819  simd::vint4 i (clamped);
820  i.store (dst);
821  }
822  while (n--)
823  *dst++ = scaled_conversion<float,uint8_t,float> (*src++, scale, min, max);
824 }
825 
826 
827 #ifdef _HALF_H_
828 template<>
829 inline void
830 convert_type<float,half> (const float *src, half *dst, size_t n,
831  half _min, half _max)
832 {
833 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
834  // If f16c ops are enabled, it's worth doing this by 8's
835  for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
836  simd::vfloat8 s (src);
837  s.store (dst);
838  }
839 #endif
840 #if OIIO_SIMD >= 4
841  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
842  simd::vfloat4 s (src);
843  s.store (dst);
844  }
845 #endif
846  while (n--)
847  *dst++ = *src++;
848 }
849 #endif
850 #endif
851 
852 
853 
854 template<typename S, typename D>
855 inline void convert_type (const S *src, D *dst, size_t n)
856 {
857  convert_type<S,D> (src, dst, n,
860 }
861 
862 
863 
864 
865 /// Convert a single value from the type of S to the type of D.
866 /// The conversion is not a simple cast, but correctly remaps the
867 /// 0.0->1.0 range from and to the full positive range of integral
868 /// types. Take a copy shortcut if both types are the same and no
869 /// conversion is necessary.
870 template<typename S, typename D>
871 inline D
872 convert_type (const S &src)
873 {
874  if (is_same<S,D>::value) {
875  // They must be the same type. Just return it.
876  return (D)src;
877  }
878  typedef typename big_enough_float<D>::float_t F;
879  F scale = std::numeric_limits<S>::is_integer ?
880  ((F)1.0)/std::numeric_limits<S>::max() : (F)1.0;
881  if (std::numeric_limits<D>::is_integer) {
882  // Converting to an integer-like type.
885  scale *= max;
886  return scaled_conversion<S,D,F> (src, scale, min, max);
887  } else {
888  // Converting to a float-like type, so we don't need to remap
889  // the range
890  return (D)((F)src * scale);
891  }
892 }
893 
894 
895 
896 /// Helper function to convert channel values between different bit depths.
897 /// Roughly equivalent to:
898 ///
899 /// out = round (in * (pow (2, TO_BITS) - 1) / (pow (2, FROM_BITS) - 1));
900 ///
901 /// but utilizes an integer math trick for speed. It can be proven that the
902 /// absolute error of this method is less or equal to 1, with an average error
903 /// (with respect to the entire domain) below 0.2.
904 ///
905 /// It is assumed that the original value is a valid FROM_BITS integer, i.e.
906 /// shifted fully to the right.
907 template<unsigned int FROM_BITS, unsigned int TO_BITS>
908 inline OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in) {
909  unsigned int out = 0;
910  int shift = TO_BITS - FROM_BITS;
911  for (; shift > 0; shift -= FROM_BITS)
912  out |= in << shift;
913  out |= in >> -shift;
914  return out;
915 }
916 
917 
918 
919 // non-templated version. Slow but general
920 inline OIIO_HOSTDEVICE unsigned int
921 bit_range_convert(unsigned int in, unsigned int FROM_BITS, unsigned int TO_BITS)
922 {
923  unsigned int out = 0;
924  int shift = TO_BITS - FROM_BITS;
925  for (; shift > 0; shift -= FROM_BITS)
926  out |= in << shift;
927  out |= in >> -shift;
928  return out;
929 }
930 
931 
932 
933 /// A DataProxy<I,E> looks like an (E &), but it really holds an (I &)
934 /// and does conversions (via convert_type) as it reads in and out.
935 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
936 template<typename I, typename E>
937 struct DataProxy {
938  DataProxy (I &data) : m_data(data) { }
939  E operator= (E newval) { m_data = convert_type<E,I>(newval); return newval; }
940  operator E () const { return convert_type<I,E>(m_data); }
941 private:
942  DataProxy& operator = (const DataProxy&); // Do not implement
943  I &m_data;
944 };
945 
946 
947 
948 /// A ConstDataProxy<I,E> looks like a (const E &), but it really holds
949 /// a (const I &) and does conversions (via convert_type) as it reads.
950 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
951 template<typename I, typename E>
953  ConstDataProxy (const I &data) : m_data(data) { }
954  operator E () const { return convert_type<E,I>(*m_data); }
955 private:
956  const I &m_data;
957 };
958 
959 
960 
961 /// A DataArrayProxy<I,E> looks like an (E *), but it really holds an (I *)
962 /// and does conversions (via convert_type) as it reads in and out.
963 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
964 template<typename I, typename E>
966  DataArrayProxy (I *data=NULL) : m_data(data) { }
967  E operator* () const { return convert_type<I,E>(*m_data); }
968  E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
969  DataProxy<I,E> operator[] (int i) { return DataProxy<I,E> (m_data[i]); }
970  void set (I *data) { m_data = data; }
971  I * get () const { return m_data; }
973  m_data += i; return *this;
974  }
975 private:
976  I *m_data;
977 };
978 
979 
980 
981 /// A ConstDataArrayProxy<I,E> looks like an (E *), but it really holds an
982 /// (I *) and does conversions (via convert_type) as it reads in and out.
983 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
984 template<typename I, typename E>
986  ConstDataArrayProxy (const I *data=NULL) : m_data(data) { }
987  E operator* () const { return convert_type<I,E>(*m_data); }
988  E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
989  void set (const I *data) { m_data = data; }
990  const I * get () const { return m_data; }
992  m_data += i; return *this;
993  }
994 private:
995  const I *m_data;
996 };
997 
998 
999 
1000 /// Fast table-based conversion of 8-bit to other types. Declare this
1001 /// as static to avoid the expensive ctr being called all the time.
1002 template <class T=float>
1004 public:
1005  EightBitConverter () { init(); }
1006  T operator() (unsigned char c) const { return val[c]; }
1007 private:
1008  T val[256];
1009  void init () {
1010  float scale = 1.0f / 255.0f;
1011  if (std::numeric_limits<T>::is_integer)
1012  scale *= (float)std::numeric_limits<T>::max();
1013  for (int i = 0; i < 256; ++i)
1014  val[i] = (T)(i * scale);
1015  }
1016 };
1017 
1018 
1019 
1020 /// Simple conversion of a (presumably non-negative) float into a
1021 /// rational. This does not attempt to find the simplest fraction
1022 /// that approximates the float, for example 52.83 will simply
1023 /// return 5283/100. This does not attempt to gracefully handle
1024 /// floats that are out of range that could be easily int/int.
1025 inline OIIO_HOSTDEVICE void
1026 float_to_rational (float f, unsigned int &num, unsigned int &den)
1027 {
1028  if (f <= 0) { // Trivial case of zero, and handle all negative values
1029  num = 0;
1030  den = 1;
1031  } else if ((int)(1.0/f) == (1.0/f)) { // Exact results for perfect inverses
1032  num = 1;
1033  den = (int)f;
1034  } else {
1035  num = (int)f;
1036  den = 1;
1037  while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1038  den *= 10;
1039  f *= 10;
1040  num = (int)f;
1041  }
1042  }
1043 }
1044 
1045 
1046 
1047 /// Simple conversion of a float into a rational. This does not attempt
1048 /// to find the simplest fraction that approximates the float, for
1049 /// example 52.83 will simply return 5283/100. This does not attempt to
1050 /// gracefully handle floats that are out of range that could be easily
1051 /// int/int.
1052 inline OIIO_HOSTDEVICE void
1053 float_to_rational (float f, int &num, int &den)
1054 {
1055  unsigned int n, d;
1056  float_to_rational (fabsf(f), n, d);
1057  num = (f >= 0) ? (int)n : -(int)n;
1058  den = (int) d;
1059 }
1060 
1061 
1062 // (end of conversion helpers)
1063 ////////////////////////////////////////////////////////////////////////////
1064 
1065 
1066 
1067 
1068 ////////////////////////////////////////////////////////////////////////////
1069 ////////////////////////////////////////////////////////////////////////////
1070 //
1071 // SAFE MATH
1072 //
1073 // The functions named "safe_*" are versions with various internal clamps
1074 // or other deviations from IEEE standards with the specific intent of
1075 // never producing NaN or Inf values or throwing exceptions. But within the
1076 // valid range, they should be full precision and match IEEE standards.
1077 //
1078 
1079 
1080 /// Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
1081 template <typename T>
1083  return x >= T(0) ? std::sqrt(x) : T(0);
1084 }
1085 
1086 /// Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
1087 template <typename T>
1089  return x > T(0) ? T(1) / std::sqrt(x) : T(0);
1090 }
1091 
1092 
1093 /// Safe (clamping) arcsine: clamp to the valid domain.
1094 template <typename T>
1096  if (x <= T(-1)) return T(-M_PI_2);
1097  if (x >= T(+1)) return T(+M_PI_2);
1098  return std::asin(x);
1099 }
1100 
1101 /// Safe (clamping) arccosine: clamp to the valid domain.
1102 template <typename T>
1104  if (x <= T(-1)) return T(M_PI);
1105  if (x >= T(+1)) return T(0);
1106  return std::acos(x);
1107 }
1108 
1109 
1110 /// Safe log2: clamp to valid domain.
1111 template <typename T>
1113  // match clamping from fast version
1116  return std::log2(x);
1117 }
1118 
1119 /// Safe log: clamp to valid domain.
1120 template <typename T>
1122  // slightly different than fast version since clamping happens before scaling
1125  return std::log(x);
1126 }
1127 
1128 /// Safe log10: clamp to valid domain.
1129 template <typename T>
1131  // slightly different than fast version since clamping happens before scaling
1134  return log10f(x);
1135 }
1136 
1137 /// Safe logb: clamp to valid domain.
1138 template <typename T>
1140  return (x != T(0)) ? std::logb(x) : -std::numeric_limits<T>::max();
1141 }
1142 
1143 /// Safe pow: clamp the domain so it never returns Inf or NaN or has divide
1144 /// by zero error.
1145 template <typename T>
1146 inline OIIO_HOSTDEVICE T safe_pow (T x, T y) {
1147  if (y == T(0)) return T(1);
1148  if (x == T(0)) return T(0);
1149  // if x is negative, only deal with integer powers
1150  if ((x < T(0)) && (y != floor(y))) return T(0);
1151  // FIXME: this does not match "fast" variant because clamping limits are different
1152  T r = std::pow(x, y);
1153  // Clamp to avoid returning Inf.
1154  const T big = std::numeric_limits<T>::max();
1155  return clamp (r, -big, big);
1156 }
1157 
1158 // (end of safe_* functions)
1159 ////////////////////////////////////////////////////////////////////////////
1160 
1161 
1162 
1163 ////////////////////////////////////////////////////////////////////////////
1164 ////////////////////////////////////////////////////////////////////////////
1165 //
1166 // FAST & APPROXIMATE MATH
1167 //
1168 // The functions named "fast_*" provide a set of replacements to libm that
1169 // are much faster at the expense of some accuracy and robust handling of
1170 // extreme values. One design goal for these approximation was to avoid
1171 // branches as much as possible and operate on single precision values only
1172 // so that SIMD versions should be straightforward ports We also try to
1173 // implement "safe" semantics (ie: clamp to valid range where possible)
1174 // natively since wrapping these inline calls in another layer would be
1175 // wasteful.
1176 //
1177 // Some functions are fast_safe_*, which is both a faster approximation as
1178 // well as clamped input domain to ensure no NaN, Inf, or divide by zero.
1179 //
1180 // NB: When compiling for CUDA devices, selection of the 'fast' intrinsics
1181 // is influenced by compiler options (-ffast-math for clang/gcc,
1182 // --use-fast-math for NVCC).
1183 //
1184 // TODO: Quantify the performance and accuracy of the CUDA Math functions
1185 // relative to the definitions in this file. It may be better in some
1186 // cases to use the approximate versions defined below.
1187 
1188 
1189 /// Round to nearest integer, returning as an int.
1190 inline OIIO_HOSTDEVICE int fast_rint (float x) {
1191  // used by sin/cos/tan range reduction
1192 #if OIIO_SIMD_SSE >= 4
1193  // single roundps instruction on SSE4.1+ (for gcc/clang at least)
1194  return static_cast<int>(rintf(x));
1195 #else
1196  // emulate rounding by adding/substracting 0.5
1197  return static_cast<int>(x + copysignf(0.5f, x));
1198 #endif
1199 }
1200 
1201 #ifndef __CUDA_ARCH__
1203  return simd::rint (x);
1204 }
1205 #endif
1206 
1207 
1208 inline OIIO_HOSTDEVICE float fast_sin (float x) {
1209 #ifndef __CUDA_ARCH__
1210  // very accurate argument reduction from SLEEF
1211  // starts failing around x=262000
1212  // Results on: [-2pi,2pi]
1213  // Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp, 1.19209e-07 max error
1214  int q = fast_rint (x * float(M_1_PI));
1215  float qf = float(q);
1216  x = madd(qf, -0.78515625f*4, x);
1217  x = madd(qf, -0.00024187564849853515625f*4, x);
1218  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1219  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1220  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1221  float s = x * x;
1222  if ((q & 1) != 0) x = -x;
1223  // this polynomial approximation has very low error on [-pi/2,+pi/2]
1224  // 1.19209e-07 max error in total over [-2pi,+2pi]
1225  float u = 2.6083159809786593541503e-06f;
1226  u = madd(u, s, -0.0001981069071916863322258f);
1227  u = madd(u, s, +0.00833307858556509017944336f);
1228  u = madd(u, s, -0.166666597127914428710938f);
1229  u = madd(s, u * x, x);
1230  // For large x, the argument reduction can fail and the polynomial can be
1231  // evaluated with arguments outside the valid internal. Just clamp the bad
1232  // values away (setting to 0.0f means no branches need to be generated).
1233  if (fabsf(u) > 1.0f) u = 0.0f;
1234  return u;
1235 #else
1236  return __sinf(x);
1237 #endif
1238 }
1239 
1240 
1241 inline OIIO_HOSTDEVICE float fast_cos (float x) {
1242 #ifndef __CUDA_ARCH__
1243  // same argument reduction as fast_sin
1244  int q = fast_rint (x * float(M_1_PI));
1245  float qf = float(q);
1246  x = madd(qf, -0.78515625f*4, x);
1247  x = madd(qf, -0.00024187564849853515625f*4, x);
1248  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1249  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1250  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1251  float s = x * x;
1252  // polynomial from SLEEF's sincosf, max error is
1253  // 4.33127e-07 over [-2pi,2pi] (98% of values are "exact")
1254  float u = -2.71811842367242206819355e-07f;
1255  u = madd(u, s, +2.47990446951007470488548e-05f);
1256  u = madd(u, s, -0.00138888787478208541870117f);
1257  u = madd(u, s, +0.0416666641831398010253906f);
1258  u = madd(u, s, -0.5f);
1259  u = madd(u, s, +1.0f);
1260  if ((q & 1) != 0) u = -u;
1261  if (fabsf(u) > 1.0f) u = 0.0f;
1262  return u;
1263 #else
1264  return __cosf(x);
1265 #endif
1266 }
1267 
1268 
1269 inline OIIO_HOSTDEVICE void fast_sincos (float x, float* sine, float* cosine) {
1270 #ifndef __CUDA_ARCH__
1271  // same argument reduction as fast_sin
1272  int q = fast_rint (x * float(M_1_PI));
1273  float qf = float(q);
1274  x = madd(qf, -0.78515625f*4, x);
1275  x = madd(qf, -0.00024187564849853515625f*4, x);
1276  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1277  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1278  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1279  float s = x * x;
1280  // NOTE: same exact polynomials as fast_sin and fast_cos above
1281  if ((q & 1) != 0) x = -x;
1282  float su = 2.6083159809786593541503e-06f;
1283  su = madd(su, s, -0.0001981069071916863322258f);
1284  su = madd(su, s, +0.00833307858556509017944336f);
1285  su = madd(su, s, -0.166666597127914428710938f);
1286  su = madd(s, su * x, x);
1287  float cu = -2.71811842367242206819355e-07f;
1288  cu = madd(cu, s, +2.47990446951007470488548e-05f);
1289  cu = madd(cu, s, -0.00138888787478208541870117f);
1290  cu = madd(cu, s, +0.0416666641831398010253906f);
1291  cu = madd(cu, s, -0.5f);
1292  cu = madd(cu, s, +1.0f);
1293  if ((q & 1) != 0) cu = -cu;
1294  if (fabsf(su) > 1.0f) su = 0.0f;
1295  if (fabsf(cu) > 1.0f) cu = 0.0f;
1296  *sine = su;
1297  *cosine = cu;
1298 #else
1299  __sincosf(x, sine, cosine);
1300 #endif
1301 }
1302 
1303 // NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts becoming
1304 // really poor outside of this range because the reciprocal amplifies errors
1305 inline OIIO_HOSTDEVICE float fast_tan (float x) {
1306 #ifndef __CUDA_ARCH__
1307  // derived from SLEEF implementation
1308  // note that we cannot apply the "denormal crush" trick everywhere because
1309  // we sometimes need to take the reciprocal of the polynomial
1310  int q = fast_rint (x * float(2 * M_1_PI));
1311  float qf = float(q);
1312  x = madd(qf, -0.78515625f*2, x);
1313  x = madd(qf, -0.00024187564849853515625f*2, x);
1314  x = madd(qf, -3.7747668102383613586e-08f*2, x);
1315  x = madd(qf, -1.2816720341285448015e-12f*2, x);
1316  if ((q & 1) == 0)
1317  x = float(M_PI_4) - (float(M_PI_4) - x); // crush denormals (only if we aren't inverting the result later)
1318  float s = x * x;
1319  float u = 0.00927245803177356719970703f;
1320  u = madd(u, s, 0.00331984995864331722259521f);
1321  u = madd(u, s, 0.0242998078465461730957031f);
1322  u = madd(u, s, 0.0534495301544666290283203f);
1323  u = madd(u, s, 0.133383005857467651367188f);
1324  u = madd(u, s, 0.333331853151321411132812f);
1325  u = madd(s, u * x, x);
1326  if ((q & 1) != 0) u = -1.0f / u;
1327  return u;
1328 #else
1329  return __tanf(x);
1330 #endif
1331 }
1332 
1333 /// Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
1334 /// Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
1335 /// Note that this is MUCH faster, but much less accurate than fast_sin.
1336 inline OIIO_HOSTDEVICE float fast_sinpi (float x)
1337 {
1338 #ifndef __CUDA_ARCH__
1339  // Fast trick to strip the integral part off, so our domain is [-1,1]
1340  const float z = x - ((x + 25165824.0f) - 25165824.0f);
1341  const float y = z - z * fabsf(z);
1342  const float Q = 3.10396624f;
1343  const float P = 3.584135056f; // P = 16-4*Q
1344  return y * (Q + P * fabsf(y));
1345  /* The original article used used inferior constants for Q and P and
1346  * so had max error 1.091e-3.
1347  *
1348  * The optimal value for Q was determined by exhaustive search, minimizing
1349  * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
1350  * over the interval [0,2] (which is where most of the invocations happen).
1351  *
1352  * The basic idea of this approximation starts with the coarse approximation:
1353  * sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
1354  *
1355  * This approximation always _over_ estimates the target. On the otherhand, the
1356  * curve:
1357  * sin(pi*x) ~= f(x) * abs(f(x)) / 4
1358  *
1359  * always lies _under_ the target. Thus we can simply numerically search for the
1360  * optimal constant to LERP these curves into a more precise approximation.
1361  * After folding the constants together and simplifying the resulting math, we
1362  * end up with the compact implementation below.
1363  *
1364  * NOTE: this function actually computes sin(x * pi) which avoids one or two
1365  * mults in many cases and guarantees exact values at integer periods.
1366  */
1367 #else
1368  return sinpif(x);
1369 #endif
1370 }
1371 
1372 /// Fast approximate cos(x*M_PI) with ~0.1% absolute error.
1373 /// Note that this is MUCH faster, but much less accurate than fast_cos.
1374 inline OIIO_HOSTDEVICE float fast_cospi (float x)
1375 {
1376 #ifndef __CUDA_ARCH__
1377  return fast_sinpi (x+0.5f);
1378 #else
1379  return cospif(x);
1380 #endif
1381 }
1382 
1383 inline OIIO_HOSTDEVICE float fast_acos (float x) {
1384 #ifndef __CUDA_ARCH__
1385  const float f = fabsf(x);
1386  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1387  // based on http://www.pouet.net/topic.php?which=9132&page=2
1388  // 85% accurate (ulp 0)
1389  // Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
1390  // Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
1391  const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1392  return x < 0 ? float(M_PI) - a : a;
1393 #else
1394  return acosf(x);
1395 #endif
1396 }
1397 
1398 inline OIIO_HOSTDEVICE float fast_asin (float x) {
1399 #ifndef __CUDA_ARCH__
1400  // based on acosf approximation above
1401  // max error is 4.51133e-05 (ulps are higher because we are consistently off by a little amount)
1402  const float f = fabsf(x);
1403  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1404  const float a = float(M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1405  return copysignf(a, x);
1406 #else
1407  return asinf(x);
1408 #endif
1409 }
1410 
1411 inline OIIO_HOSTDEVICE float fast_atan (float x) {
1412 #ifndef __CUDA_ARCH__
1413  const float a = fabsf(x);
1414  const float k = a > 1.0f ? 1 / a : a;
1415  const float s = 1.0f - (1.0f - k); // crush denormals
1416  const float t = s * s;
1417  // http://mathforum.org/library/drmath/view/62672.html
1418  // the coefficients were tuned in mathematica with the assumption that we want atan(1)=pi/4
1419  // (slightly higher error but no discontinuities)
1420  // Examined 4278190080 values of atan: 2.53989068 avg ulp diff, 315 max ulp, 9.17912e-06 max error // (with denormals)
1421  // Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 9.17912e-06 max error // (crush denormals)
1422  float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1423  if (a > 1.0f) r = 1.570796326794896557998982f - r;
1424  return copysignf(r, x);
1425 #else
1426  return atanf(x);
1427 #endif
1428 }
1429 
1430 inline OIIO_HOSTDEVICE float fast_atan2 (float y, float x) {
1431 #ifndef __CUDA_ARCH__
1432  // based on atan approximation above
1433  // the special cases around 0 and infinity were tested explicitly
1434  // the only case not handled correctly is x=NaN,y=0 which returns 0 instead of nan
1435  const float a = fabsf(x);
1436  const float b = fabsf(y);
1437 
1438  const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
1439  const float s = 1.0f - (1.0f - k); // crush denormals
1440  const float t = s * s;
1441 
1442  float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1443 
1444  if (b > a) r = 1.570796326794896557998982f - r; // account for arg reduction
1445  if (bit_cast<float, unsigned>(x) & 0x80000000u) // test sign bit of x
1446  r = float(M_PI) - r;
1447  return copysignf(r, y);
1448 #else
1449  return atan2f(y, x);
1450 #endif
1451 }
1452 
1453 
1454 template<typename T>
1455 inline OIIO_HOSTDEVICE T fast_log2 (const T& xval) {
1456  using namespace simd;
1457  typedef typename T::int_t intN;
1458  // See float fast_log2 for explanations
1460  intN bits = bitcast_to_int(x);
1461  intN exponent = srl (bits, 23) - intN(127);
1462  T f = bitcast_to_float ((bits & intN(0x007FFFFF)) | intN(0x3f800000)) - T(1.0f);
1463  T f2 = f * f;
1464  T f4 = f2 * f2;
1465  T hi = madd(f, T(-0.00931049621349f), T( 0.05206469089414f));
1466  T lo = madd(f, T( 0.47868480909345f), T(-0.72116591947498f));
1467  hi = madd(f, hi, T(-0.13753123777116f));
1468  hi = madd(f, hi, T( 0.24187369696082f));
1469  hi = madd(f, hi, T(-0.34730547155299f));
1470  lo = madd(f, lo, T( 1.442689881667200f));
1471  return ((f4 * hi) + (f * lo)) + T(exponent);
1472 }
1473 
1474 
1475 template<>
1476 inline OIIO_HOSTDEVICE float fast_log2 (const float& xval) {
1477 #ifndef __CUDA_ARCH__
1478  // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
1480  // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
1481  unsigned bits = bit_cast<float, unsigned>(x);
1482  int exponent = int(bits >> 23) - 127;
1483  float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1484  // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
1485  // ulp histogram:
1486  // 0 = 97.46%
1487  // 1 = 2.29%
1488  // 2 = 0.11%
1489  float f2 = f * f;
1490  float f4 = f2 * f2;
1491  float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
1492  float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
1493  hi = madd(f, hi, -0.13753123777116f);
1494  hi = madd(f, hi, 0.24187369696082f);
1495  hi = madd(f, hi, -0.34730547155299f);
1496  lo = madd(f, lo, 1.442689881667200f);
1497  return ((f4 * hi) + (f * lo)) + exponent;
1498 #else
1499  return __log2f(xval);
1500 #endif
1501 }
1502 
1503 
1504 
1505 template<typename T>
1506 inline OIIO_HOSTDEVICE T fast_log (const T& x) {
1507  // Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]: 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error
1508  return fast_log2(x) * T(M_LN2);
1509 }
1510 
1511 #ifdef __CUDA_ARCH__
1512 template<>
1513 inline OIIO_HOSTDEVICE float fast_log(const float& x)
1514 {
1515  return __logf(x);
1516 }
1517 #endif
1518 
1519 
1520 template<typename T>
1521 inline OIIO_HOSTDEVICE T fast_log10 (const T& x) {
1522  // Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]: 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error
1523  return fast_log2(x) * T(M_LN2 / M_LN10);
1524 }
1525 
1526 #ifdef __CUDA_ARCH__
1527 template<>
1528 inline OIIO_HOSTDEVICE float fast_log10(const float& x)
1529 {
1530  return __log10f(x);
1531 }
1532 #endif
1533 
1534 inline OIIO_HOSTDEVICE float fast_logb (float x) {
1535 #ifndef __CUDA_ARCH__
1536  // don't bother with denormals
1537  x = fabsf(x);
1540  unsigned bits = bit_cast<float, unsigned>(x);
1541  return float (int(bits >> 23) - 127);
1542 #else
1543  return logbf(x);
1544 #endif
1545 }
1546 
1547 inline OIIO_HOSTDEVICE float fast_log1p (float x) {
1548 #ifndef __CUDA_ARCH__
1549  if (fabsf(x) < 0.01f) {
1550  float y = 1.0f - (1.0f - x); // crush denormals
1551  return copysignf(madd(-0.5f, y * y, y), x);
1552  } else {
1553  return fast_log(x + 1);
1554  }
1555 #else
1556  return log1pf(x);
1557 #endif
1558 }
1559 
1560 
1561 
1562 template<typename T>
1563 inline OIIO_HOSTDEVICE T fast_exp2 (const T& xval) {
1564  using namespace simd;
1565  typedef typename T::int_t intN;
1566 #if OIIO_SIMD_SSE
1567  // See float specialization for explanations
1568  T x = clamp (xval, T(-126.0f), T(126.0f));
1569  intN m (x); x -= T(m);
1570  T one (1.0f);
1571  x = one - (one - x); // crush denormals (does not affect max ulps!)
1572  const T kA (1.33336498402e-3f);
1573  const T kB (9.810352697968e-3f);
1574  const T kC (5.551834031939e-2f);
1575  const T kD (0.2401793301105f);
1576  const T kE (0.693144857883f);
1577  T r (kA);
1578  r = madd(x, r, kB);
1579  r = madd(x, r, kC);
1580  r = madd(x, r, kD);
1581  r = madd(x, r, kE);
1582  r = madd(x, r, one);
1583  return bitcast_to_float (bitcast_to_int(r) + (m << 23));
1584 #else
1585  T r;
1586  for (int i = 0; i < r.elements; ++i)
1587  r[i] = fast_exp2(xval[i]);
1588  for (int i = r.elements; i < r.paddedelements; ++i)
1589  r[i] = 0.0f;
1590  return r;
1591 #endif
1592 }
1593 
1594 
1595 template<>
1596 inline OIIO_HOSTDEVICE float fast_exp2 (const float& xval) {
1597 #ifndef __CUDA_ARCH__
1598  // clamp to safe range for final addition
1599  float x = clamp (xval, -126.0f, 126.0f);
1600  // range reduction
1601  int m = int(x); x -= m;
1602  x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
1603  // 5th degree polynomial generated with sollya
1604  // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
1605  // ulp histogram:
1606  // 0 = 87.81%
1607  // 1 = 4.18%
1608  float r = 1.33336498402e-3f;
1609  r = madd(x, r, 9.810352697968e-3f);
1610  r = madd(x, r, 5.551834031939e-2f);
1611  r = madd(x, r, 0.2401793301105f);
1612  r = madd(x, r, 0.693144857883f);
1613  r = madd(x, r, 1.0f);
1614  // multiply by 2 ^ m by adding in the exponent
1615  // NOTE: left-shift of negative number is undefined behavior
1616  return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
1617 #else
1618  return exp2f(xval);
1619 #endif
1620 }
1621 
1622 
1623 
1624 
1625 template <typename T>
1626 inline OIIO_HOSTDEVICE T fast_exp (const T& x) {
1627  // Examined 2237485550 values of exp on [-87.3300018,87.3300018]: 2.6666452 avg ulp diff, 230 max ulp
1628  return fast_exp2(x * T(1 / M_LN2));
1629 }
1630 
1631 #ifdef __CUDA_ARCH__
1632 template<>
1633 inline OIIO_HOSTDEVICE float fast_exp (const float& x) {
1634  return __expf(x);
1635 }
1636 #endif
1637 
1638 
1639 
1640 /// Faster float exp than is in libm, but still 100% accurate
1641 inline OIIO_HOSTDEVICE float fast_correct_exp (float x)
1642 {
1643 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
1644  // On x86_64, versions of glibc < 2.16 have an issue where expf is
1645  // much slower than the double version. This was fixed in glibc 2.16.
1646  return static_cast<float>(std::exp(static_cast<double>(x)));
1647 #else
1648  return std::exp(x);
1649 #endif
1650 }
1651 
1652 
1653 inline OIIO_HOSTDEVICE float fast_exp10 (float x) {
1654 #ifndef __CUDA_ARCH__
1655  // Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: 2.71732409 avg ulp diff, 232 max ulp
1656  return fast_exp2(x * float(M_LN10 / M_LN2));
1657 #else
1658  return __exp10f(x);
1659 #endif
1660 }
1661 
1662 inline OIIO_HOSTDEVICE float fast_expm1 (float x) {
1663 #ifndef __CUDA_ARCH__
1664  if (fabsf(x) < 0.03f) {
1665  float y = 1.0f - (1.0f - x); // crush denormals
1666  return copysignf(madd(0.5f, y * y, y), x);
1667  } else
1668  return fast_exp(x) - 1.0f;
1669 #else
1670  return expm1f(x);
1671 #endif
1672 }
1673 
1674 inline OIIO_HOSTDEVICE float fast_sinh (float x) {
1675 #ifndef __CUDA_ARCH__
1676  float a = fabsf(x);
1677  if (a > 1.0f) {
1678  // Examined 53389559 values of sinh on [1,87.3300018]: 33.6886442 avg ulp diff, 178 max ulp
1679  float e = fast_exp(a);
1680  return copysignf(0.5f * e - 0.5f / e, x);
1681  } else {
1682  a = 1.0f - (1.0f - a); // crush denorms
1683  float a2 = a * a;
1684  // degree 7 polynomial generated with sollya
1685  // Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error
1686  float r = 2.03945513931e-4f;
1687  r = madd(r, a2, 8.32990277558e-3f);
1688  r = madd(r, a2, 0.1666673421859f);
1689  r = madd(r * a, a2, a);
1690  return copysignf(r, x);
1691  }
1692 #else
1693  return sinhf(x);
1694 #endif
1695 }
1696 
1697 inline OIIO_HOSTDEVICE float fast_cosh (float x) {
1698 #ifndef __CUDA_ARCH__
1699  // Examined 2237485550 values of cosh on [-87.3300018,87.3300018]: 1.78256726 avg ulp diff, 178 max ulp
1700  float e = fast_exp(fabsf(x));
1701  return 0.5f * e + 0.5f / e;
1702 #else
1703  return coshf(x);
1704 #endif
1705 }
1706 
1707 inline OIIO_HOSTDEVICE float fast_tanh (float x) {
1708 #ifndef __CUDA_ARCH__
1709  // Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]: 3.12924e-06 max error
1710  // NOTE: ulp error is high because of sub-optimal handling around the origin
1711  float e = fast_exp(2.0f * fabsf(x));
1712  return copysignf(1 - 2 / (1 + e), x);
1713 #else
1714  return tanhf(x);
1715 #endif
1716 }
1717 
1718 inline OIIO_HOSTDEVICE float fast_safe_pow (float x, float y) {
1719  if (y == 0) return 1.0f; // x^0=1
1720  if (x == 0) return 0.0f; // 0^y=0
1721  // be cheap & exact for special case of squaring and identity
1722  if (y == 1.0f)
1723  return x;
1724  if (y == 2.0f) {
1725 #ifndef __CUDA_ARCH__
1726  return std::min (x*x, std::numeric_limits<float>::max());
1727 #else
1728  return fminf (x*x, std::numeric_limits<float>::max());
1729 #endif
1730  }
1731  float sign = 1.0f;
1732  if (x < 0) {
1733  // if x is negative, only deal with integer powers
1734  // powf returns NaN for non-integers, we will return 0 instead
1735  int ybits = bit_cast<float, int>(y) & 0x7fffffff;
1736  if (ybits >= 0x4b800000) {
1737  // always even int, keep positive
1738  } else if (ybits >= 0x3f800000) {
1739  // bigger than 1, check
1740  int k = (ybits >> 23) - 127; // get exponent
1741  int j = ybits >> (23 - k); // shift out possible fractional bits
1742  if ((j << (23 - k)) == ybits) // rebuild number and check for a match
1743  sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
1744  else
1745  return 0.0f; // not integer
1746  } else {
1747  return 0.0f; // not integer
1748  }
1749  }
1750  return sign * fast_exp2(y * fast_log2(fabsf(x)));
1751 }
1752 
1753 
1754 // Fast simd pow that only needs to work for positive x
1755 template<typename T, typename U>
1756 inline OIIO_HOSTDEVICE T fast_pow_pos (const T& x, const U& y) {
1757  return fast_exp2(y * fast_log2(x));
1758 }
1759 
1760 
1761 // Fast cube root (performs better that using fast_pow's above with y=1/3)
1762 inline OIIO_HOSTDEVICE float fast_cbrt (float x) {
1763 #ifndef __CUDA_ARCH__
1764  float x0 = fabsf(x);
1765  // from hacker's delight
1766  float a = bit_cast<int, float>(0x2a5137a0 + bit_cast<float, int>(x0) / 3); // Initial guess.
1767  // Examined 14272478 values of cbrt on [-9.99999935e-39,9.99999935e-39]: 8.14687e-14 max error
1768  // Examined 2131958802 values of cbrt on [9.99999935e-39,3.40282347e+38]: 2.46930719 avg ulp diff, 12 max ulp
1769  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step.
1770  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step again.
1771  a = (x0 == 0) ? 0 : a; // Fix 0 case
1772  return copysignf(a, x);
1773 #else
1774  return cbrtf(x);
1775 #endif
1776 }
1777 
1778 
1779 inline OIIO_HOSTDEVICE float fast_erf (float x)
1780 {
1781 #ifndef __CUDA_ARCH__
1782  // Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error
1783  // Abramowitz and Stegun, 7.1.28
1784  const float a1 = 0.0705230784f;
1785  const float a2 = 0.0422820123f;
1786  const float a3 = 0.0092705272f;
1787  const float a4 = 0.0001520143f;
1788  const float a5 = 0.0002765672f;
1789  const float a6 = 0.0000430638f;
1790  const float a = fabsf(x);
1791  const float b = 1.0f - (1.0f - a); // crush denormals
1792  const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
1793  const float s = r * r; // ^2
1794  const float t = s * s; // ^4
1795  const float u = t * t; // ^8
1796  const float v = u * u; // ^16
1797  return copysignf(1.0f - 1.0f / v, x);
1798 #else
1799  return erff(x);
1800 #endif
1801 }
1802 
1803 inline OIIO_HOSTDEVICE float fast_erfc (float x)
1804 {
1805 #ifndef __CUDA_ARCH__
1806  // Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error
1807  // ulp histogram:
1808  // 0 = 80.30%
1809  return 1.0f - fast_erf(x);
1810 #else
1811  return erfcf(x);
1812 #endif
1813 }
1814 
1815 inline OIIO_HOSTDEVICE float fast_ierf (float x)
1816 {
1817  // from: Approximating the erfinv function by Mike Giles
1818  // to avoid trouble at the limit, clamp input to 1-eps
1819  float a = fabsf(x); if (a > 0.99999994f) a = 0.99999994f;
1820  float w = -fast_log((1.0f - a) * (1.0f + a)), p;
1821  if (w < 5.0f) {
1822  w = w - 2.5f;
1823  p = 2.81022636e-08f;
1824  p = madd(p, w, 3.43273939e-07f);
1825  p = madd(p, w, -3.5233877e-06f );
1826  p = madd(p, w, -4.39150654e-06f);
1827  p = madd(p, w, 0.00021858087f );
1828  p = madd(p, w, -0.00125372503f );
1829  p = madd(p, w, -0.00417768164f );
1830  p = madd(p, w, 0.246640727f );
1831  p = madd(p, w, 1.50140941f );
1832  } else {
1833  w = sqrtf(w) - 3.0f;
1834  p = -0.000200214257f;
1835  p = madd(p, w, 0.000100950558f);
1836  p = madd(p, w, 0.00134934322f );
1837  p = madd(p, w, -0.00367342844f );
1838  p = madd(p, w, 0.00573950773f );
1839  p = madd(p, w, -0.0076224613f );
1840  p = madd(p, w, 0.00943887047f );
1841  p = madd(p, w, 1.00167406f );
1842  p = madd(p, w, 2.83297682f );
1843  }
1844  return p * x;
1845 }
1846 
1847 // (end of fast* functions)
1848 ////////////////////////////////////////////////////////////////////////////
1849 
1850 
1851 
1852 
1853 ////////////////////////////////////////////////////////////////////////////
1854 ////////////////////////////////////////////////////////////////////////////
1855 //
1856 // MISCELLANEOUS NUMERICAL METHODS
1857 //
1858 
1859 
1860 /// Solve for the x for which func(x) == y on the interval [xmin,xmax].
1861 /// Use a maximum of maxiter iterations, and stop any time the remaining
1862 /// search interval or the function evaluations <= eps. If brack is
1863 /// non-NULL, set it to true if y is in [f(xmin), f(xmax)], otherwise
1864 /// false (in which case the caller should know that the results may be
1865 /// unreliable. Results are undefined if the function is not monotonic
1866 /// on that interval or if there are multiple roots in the interval (it
1867 /// may not converge, or may converge to any of the roots without
1868 /// telling you that there are more than one).
1869 template<class T, class Func>
1870 T invert (Func &func, T y, T xmin=0.0, T xmax=1.0,
1871  int maxiters=32, T eps=1.0e-6, bool *brack=0)
1872 {
1873  // Use the Regula Falsi method, falling back to bisection if it
1874  // hasn't converged after 3/4 of the maximum number of iterations.
1875  // See, e.g., Numerical Recipes for the basic ideas behind both
1876  // methods.
1877  T v0 = func(xmin), v1 = func(xmax);
1878  T x = xmin, v = v0;
1879  bool increasing = (v0 < v1);
1880  T vmin = increasing ? v0 : v1;
1881  T vmax = increasing ? v1 : v0;
1882  bool bracketed = (y >= vmin && y <= vmax);
1883  if (brack)
1884  *brack = bracketed;
1885  if (! bracketed) {
1886  // If our bounds don't bracket the zero, just give up, and
1887  // return the approprate "edge" of the interval
1888  return ((y < vmin) == increasing) ? xmin : xmax;
1889  }
1890  if (fabs(v0-v1) < eps) // already close enough
1891  return x;
1892  int rfiters = (3*maxiters)/4; // how many times to try regula falsi
1893  for (int iters = 0; iters < maxiters; ++iters) {
1894  T t; // interpolation factor
1895  if (iters < rfiters) {
1896  // Regula falsi
1897  t = (y-v0)/(v1-v0);
1898  if (t <= T(0) || t >= T(1))
1899  t = T(0.5); // RF convergence failure -- bisect instead
1900  } else {
1901  t = T(0.5); // bisection
1902  }
1903  x = lerp (xmin, xmax, t);
1904  v = func(x);
1905  if ((v < y) == increasing) {
1906  xmin = x; v0 = v;
1907  } else {
1908  xmax = x; v1 = v;
1909  }
1910  if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
1911  return x; // converged
1912  }
1913  return x;
1914 }
1915 
1916 
1917 
1918 /// Linearly interpolate a list of evenly-spaced knots y[0..len-1] with
1919 /// y[0] corresponding to the value at x==0.0 and y[len-1] corresponding to
1920 /// x==1.0.
1921 inline OIIO_HOSTDEVICE float
1923 {
1924 #ifndef __CUDA_ARCH__
1925  DASSERT_MSG (y.size() >= 2, "interpolate_linear needs at least 2 knot values (%zd)", y.size());
1926 #endif
1927  x = clamp (x, float(0.0), float(1.0));
1928  int nsegs = int(y.size()) - 1;
1929  int segnum;
1930  x = floorfrac (x*nsegs, &segnum);
1931 #ifndef __CUDA_ARCH__
1932  int nextseg = std::min (segnum+1, nsegs);
1933 #else
1934  int nextseg = min (segnum+1, nsegs);
1935 #endif
1936  return lerp (y[segnum], y[nextseg], x);
1937 }
1938 
1939 // (end miscellaneous numerical methods)
1940 ////////////////////////////////////////////////////////////////////////////
1941 
1942 
1943 
GLdouble s
Definition: glew.h:1390
SYS_API float acosf(float x)
OIIO_HOSTDEVICE T safe_inversesqrt(T x)
Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
Definition: fmath.h:1088
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:4703
SYS_API double cos(double x)
Definition: SYS_FPUMath.h:69
OIIO_HOSTDEVICE int fast_rint(float x)
Round to nearest integer, returning as an int.
Definition: fmath.h:1190
SYS_API float coshf(float x)
GLenum src
Definition: glew.h:2410
OIIO_HOSTDEVICE float fast_cos(float x)
Definition: fmath.h:1241
#define OIIO_FORCEINLINE
Definition: platform.h:249
SYS_API float log1pf(float x)
vfloat4 bitcast_to_float(const vint4 &x)
Definition: simd.h:6989
#define M_1_PI
1/PI
Definition: missing_math.h:68
OIIO_HOSTDEVICE float fast_correct_exp(float x)
Faster float exp than is in libm, but still 100% accurate.
Definition: fmath.h:1641
OIIO_HOSTDEVICE float nmadd(float a, float b, float c)
Fused negative multiply and add: -(a*b) + c.
Definition: fmath.h:288
OIIO_HOSTDEVICE T fast_exp(const T &x)
Definition: fmath.h:1626
#define M_PI_4
PI / 4.
Definition: missing_math.h:58
OIIO_HOSTDEVICE void evalBSplineWeights(T w[4], T fraction)
Definition: fmath.h:423
vint4 srl(const vint4 &val, const unsigned int bits)
Definition: simd.h:4406
OIIO_HOSTDEVICE float bitcast_to_float(int x)
Definition: fmath.h:593
OIIO_HOSTDEVICE OUT_TYPE bit_cast(const IN_TYPE in)
Definition: fmath.h:583
OIIO_HOSTDEVICE T radians(T deg)
Convert degrees to radians.
Definition: fmath.h:525
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
Definition: UT_ArraySet.h:1599
OIIO_HOSTDEVICE float fast_cbrt(float x)
Definition: fmath.h:1762
OIIO_HOSTDEVICE T safe_pow(T x, T y)
Definition: fmath.h:1146
const DataArrayProxy< I, E > & operator+=(int i)
Definition: fmath.h:972
#define M_LN10
ln(10)
Definition: missing_math.h:93
GLuint const GLfloat * val
Definition: glew.h:2794
OIIO_HOSTDEVICE float fast_sinpi(float x)
Definition: fmath.h:1336
GLenum GLenum GLenum GLenum GLenum scale
Definition: glew.h:13880
GLboolean GLboolean GLboolean GLboolean a
Definition: glew.h:9477
OIIO_HOSTDEVICE float fast_exp10(float x)
Definition: fmath.h:1653
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7231
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
Definition: fmath.h:534
OIIO_HOSTDEVICE T clamp(const T &a, const T &low, const T &high)
clamp a to bounds [low,high].
Definition: fmath.h:243
SYS_API float atan2f(float y, float x)
const GLdouble * m
Definition: glew.h:9124
const GLdouble * v
Definition: glew.h:1391
OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in)
Definition: fmath.h:908
OIIO_HOSTDEVICE void fast_sincos(float x, float *sine, float *cosine)
Definition: fmath.h:1269
OIIO_HOSTDEVICE V round_to_multiple(V value, M multiple)
Definition: fmath.h:151
OIIO_HOSTDEVICE T trilerp(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
Definition: fmath.h:370
OIIO_HOSTDEVICE float fast_expm1(float x)
Definition: fmath.h:1662
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int floor2(int x) noexcept
Definition: fmath.h:128
Integer 8-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1176
OIIO_HOSTDEVICE int pow2roundup(int x)
Definition: fmath.h:143
Platform-related macros.
OIIO_HOSTDEVICE float fast_ierf(float x)
Definition: fmath.h:1815
OIIO_HOSTDEVICE float floorfrac(float x, int *xint)
Definition: fmath.h:486
#define M_PI
Definition: ImathPlatform.h:51
OIIO_HOSTDEVICE float nmsub(float a, float b, float c)
Negative fused multiply and subtract: -(a*b) - c.
Definition: fmath.h:295
void set(I *data)
Definition: fmath.h:970
OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
Definition: fmath.h:267
SYS_API float log10f(float x)
OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: -(a*b - c)
Definition: fmath.h:281
void set(const I *data)
Definition: fmath.h:989
GLdouble GLdouble z
Definition: glew.h:1559
OIIO_HOSTDEVICE T fast_log2(const T &xval)
Definition: fmath.h:1455
ImageBuf OIIO_API pow(const ImageBuf &A, cspan< float > B, ROI roi={}, int nthreads=0)
vfloat4 floor(const vfloat4 &a)
Definition: simd.h:7177
OIIO_HOSTDEVICE int pow2rounddown(int x)
Definition: fmath.h:144
GLdouble GLdouble GLdouble GLdouble q
Definition: glew.h:1414
OIIO_HOSTDEVICE T safe_logb(T x)
Safe logb: clamp to valid domain.
Definition: fmath.h:1139
GLfloat GLfloat GLfloat v2
Definition: glew.h:1856
void convert_type< float, uint8_t >(const float *src, uint8_t *dst, size_t n, uint8_t _min, uint8_t _max)
Definition: fmath.h:808
void convert_type(const S *src, D *dst, size_t n, D _min, D _max)
Definition: fmath.h:659
OIIO_HOSTDEVICE float fast_cospi(float x)
Definition: fmath.h:1374
T invert(Func &func, T y, T xmin=0.0, T xmax=1.0, int maxiters=32, T eps=1.0e-6, bool *brack=0)
Definition: fmath.h:1870
SYS_API double asin(double x)
Definition: SYS_FPUMath.h:81
DataArrayProxy(I *data=NULL)
Definition: fmath.h:966
OIIO_HOSTDEVICE float fast_erfc(float x)
Definition: fmath.h:1803
OIIO_HOSTDEVICE float fast_logb(float x)
Definition: fmath.h:1534
Integer 4-vector, accelerated by SIMD instructions when available.
Definition: simd.h:892
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int ceil2(int x) noexcept
Definition: fmath.h:105
#define DASSERT(x)
Definition: dassert.h:99
OIIO_HOSTDEVICE T safe_sqrt(T x)
Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
Definition: fmath.h:1082
OIIO_HOSTDEVICE T bilerp(const T &v0, const T &v1, const T &v2, const T &v3, const Q &s, const Q &t)
Definition: fmath.h:318
GLclampf f
Definition: glew.h:3499
OIIO_HOSTDEVICE float fast_erf(float x)
Definition: fmath.h:1779
GLint GLint GLint GLint GLint x
Definition: glew.h:1252
OIIO_HOSTDEVICE T safe_log10(T x)
Safe log10: clamp to valid domain.
Definition: fmath.h:1130
GLint GLint GLint GLint GLint GLint y
Definition: glew.h:1252
GLuint in
Definition: glew.h:11510
SYS_API float atanf(float x)
OIIO_HOSTDEVICE void trilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, const T *v4, const T *v5, const T *v6, const T *v7, Q s, Q t, Q r, Q scale, int n, T *result)
Definition: fmath.h:409
GLint GLenum GLsizei GLint GLsizei const void * data
Definition: glew.h:1379
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7300
OIIO_HOSTDEVICE float fast_sinh(float x)
Definition: fmath.h:1674
OIIO_HOSTDEVICE T safe_asin(T x)
Safe (clamping) arcsine: clamp to the valid domain.
Definition: fmath.h:1095
DataProxy(I &data)
Definition: fmath.h:938
GLubyte GLubyte GLubyte GLubyte w
Definition: glew.h:1890
OIIO_HOSTDEVICE int ifloor(float x)
Return floor(x) cast to an int.
Definition: fmath.h:474
OIIO_HOSTDEVICE T degrees(T rad)
Convert radians to degrees.
Definition: fmath.h:529
GLsizei n
Definition: glew.h:4040
const GLfloat * c
Definition: glew.h:16296
SYS_API void sincosf(float x, float *s, float *c)
GLenum GLenum dst
Definition: glew.h:2410
OIIO_HOSTDEVICE void bilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, Q s, Q t, Q scale, int n, T *result)
Definition: fmath.h:352
OIIO_HOSTDEVICE float fast_atan(float x)
Definition: fmath.h:1411
#define OIIO_HOSTDEVICE
Definition: platform.h:348
void store(float *values) const
Definition: simd.h:6529
OIIO_HOSTDEVICE float fast_tan(float x)
Definition: fmath.h:1305
typedef int(WINAPI *PFNWGLRELEASEPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer
OIIO_HOSTDEVICE T round_to_multiple_of_pow2(T x, T m)
Definition: fmath.h:164
#define M_LN2
ln(2)
Definition: missing_math.h:88
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t rotl32(uint32_t x, int k)
Bitwise circular rotation left by k bits (for 32 bit unsigned integers)
Definition: fmath.h:200
SYS_FORCE_INLINE float log2(float x)
Definition: SYS_FPUMath.h:90
T exp(const T &v)
Definition: simd.h:7377
SYS_API double acos(double x)
Definition: SYS_FPUMath.h:83
ConstDataProxy(const I &data)
Definition: fmath.h:953
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool ispow2(T x) noexcept
Definition: fmath.h:92
OIIO_HOSTDEVICE float sign(float x)
Definition: fmath.h:563
vfloat4 round(const vfloat4 &a)
Definition: simd.h:7186
OIIO_HOSTDEVICE float interpolate_linear(float x, span_strided< const float > y)
Definition: fmath.h:1922
OIIO_HOSTDEVICE T safe_mod(T a, T b)
Definition: fmath.h:221
int floor(T x)
Definition: ImathFun.h:150
OIIO_HOSTDEVICE float fast_safe_pow(float x, float y)
Definition: fmath.h:1718
E operator*() const
Definition: fmath.h:987
GLuint GLfloat x0
Definition: glew.h:12681
ConstDataArrayProxy(const I *data=NULL)
Definition: fmath.h:986
Integer 16-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1469
GLdouble GLdouble GLdouble b
Definition: glew.h:9122
GLfloat GLfloat p
Definition: glew.h:16321
OIIO_HOSTDEVICE T safe_acos(T x)
Safe (clamping) arccosine: clamp to the valid domain.
Definition: fmath.h:1103
GLenum func
Definition: glcorearb.h:782
OIIO_HOSTDEVICE T lerp(const T &v0, const T &v1, const Q &x)
Definition: fmath.h:305
#define DASSERT_MSG
Definition: dassert.h:109
T log(const T &v)
Definition: simd.h:7432
E operator*() const
Definition: fmath.h:967
void store(int *values) const
Store the values into memory.
Definition: simd.h:4086
OIIO_HOSTDEVICE T safe_log2(T x)
Safe log2: clamp to valid domain.
Definition: fmath.h:1112
SYS_API float sinhf(float x)
GLuint num
Definition: glew.h:2690
OIIO_HOSTDEVICE T fast_log10(const T &x)
Definition: fmath.h:1521
SYS_API float copysignf(float x, float y)
GLuint ybits
Definition: glew.h:12566
OIIO_HOSTDEVICE float fast_acos(float x)
Definition: fmath.h:1383
OIIO_HOSTDEVICE void float_to_rational(float f, unsigned int &num, unsigned int &den)
Definition: fmath.h:1026
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s1
Definition: glew.h:12681
constexpr index_type size() const noexcept
Definition: span.h:279
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
Definition: glew.h:12681
GLdouble GLdouble GLdouble r
Definition: glew.h:1406
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
Definition: simd.h:4577
OIIO_HOSTDEVICE float fast_log1p(float x)
Definition: fmath.h:1547
GLfloat v0
Definition: glew.h:1848
SYS_API float asinf(float x)
GLuint64EXT * result
Definition: glew.h:14007
Helper template to let us tell if two types are the same.
Definition: fmath.h:74
OIIO_HOSTDEVICE int bitcast_to_int(float x)
Definition: fmath.h:592
OIIO_HOSTDEVICE float fast_asin(float x)
Definition: fmath.h:1398
SYS_API float expm1f(float x)
void convert_type< float, uint16_t >(const float *src, uint16_t *dst, size_t n, uint16_t _min, uint16_t _max)
Definition: fmath.h:787
OIIO_HOSTDEVICE D scaled_conversion(const S &src, F scale, F min, F max)
Definition: fmath.h:636
E operator[](int i) const
Definition: fmath.h:988
T operator()(unsigned char c) const
Definition: fmath.h:1006
OIIO_HOSTDEVICE T safe_log(T x)
Safe log: clamp to valid domain.
Definition: fmath.h:1121
OIIO_HOSTDEVICE uint32_t clamped_mult32(uint32_t a, uint32_t b)
Definition: fmath.h:175
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:66
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t rotl64(uint64_t x, int k)
Bitwise circular rotation left by k bits (for 64 bit unsigned integers)
Definition: fmath.h:211
const ConstDataArrayProxy< I, E > & operator+=(int i)
Definition: fmath.h:991
vint4 min(const vint4 &a, const vint4 &b)
Definition: simd.h:4694
OIIO_HOSTDEVICE T fast_pow_pos(const T &x, const U &y)
Definition: fmath.h:1756
void convert_type< uint16_t, float >(const uint16_t *src, float *dst, size_t n, float _min, float _max)
Definition: fmath.h:743
OIIO_HOSTDEVICE void swap_endian(T *f, int len=1)
Definition: fmath.h:602
Classes for SIMD processing.
#define M_PI_2
Definition: ImathPlatform.h:55
vint4 rint(const vfloat4 &a)
Definition: simd.h:7206
GLsizei const GLfloat * value
Definition: glew.h:1849
Definition: half.h:91
void convert_type< uint8_t, float >(const uint8_t *src, float *dst, size_t n, float _min, float _max)
Definition: fmath.h:725
SYS_API float tanhf(float x)
OIIO_HOSTDEVICE T fast_exp2(const T &xval)
Definition: fmath.h:1563
GLfloat GLfloat GLfloat GLfloat v3
Definition: glew.h:1860
GLdouble GLdouble t
Definition: glew.h:1398
OIIO_HOSTDEVICE void bicubic_interp(const T **val, T s, T t, int n, T *result)
Definition: fmath.h:455
OIIO_HOSTDEVICE uint64_t clamped_mult64(uint64_t a, uint64_t b)
Definition: fmath.h:187
OIIO_HOSTDEVICE float fast_atan2(float y, float x)
Definition: fmath.h:1430
GLfloat GLfloat v1
Definition: glew.h:1852
SYS_API double sin(double x)
Definition: SYS_FPUMath.h:71
E operator[](int i) const
Definition: fmath.h:968
OIIO_HOSTDEVICE float fast_cosh(float x)
Definition: fmath.h:1697
float float_t
Definition: fmath.h:623
OIIO_HOSTDEVICE void evalBSplineWeightDerivs(T dw[4], T fraction)
Definition: fmath.h:437
OIIO_HOSTDEVICE float fast_tanh(float x)
Definition: fmath.h:1707
E operator=(E newval)
Definition: fmath.h:939
OIIO_HOSTDEVICE T fast_log(const T &x)
Definition: fmath.h:1506
GLenum GLsizei len
Definition: glew.h:7752
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:65
OIIO_HOSTDEVICE float fast_sin(float x)
Definition: fmath.h:1208