fmath.h
1 // Copyright Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause and Apache-2.0
3 // https://github.com/AcademySoftwareFoundation/OpenImageIO
4 
5 /*
6  A few bits here are based upon code from NVIDIA that was also released
7  under the same 3-Clause BSD license, and marked as:
8  Copyright 2004 NVIDIA Corporation. All Rights Reserved.
9 
10  Some parts of this file were first open-sourced in Open Shading Language,
11  also 3-Clause BSD license, then later moved here. The original copyright
12  notice was:
13  Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
14 
15  Many of the math functions were copied from or inspired by other
16  public domain sources or open source packages with compatible licenses.
17  The individual functions give references where applicable.
18 */
19 
20 
21 // clang-format off
22 
23 /// \file
24 ///
25 /// A variety of floating-point math helper routines (and, slight
26 /// misnomer, some int stuff as well).
27 ///
28 
29 
30 #pragma once
31 #define OIIO_FMATH_H 1
32 
33 #include <algorithm>
34 #include <cmath>
35 #include <cstring>
36 #include <limits>
37 #include <typeinfo>
38 #include <type_traits>
39 
40 #include <OpenImageIO/span.h>
41 #include <OpenImageIO/dassert.h>
43 #include <OpenImageIO/platform.h>
44 #include <OpenImageIO/simd.h>
45 
46 
47 OIIO_NAMESPACE_BEGIN
48 
49 /// If the caller defines OIIO_FMATH_HEADER_ONLY to nonzero, then 100% of the
50 /// implementation of fmath functions will be defined directly in this header
51 /// file. If it is not defined, or set to 0, then there are a few functions
52 /// for which this header will only provide the definition.
53 #ifndef OIIO_FMATH_HEADER_ONLY
54 # define OIIO_FMATH_HEADER_ONLY 0
55 #endif
56 
57 
58 /// Occasionally there may be a tradeoff where the best/fastest
59 /// implementation of a math function in an ordinary scalar context does
60 /// things that prevent autovectorization (or OMP pragma vectorization) of a
61 /// loop containing that operation, and the only way to make it SIMD
62 /// friendly slows it down as a scalar op.
63 ///
64 /// By default, we always prefer doing it as correctly and quickly as
65 /// possible in scalar mode, but if OIIO_FMATH_SIMD_FRIENDLY is defined to
66 /// be nonzero, we prefer ensuring that the SIMD loops are as efficient as
67 /// possible, even at the expense of scalar performance.
68 ///
69 /// Because everything here consists of inline functions, and it doesn't
70 /// really matter for OIIO internals themselves, this is primarily intended
71 /// for the sake of 3rd party apps that happen to have OIIO as a dependency
72 /// and use the fmath.h functions. Such a downstream dependency just needs
73 /// to ensure that OIIO_FMATH_SIMD_FRIENDLY is defined to 1 prior to
74 /// including fmath.h.
75 ///
76 #ifndef OIIO_FMATH_SIMD_FRIENDLY
77 # define OIIO_FMATH_SIMD_FRIENDLY 0
78 #endif
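// Illustrative sketch (not from the original source): a downstream project
// that prefers the SIMD-friendly variants would define the macro before the
// include, e.g.
//
//     #define OIIO_FMATH_SIMD_FRIENDLY 1
//     #include <OpenImageIO/fmath.h>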
79 
80 
81 // Helper template to let us tell if two types are the same.
82 // C++11 defines this, keep in OIIO namespace for back compat.
83 // DEPRECATED(2.0) -- clients should switch OIIO::is_same -> std::is_same.
84 using std::is_same;
85 
86 
87 // For back compatibility: expose these in the OIIO namespace.
88 // DEPRECATED(2.0) -- clients should switch OIIO:: -> std:: for these.
89 using std::isfinite;
90 using std::isinf;
91 using std::isnan;
92 
93 
94 // Define math constants just in case they aren't included (Windows is a
95 // little finicky about this, only defining these if _USE_MATH_DEFINES is
96 // defined before <cmath> is included, which is hard to control).
97 #ifndef M_PI
98 # define M_PI 3.14159265358979323846264338327950288
99 #endif
100 #ifndef M_PI_2
101 # define M_PI_2 1.57079632679489661923132169163975144
102 #endif
103 #ifndef M_PI_4
104 # define M_PI_4 0.785398163397448309615660845819875721
105 #endif
106 #ifndef M_TWO_PI
107 # define M_TWO_PI (M_PI * 2.0)
108 #endif
109 #ifndef M_1_PI
110 # define M_1_PI 0.318309886183790671537767526745028724
111 #endif
112 #ifndef M_2_PI
113 # define M_2_PI 0.636619772367581343075535053490057448
114 #endif
115 #ifndef M_SQRT2
116 # define M_SQRT2 1.41421356237309504880168872420969808
117 #endif
118 #ifndef M_SQRT1_2
119 # define M_SQRT1_2 0.707106781186547524400844362104849039
120 #endif
121 #ifndef M_LN2
122 # define M_LN2 0.69314718055994530941723212145817656
123 #endif
124 #ifndef M_LN10
125 # define M_LN10 2.30258509299404568401799145468436421
126 #endif
127 #ifndef M_E
128 # define M_E 2.71828182845904523536028747135266250
129 #endif
130 #ifndef M_LOG2E
131 # define M_LOG2E 1.44269504088896340735992468100189214
132 #endif
133 
134 
135 
136 ////////////////////////////////////////////////////////////////////////////
137 ////////////////////////////////////////////////////////////////////////////
138 //
139 // INTEGER HELPER FUNCTIONS
140 //
141 // A variety of handy functions that operate on integers.
142 //
143 
144 /// Quick test for whether an integer is a power of 2.
145 ///
146 template<typename T>
147 inline OIIO_HOSTDEVICE constexpr bool
148 ispow2(T x) noexcept
149 {
150  // Numerous references for this bit trick are on the web. The
151  // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
152  // all 1 bits for bits < b.
153  return (x & (x - 1)) == 0 && (x >= 0);
154 }
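// Illustrative sketch (not from the original source) of the test above:
//     ispow2(64)  == true
//     ispow2(100) == false
//     ispow2(0)   == true    // 0 satisfies (x & (x-1)) == 0 in this test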
155 
156 
157 
158 /// Round up to next higher power of 2 (return x if it's already a power
159 /// of 2).
160 inline OIIO_HOSTDEVICE int
161 ceil2(int x) noexcept
162 {
163  // Here's a version with no loops.
164  if (x < 0)
165  return 0;
166  // Subtract 1, then round up to a power of 2, that way if we are
167  // already a power of 2, we end up with the same number.
168  --x;
169  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
170  x |= x >> 1;
171  x |= x >> 2;
172  x |= x >> 4;
173  x |= x >> 8;
174  x |= x >> 16;
175  // Now we have 2^n-1, by adding 1, we make it a power of 2 again
176  return x + 1;
177 }
178 
179 
180 
181 /// Round down to next lower power of 2 (return x if it's already a power
182 /// of 2).
183 inline OIIO_HOSTDEVICE int
184 floor2(int x) noexcept
185 {
186  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
187  x |= x >> 1;
188  x |= x >> 2;
189  x |= x >> 4;
190  x |= x >> 8;
191  x |= x >> 16;
192  // Strip off all but the high bit, i.e. 00011111 -> 00010000
193  // That's the power of two <= the original x
194  return x & ~(x >> 1);
195 }
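// Illustrative sketch (not from the original source) of ceil2/floor2:
//     ceil2(17)  == 32      ceil2(16)  == 16
//     floor2(17) == 16      floor2(16) == 16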
196 
197 
198 // Old names -- DEPRECATED(2.1)
199 inline OIIO_HOSTDEVICE int pow2roundup(int x) { return ceil2(x); }
200 inline OIIO_HOSTDEVICE int pow2rounddown(int x) { return floor2(x); }
201 
202 
203 
204 /// Round value up to the next whole multiple. For example,
205 /// `round_to_multiple(10,10) == 10`, `round_to_multiple(17,10) == 20`, and
206 /// `round_to_multiple(-17,10) == -10`.
207 template <typename V, typename M>
208 inline OIIO_HOSTDEVICE V round_to_multiple (V value, M multiple)
209 {
210  if (value >= 0)
211  value += V(multiple) - 1;
212  return value - (value % V(multiple));
213 }
214 
215 
216 
217 /// Round up to the next whole multiple of m, for the special case where
218 /// m is definitely a power of 2 (somewhat simpler than the more general
219 /// round_to_multiple). This is a template that should work for any
220 /// integer type.
221 template<typename T>
222 inline OIIO_HOSTDEVICE T
223 round_to_multiple_of_pow2(T x, T m)
224 {
225  OIIO_DASSERT(ispow2(m));
226  return (x + m - 1) & (~(m - 1));
227 }
228 
229 
230 
231 /// Round `value` down to a whole multiple of `multiple`. For example,
232 /// `round_down_to_multiple(10,10) == 10`,
233 /// `round_down_to_multiple(17,10) == 10`, and
234 /// `round_down_to_multiple(-17,10) == -20`.
235 template <typename V, typename M>
236 inline OIIO_HOSTDEVICE V round_down_to_multiple (V value, M multiple)
237 {
238  if (value < 0)
239  value -= V(multiple - 1);
240  return value - (value % V(multiple));
241 }
242 
243 
244 
245 /// Multiply two unsigned 32-bit ints safely, carefully checking for
246 /// overflow, and clamping to uint32_t's maximum value.
247 inline OIIO_HOSTDEVICE uint32_t
248 clamped_mult32(uint32_t a, uint32_t b)
249 {
250  const uint32_t Err = std::numeric_limits<uint32_t>::max();
251  uint64_t r = (uint64_t)a * (uint64_t)b; // Multiply into a bigger int
252  return r < Err ? (uint32_t)r : Err;
253 }
254 
255 
256 
257 /// Multiply two unsigned 64-bit ints safely, carefully checking for
258 /// overflow, and clamping to uint64_t's maximum value.
259 inline OIIO_HOSTDEVICE uint64_t
260 clamped_mult64(uint64_t a, uint64_t b)
261 {
262  uint64_t ab = a * b;
263  if (b && ab / b != a)
264  return std::numeric_limits<uint64_t>::max();
265  else
266  return ab;
267 }
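// Illustrative sketch (not from the original source): on overflow both
// helpers clamp to the maximum value instead of wrapping:
//     clamped_mult32(0x10000u, 0x10000u) == 0xffffffffu
//     clamped_mult64(~0ull, 2ull)        == ~0ull   // i.e. UINT64_MAX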
268 
269 
270 
271 /// Bitwise circular rotation left by `s` bits (for any unsigned integer
272 /// type). For info on the C++20 std::rotl(), see
273 /// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0553r4.html
274 // FIXME: this should be constexpr, but we're leaving that out for now
275 // because the Cuda specialization uses an intrinsic that isn't constexpr.
276 // Come back to this later when more of the Cuda library is properly
277 // constexpr.
278 template<class T>
279 OIIO_FORCEINLINE OIIO_HOSTDEVICE
280 // constexpr
281 T rotl(T x, int s) noexcept
282 {
283  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
284  "rotl only works for unsigned integer types");
285  return (x << s) | (x >> ((sizeof(T) * 8) - s));
286 }
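// Illustrative sketch (not from the original source): bits shifted off the
// top re-enter at the bottom:
//     rotl(uint32_t(0x80000001u), 1) == uint32_t(0x00000003u)
//     rotl(uint8_t(0x90), 4)         == uint8_t(0x09)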
287 
288 
289 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
290 // Cuda has an intrinsic for 32 bit unsigned int rotation
291 // FIXME: This should be constexpr, but __funnelshift_lc seems not to be
292 // marked as such.
293 template<>
294 OIIO_FORCEINLINE OIIO_HOSTDEVICE
295 // constexpr
296 uint32_t rotl(uint32_t x, int s) noexcept
297 {
298  return __funnelshift_lc(x, x, s);
299 }
300 #endif
301 
302 
303 
304 // Old names -- DEPRECATED(2.1)
305 OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t
306 rotl32(uint32_t x, int k)
307 {
308 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
309  return __funnelshift_lc(x, x, k);
310 #else
311  return (x << k) | (x >> (32 - k));
312 #endif
313 }
314 
315 OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t
316 rotl64(uint64_t x, int k)
317 {
318  return (x << k) | (x >> (64 - k));
319 }
320 
321 
322 
323 // safe_mod is like integer a%b, but safely returns 0 when b==0.
324 template <class T>
325 inline OIIO_HOSTDEVICE T
326 safe_mod(T a, T b)
327 {
328  return b ? (a % b) : T(0);
329 }
330 
331 // (end of integer helper functions)
332 ////////////////////////////////////////////////////////////////////////////
333 
334 
335 
336 ////////////////////////////////////////////////////////////////////////////
337 ////////////////////////////////////////////////////////////////////////////
338 //
339 // FLOAT UTILITY FUNCTIONS
340 //
341 // Helper/utility functions: clamps, blends, interpolations...
342 //
343 
344 
345 /// clamp a to bounds [low,high].
346 template <class T>
347 inline OIIO_HOSTDEVICE T
348 clamp (const T& a, const T& low, const T& high)
349 {
350 #if 1
351  // This looks clunky, but it generates minimal code. For float, it
352  // should result in just a max and min instruction, that's it.
353  // This implementation is courtesy of Alex Wells, Intel, via OSL.
354  T val = a;
355  if (!(low <= val)) // Forces clamp(NaN,low,high) to return low
356  val = low;
357  if (val > high)
358  val = high;
359  return val;
360 #else
361  // The naive implementation we originally had, below, only does the 2nd
362  // comparison in the else block, which will generate extra code in a
363  // SIMD masking scenario. I (LG) can confirm that benchmarks show that
364  // the above implementation is indeed faster than the one below, even
365  // for non-SIMD use.
366  return (a >= low) ? ((a <= high) ? a : high) : low;
367 #endif
368 }
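// Illustrative sketch (not from the original source), including the NaN
// policy noted above:
//     clamp( 2.0f, 0.0f, 1.0f) == 1.0f
//     clamp(-2.0f, 0.0f, 1.0f) == 0.0f
//     clamp(NAN,   0.0f, 1.0f) == 0.0f   // !(low <= NaN) forces the result to 'low'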
369 
370 
371 #ifndef __CUDA_ARCH__
372 // Specialization of clamp for vfloat4
373 template<> OIIO_FORCEINLINE simd::vfloat4
374 clamp (const simd::vfloat4& a, const simd::vfloat4& low, const simd::vfloat4& high)
375 {
376  return simd::min (high, simd::max (low, a));
377 }
378 
379 template<> OIIO_FORCEINLINE simd::vfloat8
380 clamp (const simd::vfloat8& a, const simd::vfloat8& low, const simd::vfloat8& high)
381 {
382  return simd::min (high, simd::max (low, a));
383 }
384 
385 template<> OIIO_FORCEINLINE simd::vfloat16
386 clamp (const simd::vfloat16& a, const simd::vfloat16& low, const simd::vfloat16& high)
387 {
388  return simd::min (high, simd::max (low, a));
389 }
390 
391 // Specialization of clamp for vint4
392 template<> OIIO_FORCEINLINE simd::vint4
393 clamp (const simd::vint4& a, const simd::vint4& low, const simd::vint4& high)
394 {
395  return simd::min (high, simd::max (low, a));
396 }
397 
398 template<> OIIO_FORCEINLINE simd::vint8
399 clamp (const simd::vint8& a, const simd::vint8& low, const simd::vint8& high)
400 {
401  return simd::min (high, simd::max (low, a));
402 }
403 
404 template<> OIIO_FORCEINLINE simd::vint16
405 clamp (const simd::vint16& a, const simd::vint16& low, const simd::vint16& high)
406 {
407  return simd::min (high, simd::max (low, a));
408 }
409 #endif
410 
411 
412 
413 // For the multiply+add (or sub) operations below, note that the results may
414 // differ slightly on different hardware, depending on whether true fused
415 // multiply and add is available or if the code generated just does an old
416 // fashioned multiply followed by a separate add. So please interpret these
417 // as "do a multiply and add as fast as possible for this hardware, with at
418 // least as much precision as a multiply followed by a separate add."
419 
420 /// Fused multiply and add: (a*b + c)
421 OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd (float a, float b, float c) {
422  // NOTE: GCC/ICC will turn this (for float) into a FMA unless
423  // explicitly asked not to, clang will do so if -ffp-contract=fast.
424  OIIO_CLANG_PRAGMA(clang fp contract(fast))
425  return a * b + c;
426 }
427 
428 
429 /// Fused multiply and subtract: (a*b - c)
430 OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub (float a, float b, float c) {
431  OIIO_CLANG_PRAGMA(clang fp contract(fast))
432  return a * b - c;
433 }
434 
435 
436 
437 /// Fused negative multiply and add: -(a*b) + c
438 OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd (float a, float b, float c) {
439  OIIO_CLANG_PRAGMA(clang fp contract(fast))
440  return c - (a * b);
441 }
442 
443 
444 
445 /// Negative fused multiply and subtract: -(a*b) - c
446 OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub (float a, float b, float c) {
447  OIIO_CLANG_PRAGMA(clang fp contract(fast))
448  return -(a * b) - c;
449 }
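// Illustrative sketch (not from the original source) of the four fused ops
// above, with a=2, b=3, c=1:
//     madd(2.0f, 3.0f, 1.0f)  ==  7.0f   //  a*b + c
//     msub(2.0f, 3.0f, 1.0f)  ==  5.0f   //  a*b - c
//     nmadd(2.0f, 3.0f, 1.0f) == -5.0f   // -(a*b) + c
//     nmsub(2.0f, 3.0f, 1.0f) == -7.0f   // -(a*b) - c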
450 
451 
452 
453 /// Linearly interpolate values v0-v1 at x: v0*(1-x) + v1*x.
454 /// This is a template, and so should work for any types.
455 template <class T, class Q>
456 inline OIIO_HOSTDEVICE T
457 lerp (const T& v0, const T& v1, const Q& x)
458 {
459  // NOTE: a*(1-x) + b*x is much more numerically stable than a+x*(b-a)
460  return v0*(Q(1)-x) + v1*x;
461 }
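// Illustrative sketch (not from the original source): lerp returns v0 at
// x==0 and v1 at x==1, e.g.
//     lerp(0.0f, 10.0f, 0.25f) == 2.5f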
462 
463 
464 
465 /// Bilinearly interpolate values v0-v3 (v0 upper left, v1 upper right,
466 /// v2 lower left, v3 lower right) at coordinates (s,t) and return the
467 /// result. This is a template, and so should work for any types.
468 template <class T, class Q>
469 inline OIIO_HOSTDEVICE T
470 bilerp(const T& v0, const T& v1, const T& v2, const T& v3, const Q& s, const Q& t)
471 {
472  // NOTE: a*(1-t) + b*t is much more numerically stable than a+t*(b-a)
473  Q s1 = Q(1) - s;
474  return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
475 }
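// Illustrative sketch (not from the original source): at (s,t) == (0.5,0.5)
// the result is the average of the four corners, e.g.
//     bilerp(0.0f, 1.0f, 2.0f, 3.0f, 0.5f, 0.5f) == 1.5f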
476 
477 
478 
479 /// Bilinearly interpolate arrays of values v0-v3 (v0 upper left, v1
480 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
481 /// storing the results in 'result'. These are all vectors, so do it
482 /// for each of 'n' contiguous values (using the same s,t interpolants).
483 template <class T, class Q>
484 inline OIIO_HOSTDEVICE void
485 bilerp (const T *v0, const T *v1,
486  const T *v2, const T *v3,
487  Q s, Q t, int n, T *result)
488 {
489  Q s1 = Q(1) - s;
490  Q t1 = Q(1) - t;
491  for (int i = 0; i < n; ++i)
492  result[i] = T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
493 }
494 
495 
496 
497 /// Bilinearly interpolate arrays of values v0-v3 (v0 upper left, v1
498 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
499 /// SCALING the interpolated value by 'scale' and then ADDING to
500 /// 'result'. These are all vectors, so do it for each of 'n'
501 /// contiguous values (using the same s,t interpolants).
502 template <class T, class Q>
503 inline OIIO_HOSTDEVICE void
504 bilerp_mad (const T *v0, const T *v1,
505  const T *v2, const T *v3,
506  Q s, Q t, Q scale, int n, T *result)
507 {
508  Q s1 = Q(1) - s;
509  Q t1 = Q(1) - t;
510  for (int i = 0; i < n; ++i)
511  result[i] += T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
512  t*(v2[i]*s1 + v3[i]*s)));
513 }
514 
515 
516 
517 /// Trilinearly interpolate values v0-v7 (v0 upper left top, v1
518 /// upper right top, ...) at coordinates (s,t,r), and return the
519 /// result. This is a template, and so should work for any types.
520 template <class T, class Q>
521 inline OIIO_HOSTDEVICE T
522 trilerp (T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
523 {
524  // NOTE: a*(1-t) + b*t is much more numerically stable than a+t*(b-a)
525  Q s1 = Q(1) - s;
526  Q t1 = Q(1) - t;
527  Q r1 = Q(1) - r;
528  return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
529  r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
530 }
531 
532 
533 
534 /// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
535 /// upper right top, ...) at coordinates (s,t,r),
536 /// storing the results in 'result'. These are all vectors, so do it
537 /// for each of 'n' contiguous values (using the same s,t,r interpolants).
538 template <class T, class Q>
539 inline OIIO_HOSTDEVICE void
540 trilerp (const T *v0, const T *v1, const T *v2, const T *v3,
541  const T *v4, const T *v5, const T *v6, const T *v7,
542  Q s, Q t, Q r, int n, T *result)
543 {
544  Q s1 = Q(1) - s;
545  Q t1 = Q(1) - t;
546  Q r1 = Q(1) - r;
547  for (int i = 0; i < n; ++i)
548  result[i] = T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
549  r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
550 }
551 
552 
553 
554 /// Trilinearly interpolate arrays of values v0-v7 (v0 upper left top, v1
555 /// upper right top, ...) at coordinates (s,t,r),
556 /// SCALING the interpolated value by 'scale' and then ADDING to
557 /// 'result'. These are all vectors, so do it for each of 'n'
558 /// contiguous values (using the same s,t,r interpolants).
559 template <class T, class Q>
560 inline OIIO_HOSTDEVICE void
561 trilerp_mad (const T *v0, const T *v1, const T *v2, const T *v3,
562  const T *v4, const T *v5, const T *v6, const T *v7,
563  Q s, Q t, Q r, Q scale, int n, T *result)
564 {
565  Q r1 = Q(1) - r;
566  bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
567  bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
568 }
569 
570 
571 
572 /// Evaluate B-spline weights in w[0..3] for the given fraction. This
573 /// is an important component of performing a cubic interpolation.
574 template <typename T>
575 inline OIIO_HOSTDEVICE void evalBSplineWeights (T w[4], T fraction)
576 {
577  T one_frac = 1 - fraction;
578  w[0] = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
579  w[1] = T(2.0 / 3.0) - T(0.5) * fraction * fraction * (2 - fraction);
580  w[2] = T(2.0 / 3.0) - T(0.5) * one_frac * one_frac * (2 - one_frac);
581  w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
582 }
583 
584 
585 /// Evaluate B-spline derivative weights in w[0..3] for the given
586 /// fraction. This is an important component of performing a cubic
587 /// interpolation with derivatives.
588 template <typename T>
589 inline OIIO_HOSTDEVICE void evalBSplineWeightDerivs (T dw[4], T fraction)
590 {
591  T one_frac = 1 - fraction;
592  dw[0] = -T(0.5) * one_frac * one_frac;
593  dw[1] = T(0.5) * fraction * (3 * fraction - 4);
594  dw[2] = -T(0.5) * one_frac * (3 * one_frac - 4);
595  dw[3] = T(0.5) * fraction * fraction;
596 }
597 
598 
599 
600 /// Bicubically interpolate arrays of pointers arranged in a 4x4 pattern
601 /// with val[0] pointing to the data in the upper left corner, val[15]
602 /// pointing to the lower right) at coordinates (s,t), storing the
603 /// results in 'result'. These are all vectors, so do it for each of
604 /// 'n' contiguous values (using the same s,t interpolants).
605 template <class T>
606 inline OIIO_HOSTDEVICE void
607 bicubic_interp (const T **val, T s, T t, int n, T *result)
608 {
609  for (int c = 0; c < n; ++c)
610  result[c] = T(0);
611  T wx[4]; evalBSplineWeights (wx, s);
612  T wy[4]; evalBSplineWeights (wy, t);
613  for (int j = 0; j < 4; ++j) {
614  for (int i = 0; i < 4; ++i) {
615  T w = wx[i] * wy[j];
616  for (int c = 0; c < n; ++c)
617  result[c] += w * val[j*4+i][c];
618  }
619  }
620 }
621 
622 
623 
624 /// Return floor(x) cast to an int.
625 inline OIIO_HOSTDEVICE int
626 ifloor (float x)
627 {
628  return (int)floorf(x);
629 }
630 
631 
632 
633 /// Return (x-floor(x)) and put (int)floor(x) in *xint. This is similar
634 /// to the built-in modf, but returns a true int, always rounds down
635 /// (compared to modf which rounds toward 0), and always returns
636 /// frac >= 0 (compared to modf which can return <0 if x<0).
637 inline OIIO_HOSTDEVICE float
638 floorfrac (float x, int *xint)
639 {
640 #if 1
641  float f = std::floor(x);
642  *xint = int(f);
643  return x - f;
644 #else /* vvv This idiom is slower */
645  int i = ifloor(x);
646  *xint = i;
647  return x - static_cast<float>(i); // Return the fraction left over
648 #endif
649 }
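// Illustrative sketch (not from the original source) contrasting this with
// std::modf for a negative argument:
//     int i;
//     float f = floorfrac(-1.25f, &i);   // f == 0.75f, i == -2
//     // std::modf(-1.25) would instead yield fraction -0.25 and int part -1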
650 
651 
652 #ifndef __CUDA_ARCH__
653 inline simd::vfloat4 floorfrac (const simd::vfloat4& x, simd::vint4 *xint) {
654  simd::vfloat4 f = simd::floor(x);
655  *xint = simd::vint4(f);
656  return x - f;
657 }
658 
659 inline simd::vfloat8 floorfrac (const simd::vfloat8& x, simd::vint8 *xint) {
660  simd::vfloat8 f = simd::floor(x);
661  *xint = simd::vint8(f);
662  return x - f;
663 }
664 
665 inline simd::vfloat16 floorfrac (const simd::vfloat16& x, simd::vint16 *xint) {
666  simd::vfloat16 f = simd::floor(x);
667  *xint = simd::vint16(f);
668  return x - f;
669 }
670 #endif
671 
672 
673 
674 
675 /// Convert degrees to radians.
676 template <typename T>
677 OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians (T deg) { return deg * T(M_PI / 180.0); }
678 
679 /// Convert radians to degrees
680 template <typename T>
681 OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees (T rad) { return rad * T(180.0 / M_PI); }
682 
683 
684 /// Faster floating point negation, in cases where you aren't too picky
685 /// about the difference between +0 and -0. (Courtesy of Alex Wells, Intel,
686 /// via code in OSL.)
687 ///
688 /// Beware: fast_neg(0.0f) returns 0.0f, NOT -0.0f. All other values,
689 /// including -0.0 and +/- Inf, work identically to `-x`. For many use
690 /// cases, that's fine. (When was the last time you wanted a -0.0f anyway?)
691 ///
692 /// The reason it's faster is that `-x` (for float x) is implemented by
693 /// compilers by xor'ing x against a bitmask that flips the sign bit, and
694 /// that bitmask had to come from somewhere -- memory! -- which can be
695 /// expensive, depending on whether/where it is in cache. But `0 - x`
696 /// generates a zero in a register (with xor, no memory access needed) and
697 /// then subtracts, so that's sometimes faster because there is no memory
698 /// read. This works for SIMD types as well!
699 ///
700 /// It's also safe (though pointless) to use fast_neg for integer types,
701 /// where both `-x` and `0-x` are implemented as a `neg` instruction.
702 template <typename T>
703 OIIO_FORCEINLINE OIIO_HOSTDEVICE T
704 fast_neg(const T &x) {
705  return T(0) - x;
706 }
707 
708 
709 
710 inline OIIO_HOSTDEVICE void
711 sincos (float x, float* sine, float* cosine)
712 {
713 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
714  __builtin_sincosf(x, sine, cosine);
715 #elif defined(__CUDA_ARCH__)
716  // Explicitly select the single-precision CUDA library function
717  sincosf(x, sine, cosine);
718 #else
719  *sine = std::sin(x);
720  *cosine = std::cos(x);
721 #endif
722 }
723 
724 inline OIIO_HOSTDEVICE void
725 sincos (double x, double* sine, double* cosine)
726 {
727 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
728  __builtin_sincos(x, sine, cosine);
729 #elif defined(__CUDA_ARCH__)
730  // The explicit global-scope qualification (::) resolves to the CUDA
731  // library function and avoids infinite recursion
732  ::sincos(x, sine, cosine);
733 #else
734  *sine = std::sin(x);
735  *cosine = std::cos(x);
736 #endif
737 }
738 
739 
740 inline OIIO_HOSTDEVICE float sign (float x)
741 {
742  return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
743 }
744 
745 
746 
747 
748 // (end of float helper functions)
749 ////////////////////////////////////////////////////////////////////////////
750 
751 
752 
753 ////////////////////////////////////////////////////////////////////////////
754 ////////////////////////////////////////////////////////////////////////////
755 //
756 // CONVERSION
757 //
758 // Type and range conversion helper functions and classes.
759 
760 /// Standards-compliant bit cast of two equally sized types. This is used
761 /// equivalently to C++20 std::bit_cast, but it works prior to C++20 and
762 /// it has the right decorators to work with Cuda.
763 /// @version 2.4.1
764 template <typename OUT_TYPE, typename IN_TYPE>
765 OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bitcast (const IN_TYPE& in) noexcept {
766  // NOTE: this is the only standards compliant way of doing this type of casting,
767  // luckily the compilers we care about know how to optimize away this idiom.
768  static_assert(sizeof(IN_TYPE) == sizeof(OUT_TYPE),
769  "bit_cast must be between objects of the same size");
770  OUT_TYPE out;
771  memcpy ((void *)&out, &in, sizeof(IN_TYPE));
772  return out;
773 }
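// Illustrative sketch (not from the original source): reinterpret the bits
// of a float as an unsigned int and back:
//     uint32_t u = bitcast<uint32_t, float>(1.0f);   // u == 0x3f800000
//     float    f = bitcast<float, uint32_t>(u);      // f == 1.0f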
774 
775 #if OIIO_VERSION_LESS(3, 0, 0)
776 /// Note: The C++20 std::bit_cast has the reverse order of the template
777 /// arguments of our original bit_cast! That is unfortunate. For now, we
778 /// prefer using OIIO::bitcast. We'll keep this old one for backward
779 /// compatibility, but will eventually deprecate for OIIO 2.5 and remove it
780 /// for 3.0.
781 template <typename IN_TYPE, typename OUT_TYPE>
782 #if OIIO_VERSION_GREATER_EQUAL(2, 5, 0)
783 OIIO_DEPRECATED("Use OIIO::bitcast<To, From> instead")
784 #endif
785 OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE& in) {
786  return bitcast<OUT_TYPE, IN_TYPE>(in);
787 }
788 #endif
789 
790 #if defined(__x86_64__) && !defined(__CUDA_ARCH__) && \
791  (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) \
792  || OIIO_CLANG_VERSION >= 100000 || OIIO_APPLE_CLANG_VERSION >= 130000)
793 // On x86/x86_64 for certain compilers we can use Intel CPU intrinsics for
794 // some common bitcast cases that might be even more understandable to the
795 // compiler and generate better code without its getting confused about the
796 // memcpy in the general case. We're a bit conservative with the compiler
797 // version checks here, it may be that some earlier versions support these
798 // intrinsics.
799 
800 template<> OIIO_FORCEINLINE uint32_t bitcast<uint32_t, float>(const float& val) noexcept {
801  return static_cast<uint32_t>(_castf32_u32(val));
802 }
803 template<> OIIO_FORCEINLINE int32_t bitcast<int32_t, float>(const float& val) noexcept {
804  return static_cast<int32_t>(_castf32_u32(val));
805 }
806 template<> OIIO_FORCEINLINE float bitcast<float, uint32_t>(const uint32_t& val) noexcept {
807  return _castu32_f32(val);
808 }
809 template<> OIIO_FORCEINLINE float bitcast<float, int32_t>(const int32_t& val) noexcept {
810  return _castu32_f32(val);
811 }
812 template<> OIIO_FORCEINLINE uint64_t bitcast<uint64_t, double>(const double& val) noexcept {
813  return static_cast<uint64_t>(_castf64_u64(val));
814 }
815 template<> OIIO_FORCEINLINE int64_t bitcast<int64_t, double>(const double& val) noexcept {
816  return static_cast<int64_t>(_castf64_u64(val));
817 }
818 template<> OIIO_FORCEINLINE double bitcast<double, uint64_t>(const uint64_t& val) noexcept {
819  return _castu64_f64(val);
820 }
821 template<> OIIO_FORCEINLINE double bitcast<double, int64_t>(const int64_t& val) noexcept {
822  return _castu64_f64(val);
823 }
824 #endif
825 
826 
827 inline OIIO_HOSTDEVICE int bitcast_to_int (float x) {
828  return bitcast<int, float>(x);
829 }
830 inline OIIO_HOSTDEVICE float bitcast_from_int (int x) {
831  return bitcast<float, int>(x);
832 }
833 
834 
835 
836 /// Change endian-ness of one or more data items that are each 2, 4,
837 /// or 8 bytes. This should work for any of short, unsigned short, int,
838 /// unsigned int, float, long long, pointers.
839 template<class T>
840 inline OIIO_HOSTDEVICE void
841 swap_endian (T *f, int len=1)
842 {
843  for (char *c = (char *) f; len--; c += sizeof(T)) {
844  if (sizeof(T) == 2) {
845  std::swap (c[0], c[1]);
846  } else if (sizeof(T) == 4) {
847  std::swap (c[0], c[3]);
848  std::swap (c[1], c[2]);
849  } else if (sizeof(T) == 8) {
850  std::swap (c[0], c[7]);
851  std::swap (c[1], c[6]);
852  std::swap (c[2], c[5]);
853  std::swap (c[3], c[4]);
854  }
855  }
856 }
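// Illustrative sketch (not from the original source): byte-reverse a 32-bit
// value in place:
//     uint32_t v = 0x11223344;
//     swap_endian(&v);          // v == 0x44332211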
857 
858 
859 #if (OIIO_GNUC_VERSION || OIIO_ANY_CLANG || OIIO_INTEL_CLASSIC_COMPILER_VERSION) && !defined(__CUDACC__)
860 // CPU gcc and compatible can use these intrinsics, 8-15x faster
861 
862 template<> inline void swap_endian(uint16_t* f, int len) {
863  for (int i = 0; i < len; ++i)
864  f[i] = __builtin_bswap16(f[i]);
865 }
866 
867 template<> inline void swap_endian(uint32_t* f, int len) {
868  for (int i = 0; i < len; ++i)
869  f[i] = __builtin_bswap32(f[i]);
870 }
871 
872 template<> inline void swap_endian(uint64_t* f, int len) {
873  for (int i = 0; i < len; ++i)
874  f[i] = __builtin_bswap64(f[i]);
875 }
876 
877 template<> inline void swap_endian(int16_t* f, int len) {
878  for (int i = 0; i < len; ++i)
879  f[i] = __builtin_bswap16(f[i]);
880 }
881 
882 template<> inline void swap_endian(int32_t* f, int len) {
883  for (int i = 0; i < len; ++i)
884  f[i] = __builtin_bswap32(f[i]);
885 }
886 
887 template<> inline void swap_endian(int64_t* f, int len) {
888  for (int i = 0; i < len; ++i)
889  f[i] = __builtin_bswap64(f[i]);
890 }
891 
892 template<> inline void swap_endian(float* f, int len) {
893  swap_endian((uint32_t*)f, len);
894 }
895 
896 template<> inline void swap_endian(double* f, int len) {
897  swap_endian((uint64_t*)f, len);
898 }
899 
900 #elif defined(_MSC_VER) && !defined(__CUDACC__)
901 // CPU MSVS can use these intrinsics
902 
903 template<> inline void swap_endian(uint16_t* f, int len) {
904  for (int i = 0; i < len; ++i)
905  f[i] = _byteswap_ushort(f[i]);
906 }
907 
908 template<> inline void swap_endian(uint32_t* f, int len) {
909  for (int i = 0; i < len; ++i)
910  f[i] = _byteswap_ulong(f[i]);
911 }
912 
913 template<> inline void swap_endian(uint64_t* f, int len) {
914  for (int i = 0; i < len; ++i)
915  f[i] = _byteswap_uint64(f[i]);
916 }
917 
918 template<> inline void swap_endian(int16_t* f, int len) {
919  for (int i = 0; i < len; ++i)
920  f[i] = _byteswap_ushort(f[i]);
921 }
922 
923 template<> inline void swap_endian(int32_t* f, int len) {
924  for (int i = 0; i < len; ++i)
925  f[i] = _byteswap_ulong(f[i]);
926 }
927 
928 template<> inline void swap_endian(int64_t* f, int len) {
929  for (int i = 0; i < len; ++i)
930  f[i] = _byteswap_uint64(f[i]);
931 }
932 #endif
933 
934 
935 
936 // big_enough_float<T>::float_t is a floating-point type big enough to
937 // handle the range and precision of a <T>. It's a float, unless T is big.
938 template <typename T> struct big_enough_float { typedef float float_t; };
939 template<> struct big_enough_float<int> { typedef double float_t; };
940 template<> struct big_enough_float<unsigned int> { typedef double float_t; };
941 template<> struct big_enough_float<int64_t> { typedef double float_t; };
942 template<> struct big_enough_float<uint64_t> { typedef double float_t; };
943 template<> struct big_enough_float<double> { typedef double float_t; };
944 
945 
946 /// Multiply src by scale, clamp to [min,max], and round to the nearest D
947 /// (presumed to be integer). This is just a helper for the convert_type
948 /// templates, it probably has no other use.
949 template<typename S, typename D, typename F>
950 inline OIIO_HOSTDEVICE D
951 scaled_conversion(const S& src, F scale, F min, F max)
952 {
953  if (std::numeric_limits<S>::is_signed) {
954  F s = src * scale; //NOSONAR
955  s += (s < 0 ? (F)-0.5 : (F)0.5);
956  return (D)clamp(s, min, max);
957  } else {
958  return (D)clamp((F)src * scale + (F)0.5, min, max);
959  }
960 }
961 
962 
963 
964 /// Convert n consecutive values from the type of S to the type of D.
965 /// The conversion is not a simple cast, but correctly remaps the
966 /// 0.0->1.0 range from and to the full positive range of integral
967 /// types. Take a memcpy shortcut if both types are the same and no
968 /// conversion is necessary. Optional arguments can give nonstandard
969 /// quantizations.
970 //
971 // FIXME: make table-based specializations for common types with only a
972 // few possible src values (like unsigned char -> float).
973 template<typename S, typename D>
974 void convert_type (const S *src, D *dst, size_t n, D _min, D _max)
975 {
976  if (std::is_same<S,D>::value) {
977  // They must be the same type. Just memcpy.
978  memcpy (dst, src, n*sizeof(D));
979  return;
980  }
981  typedef typename big_enough_float<D>::float_t F;
982  F scale = std::numeric_limits<S>::is_integer ?
983  (F(1)) / F(std::numeric_limits<S>::max()) : F(1);
984  if (std::numeric_limits<D>::is_integer) {
985  // Converting to an integer-like type.
986  F min = (F)_min; // std::numeric_limits<D>::min();
987  F max = (F)_max; // std::numeric_limits<D>::max();
988  scale *= _max;
989  // Unroll loop for speed
990  for ( ; n >= 16; n -= 16) {
991  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
992  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
993  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
994  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
995  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
996  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
997  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
998  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
999  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1000  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1001  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1002  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1003  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1004  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1005  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1006  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1007  }
1008  while (n--)
1009  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
1010  } else {
1011  // Converting to a float-like type, so we don't need to remap
1012  // the range
1013  // Unroll loop for speed
1014  for ( ; n >= 16; n -= 16) {
1015  *dst++ = (D)((*src++) * scale);
1016  *dst++ = (D)((*src++) * scale);
1017  *dst++ = (D)((*src++) * scale);
1018  *dst++ = (D)((*src++) * scale);
1019  *dst++ = (D)((*src++) * scale);
1020  *dst++ = (D)((*src++) * scale);
1021  *dst++ = (D)((*src++) * scale);
1022  *dst++ = (D)((*src++) * scale);
1023  *dst++ = (D)((*src++) * scale);
1024  *dst++ = (D)((*src++) * scale);
1025  *dst++ = (D)((*src++) * scale);
1026  *dst++ = (D)((*src++) * scale);
1027  *dst++ = (D)((*src++) * scale);
1028  *dst++ = (D)((*src++) * scale);
1029  *dst++ = (D)((*src++) * scale);
1030  *dst++ = (D)((*src++) * scale);
1031  }
1032  while (n--)
1033  *dst++ = (D)((*src++) * scale);
1034  }
1035 }
1036 
1037 
1038 #ifndef __CUDA_ARCH__
1039 template<>
1040 inline void convert_type<uint8_t,float> (const uint8_t *src,
1041  float *dst, size_t n,
1042  float /*_min*/, float /*_max*/)
1043 {
1044  float scale (1.0f / std::numeric_limits<uint8_t>::max());
1045  simd::vfloat4 scale_simd (scale);
1046  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1047  simd::vfloat4 s_simd (src);
1048  simd::vfloat4 d_simd = s_simd * scale_simd;
1049  d_simd.store (dst);
1050  }
1051  while (n--)
1052  *dst++ = (*src++) * scale;
1053 }
1054 
1055 
1056 
1057 template<>
1058 inline void convert_type<uint16_t,float> (const uint16_t *src,
1059  float *dst, size_t n,
1060  float /*_min*/, float /*_max*/)
1061 {
1062  float scale (1.0f / std::numeric_limits<uint16_t>::max());
1063  simd::vfloat4 scale_simd (scale);
1064  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1065  simd::vfloat4 s_simd (src);
1066  simd::vfloat4 d_simd = s_simd * scale_simd;
1067  d_simd.store (dst);
1068  }
1069  while (n--)
1070  *dst++ = (*src++) * scale;
1071 }
1072 
1073 
1074 template<>
1075 inline void
1076 convert_type<float,uint16_t> (const float *src, uint16_t *dst, size_t n,
1077  uint16_t /*_min*/, uint16_t /*_max*/)
1078 {
1079  float min = std::numeric_limits<uint16_t>::min();
1080  float max = std::numeric_limits<uint16_t>::max();
1081  float scale = max;
1082  simd::vfloat4 max_simd (max);
1083  simd::vfloat4 zero_simd (0.0f);
1084  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1085  simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
1086  simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
1087  simd::vint4 i (clamped);
1088  i.store (dst);
1089  }
1090  while (n--)
1091  *dst++ = scaled_conversion<float,uint16_t,float> (*src++, scale, min, max);
1092 }
1093 
1094 
1095 template<>
1096 inline void
1097 convert_type<float,uint8_t> (const float *src, uint8_t *dst, size_t n,
1098  uint8_t /*_min*/, uint8_t /*_max*/)
1099 {
1100  float min = std::numeric_limits<uint8_t>::min();
1101  float max = std::numeric_limits<uint8_t>::max();
1102  float scale = max;
1103  simd::vfloat4 max_simd (max);
1104  simd::vfloat4 zero_simd (0.0f);
1105  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1106  simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
1107  simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
1108  simd::vint4 i (clamped);
1109  i.store (dst);
1110  }
1111  while (n--)
1112  *dst++ = scaled_conversion<float,uint8_t,float> (*src++, scale, min, max);
1113 }
1114 
1115 
1116 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1117 template<>
1119 void convert_type<half,float> (const half *src, float *dst, size_t n,
1120  float /*_min*/, float /*_max*/);
1121 template<>
1123 void convert_type<float,half> (const float *src, half *dst, size_t n,
1124  half /*_min*/, half /*_max*/);
1125 
1126 #if OIIO_FMATH_HEADER_ONLY
1127 // Not just the declarations, give the definitions here.
1128 template<>
1129 void convert_type<half,float> (const half *src, float *dst, size_t n,
1130  float /*_min*/, float /*_max*/)
1131 {
1132 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1133  // If f16c ops are enabled, it's worth doing this by 8's
1134  for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1135  simd::vfloat8 s_simd (src);
1136  s_simd.store (dst);
1137  }
1138 #endif
1139 #if OIIO_SIMD >= 4
1140  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1141  simd::vfloat4 s_simd (src);
1142  s_simd.store (dst);
1143  }
1144 #endif
1145  while (n--)
1146  *dst++ = (*src++);
1147 }
1148 
1149 template<>
1150 void
1151 convert_type<float,half> (const float *src, half *dst, size_t n,
1152  half /*_min*/, half /*_max*/)
1153 {
1154 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1155  // If f16c ops are enabled, it's worth doing this by 8's
1156  for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1157  simd::vfloat8 s (src);
1158  s.store (dst);
1159  }
1160 #endif
1161 #if OIIO_SIMD >= 4
1162  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1163  simd::vfloat4 s (src);
1164  s.store (dst);
1165  }
1166 #endif
1167  while (n--)
1168  *dst++ = *src++;
1169 }
1170 #endif /* if OIIO_FMATH_HEADER_ONLY */
1171 #endif /* if defined(IMATH_HALF_H_) */
1172 
1173 #endif /* ifndef __CUDA_ARCH__ */
1174 
1175 
1176 
1177 template<typename S, typename D>
1178 inline void convert_type (const S *src, D *dst, size_t n)
1179 {
1180  convert_type<S,D> (src, dst, n,
1181  std::numeric_limits<D>::min(),
1182  std::numeric_limits<D>::max());
1183 }
1184 
1185 
1186 
1187 
1188 /// Convert a single value from the type of S to the type of D.
1189 /// The conversion is not a simple cast, but correctly remaps the
1190 /// 0.0->1.0 range from and to the full positive range of integral
1191 /// types. Take a copy shortcut if both types are the same and no
1192 /// conversion is necessary.
1193 template<typename S, typename D>
1194 inline D
1195 convert_type (const S &src)
1196 {
1197  if (std::is_same<S,D>::value) {
1198  // They must be the same type. Just return it.
1199  return (D)src;
1200  }
1201  typedef typename big_enough_float<D>::float_t F;
1202  F scale = std::numeric_limits<S>::is_integer ?
1203  F(1) / F(std::numeric_limits<S>::max()) : F(1);
1204  if (std::numeric_limits<D>::is_integer) {
1205  // Converting to an integer-like type.
1206  F min = (F) std::numeric_limits<D>::min();
1207  F max = (F) std::numeric_limits<D>::max();
1208  scale *= max;
1209  return scaled_conversion<S,D,F> (src, scale, min, max);
1210  } else {
1211  // Converting to a float-like type, so we don't need to remap
1212  // the range
1213  return (D)((F)src * scale);
1214  }
1215 }
1216 
1217 
1218 
1219 /// Helper function to convert channel values between different bit depths.
1220 /// Roughly equivalent to:
1221 ///
1222 /// out = round (in * (pow (2, TO_BITS) - 1) / (pow (2, FROM_BITS) - 1));
1223 ///
1224 /// but utilizes an integer math trick for speed. It can be proven that the
1225 /// absolute error of this method is less than or equal to 1, with an average error
1226 /// (with respect to the entire domain) below 0.2.
1227 ///
1228 /// It is assumed that the original value is a valid FROM_BITS integer, i.e.
1229 /// shifted fully to the right.
1230 template<unsigned int FROM_BITS, unsigned int TO_BITS>
1231 inline OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in) {
1232  static_assert(FROM_BITS > 0, "FROM_BITS cannot be 0");
1233  unsigned int out = 0;
1234  int shift = TO_BITS - FROM_BITS;
1235  for (; shift > 0; shift -= FROM_BITS)
1236  out |= in << shift;
1237  out |= in >> -shift;
1238  return out;
1239 }
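// Illustrative sketch (not from the original source): the bit replication
// above maps the extremes of the source range exactly onto the extremes of
// the destination range:
//     bit_range_convert<8,16>(255) == 65535
//     bit_range_convert<2,8>(3)    == 255
//     bit_range_convert<2,8>(1)    == 85    // 0b01 -> 0b01010101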
1240 
1241 
1242 
1243 // non-templated version. Slow but general
1244 inline OIIO_HOSTDEVICE unsigned int
1245 bit_range_convert(unsigned int in, unsigned int FROM_BITS, unsigned int TO_BITS)
1246 {
1247  unsigned int out = 0;
1248  if (FROM_BITS) {
1249  int shift = TO_BITS - FROM_BITS;
1250  for (; shift > 0; shift -= FROM_BITS)
1251  out |= in << shift;
1252  out |= in >> -shift;
1253  }
1254  return out;
1255 }
1256 
1257 
1258 
1259 /// Append the `n` LSB bits of `val` into a bit string `T out[]`, where the
1260 /// `filled` MSB bits of `*out` are already filled in. Increment `out` and
1261 /// adjust `filled` as required. Type `T` should be uint8_t, uint16_t, or
1262 /// uint32_t.
1263 template<typename T>
1264 inline void
1265 bitstring_add_n_bits (T* &out, int& filled, uint32_t val, int n)
1266 {
1267  static_assert(std::is_same<T,uint8_t>::value ||
1268  std::is_same<T,uint16_t>::value ||
1269  std::is_same<T,uint32_t>::value,
1270  "bitstring_add_n_bits must be unsigned int 8/16/32");
1271  const int Tbits = sizeof(T) * 8;
1272  // val: | don't care | copy me |
1273  // <- n bits ->
1274  //
1275  // *out: | don't touch | fill in here |
1276  // <- filled -> <- (Tbits - filled) ->
1277  while (n > 0) {
1278  // Make sure val doesn't have any cruft in bits > n
1279  val &= ~(0xffffffff << n);
1280  // Initialize any new byte we're filling in
1281  if (filled == 0)
1282  *out = 0;
1283  // How many bits can we fill in `*out` without having to increment
1284  // to the next byte?
1285  int bits_left_in_out = Tbits - filled;
1286  int b = 0; // bit pattern to 'or' with *out
1287  int nb = 0; // number of bits to add
1288  if (n <= bits_left_in_out) { // can fit completely in this byte
1289  b = val << (bits_left_in_out - n);
1290  nb = n;
1291  } else { // n > bits_left_in_out, will spill to next byte
1292  b = val >> (n - bits_left_in_out);
1293  nb = bits_left_in_out;
1294  }
1295  *out |= b;
1296  filled += nb;
1297  OIIO_DASSERT (filled <= Tbits);
1298  n -= nb;
1299  if (filled == Tbits) {
1300  ++out;
1301  filled = 0;
1302  }
1303  }
1304 }
1305 
1306 
1307 
1308 /// Pack values from `T in[0..n-1]` (where `T` is expected to be a uint8,
1309 /// uint16, or uint32, into successive raw outbits-bit pieces of `out[]`,
1310 /// where outbits is expected to be less than the number of bits in a `T`.
1311 template<typename T>
1312 inline void
1313 bit_pack(cspan<T> data, void* out, int outbits)
1314 {
1315  static_assert(std::is_same<T,uint8_t>::value ||
1316  std::is_same<T,uint16_t>::value ||
1317  std::is_same<T,uint32_t>::value,
1318  "bit_pack must be unsigned int 8/16/32");
1319  unsigned char* outbuffer = (unsigned char*)out;
1320  int filled = 0;
1321  for (size_t i = 0, e = data.size(); i < e; ++i)
1322  bitstring_add_n_bits (outbuffer, filled, data[i], outbits);
1323 }
1324 
1325 
1326 
1327 /// Decode n packed inbits-bits values from in[...] into normal uint8,
1328 /// uint16, or uint32 representation of `T out[0..n-1]`. In other words,
1329 /// each successive `inbits` of `in` (allowing spanning of byte boundaries)
1330 /// will be stored in a successive out[i].
1331 template<typename T>
1332 inline void
1333 bit_unpack(int n, const unsigned char* in, int inbits, T* out)
1334 {
1335  static_assert(std::is_same<T,uint8_t>::value ||
1336  std::is_same<T,uint16_t>::value ||
1337  std::is_same<T,uint32_t>::value,
1338  "bit_unpack must be unsigned int 8/16/32");
1339  OIIO_DASSERT(inbits >= 1 && inbits < 32); // surely bugs if not
1340  // int highest = (1 << inbits) - 1;
1341  int B = 0, b = 0;
1342  // Invariant:
1343  // So far, we have used in[0..B-1] and the high b bits of in[B].
1344  for (int i = 0; i < n; ++i) {
1345  long long val = 0;
1346  int valbits = 0; // bits so far we've accumulated in val
1347  while (valbits < inbits) {
1348  // Invariant: we have already accumulated valbits of the next
1349  // needed value (of a total of inbits), living in the valbits
1350  // low bits of val.
1351  int out_left = inbits - valbits; // How much more we still need
1352  int in_left = 8 - b; // Bits still available in in[B].
1353  if (in_left <= out_left) {
1354  // Eat the rest of this byte:
1355  // |---------|--------|
1356  // b in_left
1357  val <<= in_left;
1358  val |= in[B] & ~(0xffffffff << in_left);
1359  ++B;
1360  b = 0;
1361  valbits += in_left;
1362  } else {
1363  // Eat just the bits we need:
1364  // |--|---------|-----|
1365  // b out_left extra
1366  val <<= out_left;
1367  int extra = 8 - b - out_left;
1368  val |= (in[B] >> extra) & ~(0xffffffff << out_left);
1369  b += out_left;
1370  valbits = inbits;
1371  }
1372  }
1373  out[i] = val; //T((val * 0xff) / highest);
1374  }
1375 }
1376 
1377 
1378 
1379 /// A DataProxy<I,E> looks like an (E &), but it really holds an (I &)
1380 /// and does conversions (via convert_type) as it reads in and out.
1381 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1382 template<typename I, typename E>
1383 struct DataProxy {
1384  DataProxy (I &data) : m_data(data) { }
1385  E operator= (E newval) { m_data = convert_type<E,I>(newval); return newval; }
1386  operator E () const { return convert_type<I,E>(m_data); }
1387 private:
1388  DataProxy& operator = (const DataProxy&); // Do not implement
1389  I &m_data;
1390 };
1391 
1392 
1393 
1394 /// A ConstDataProxy<I,E> looks like a (const E &), but it really holds
1395 /// a (const I &) and does conversions (via convert_type) as it reads.
1396 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1397 template<typename I, typename E>
1398 struct ConstDataProxy {
1399  ConstDataProxy (const I &data) : m_data(data) { }
1400  operator E () const { return convert_type<I,E>(m_data); }
1401 private:
1402  const I &m_data;
1403 };
1404 
1405 
1406 
1407 /// A DataArrayProxy<I,E> looks like an (E *), but it really holds an (I *)
1408 /// and does conversions (via convert_type) as it reads in and out.
1409 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1410 template<typename I, typename E>
1411 struct DataArrayProxy {
1412  DataArrayProxy (I *data=NULL) : m_data(data) { }
1413  E operator* () const { return convert_type<I,E>(*m_data); }
1414  E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
1415  DataProxy<I,E> operator[] (int i) { return DataProxy<I,E> (m_data[i]); }
1416  void set (I *data) { m_data = data; }
1417  I * get () const { return m_data; }
1418  const DataArrayProxy<I,E> & operator+= (int i) {
1419  m_data += i; return *this;
1420  }
1421 private:
1422  I *m_data;
1423 };
1424 
1425 
1426 
1427 /// A ConstDataArrayProxy<I,E> looks like an (E *), but it really holds an
1428 /// (I *) and does conversions (via convert_type) as it reads in and out.
1429 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1430 template<typename I, typename E>
1431 struct ConstDataArrayProxy {
1432  ConstDataArrayProxy (const I *data=NULL) : m_data(data) { }
1433  E operator* () const { return convert_type<I,E>(*m_data); }
1434  E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
1435  void set (const I *data) { m_data = data; }
1436  const I * get () const { return m_data; }
1437  const ConstDataArrayProxy<I,E> & operator+= (int i) {
1438  m_data += i; return *this;
1439  }
1440 private:
1441  const I *m_data;
1442 };
1443 
1444 
1445 
1446 /// Fast table-based conversion of 8-bit to other types. Declare this
1447 /// as static to avoid the expensive ctr being called all the time.
1448 template <class T=float>
1449 class EightBitConverter {
1450 public:
1451  EightBitConverter () noexcept { init(); }
1452  T operator() (unsigned char c) const noexcept { return val[c]; }
1453 private:
1454  T val[256];
1455  void init () noexcept {
1456  float scale = 1.0f / 255.0f;
1457  if (std::numeric_limits<T>::is_integer)
1458  scale *= (float)std::numeric_limits<T>::max();
1459  for (int i = 0; i < 256; ++i)
1460  val[i] = (T)(i * scale);
1461  }
1462 };
1463 
1464 
1465 
1466 /// Simple conversion of a (presumably non-negative) float into a
1467 /// rational. This does not attempt to find the simplest fraction
1468 /// that approximates the float, for example 52.83 will simply
1469 /// return 5283/100. This does not attempt to gracefully handle
1470 /// floats that are out of range that could be easily int/int.
1471 inline OIIO_HOSTDEVICE void
1472 float_to_rational (float f, unsigned int &num, unsigned int &den)
1473 {
1474  if (f <= 0) { // Trivial case of zero, and handle all negative values
1475  num = 0;
1476  den = 1;
1477  } else if ((int)(1.0/f) == (1.0/f)) { // Exact results for perfect inverses
1478  num = 1;
1479  den = (int)(1.0/f);
1480  } else {
1481  num = (int)f;
1482  den = 1;
1483  while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1484  den *= 10;
1485  f *= 10;
1486  num = (int)f;
1487  }
1488  }
1489 }
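// Illustrative sketch (not from the original source), using values exactly
// representable in float:
//     unsigned int n, d;
//     float_to_rational(1.5f, n, d);    // n == 15, d == 10
//     float_to_rational(0.75f, n, d);   // n == 75, d == 100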
1490 
1491 
1492 
1493 /// Simple conversion of a float into a rational. This does not attempt
1494 /// to find the simplest fraction that approximates the float, for
1495 /// example 52.83 will simply return 5283/100. This does not attempt to
1496 /// gracefully handle floats that are out of range that could be easily
1497 /// int/int.
1498 inline OIIO_HOSTDEVICE void
1499 float_to_rational (float f, int &num, int &den)
1500 {
1501  unsigned int n, d;
1502  float_to_rational (fabsf(f), n, d);
1503  num = (f >= 0) ? (int)n : -(int)n;
1504  den = (int) d;
1505 }
1506 
1507 
1508 // (end of conversion helpers)
1509 ////////////////////////////////////////////////////////////////////////////
1510 
1511 
1512 
1513 
1514 ////////////////////////////////////////////////////////////////////////////
1515 ////////////////////////////////////////////////////////////////////////////
1516 //
1517 // SAFE MATH
1518 //
1519 // The functions named "safe_*" are versions with various internal clamps
1520 // or other deviations from IEEE standards with the specific intent of
1521 // never producing NaN or Inf values or throwing exceptions. But within the
1522 // valid range, they should be full precision and match IEEE standards.
1523 //
1524 
1525 
1526 /// Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
1527 template <typename T>
1528 inline OIIO_HOSTDEVICE T safe_sqrt (T x) {
1529  return x >= T(0) ? std::sqrt(x) : T(0);
1530 }
1531 
1532 /// Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
1533 template <typename T>
1534 inline OIIO_HOSTDEVICE T safe_inversesqrt (T x) {
1535  return x > T(0) ? T(1) / std::sqrt(x) : T(0);
1536 }
1537 
1538 
1539 /// Safe (clamping) arcsine: clamp to the valid domain.
1540 template <typename T>
1541 inline OIIO_HOSTDEVICE T safe_asin (T x) {
1542  if (x <= T(-1)) return T(-M_PI_2);
1543  if (x >= T(+1)) return T(+M_PI_2);
1544  return std::asin(x);
1545 }
1546 
1547 /// Safe (clamping) arccosine: clamp to the valid domain.
1548 template <typename T>
1549 inline OIIO_HOSTDEVICE T safe_acos (T x) {
1550  if (x <= T(-1)) return T(M_PI);
1551  if (x >= T(+1)) return T(0);
1552  return std::acos(x);
1553 }
1554 
1555 
1556 /// Safe log2: clamp to valid domain.
1557 template <typename T>
1558 inline OIIO_HOSTDEVICE T safe_log2 (T x) {
1559  // match clamping from fast version
1560  if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
1561  if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
1562  return std::log2(x);
1563 }
1564 
1565 /// Safe log: clamp to valid domain.
1566 template <typename T>
1567 inline OIIO_HOSTDEVICE T safe_log (T x) {
1568  // slightly different than fast version since clamping happens before scaling
1569  if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
1570  if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
1571  return std::log(x);
1572 }
1573 
1574 /// Safe log10: clamp to valid domain.
1575 template <typename T>
1576 inline OIIO_HOSTDEVICE T safe_log10 (T x) {
1577  // slightly different than fast version since clamping happens before scaling
1578  if (x < std::numeric_limits<T>::min()) x = std::numeric_limits<T>::min();
1579  if (x > std::numeric_limits<T>::max()) x = std::numeric_limits<T>::max();
1580  return log10f(x);
1581 }
1582 
1583 /// Safe logb: clamp to valid domain.
1584 template <typename T>
1585 inline OIIO_HOSTDEVICE T safe_logb (T x) {
1586  return (x != T(0)) ? std::logb(x) : -std::numeric_limits<T>::max();
1587 }
1588 
1589 /// Safe pow: clamp the domain so it never returns Inf or NaN or has divide
1590 /// by zero error.
1591 template <typename T>
1592 inline OIIO_HOSTDEVICE T safe_pow (T x, T y) {
1593  if (y == T(0)) return T(1);
1594  if (x == T(0)) return T(0);
1595  // if x is negative, only deal with integer powers
1596  if ((x < T(0)) && (y != floor(y))) return T(0);
1597  // FIXME: this does not match "fast" variant because clamping limits are different
1598  T r = std::pow(x, y);
1599  // Clamp to avoid returning Inf.
1600  const T big = std::numeric_limits<T>::max();
1601  return clamp (r, -big, big);
1602 }
1603 
1604 
1605 /// Safe fmod: guard against b==0.0 (in which case, return 0.0). Also, it
1606 /// seems that this is much faster than std::fmod!
1607 inline OIIO_HOSTDEVICE float safe_fmod (float a, float b)
1608 {
1609  if (OIIO_LIKELY(b != 0.0f)) {
1610 #if 0
1611  return std::fmod (a,b);
1612  // std::fmod was getting called serially instead of vectorizing, so
1613  // we will just do the calculation ourselves.
1614 #else
1615  // This formulation is equivalent but much faster in our benchmarks,
1616  // also vectorizes better.
1617  // The floating-point remainder of the division operation
1618  // a/b is a - N*b, where N = a/b with its fractional part truncated.
1619  int N = static_cast<int>(a/b);
1620  return a - N*b;
1621 #endif
1622  }
1623  return 0.0f;
1624 }
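// Illustrative sketch (not from the original source) of the truncated-
// quotient formulation above:
//     safe_fmod(7.5f, 2.0f) == 1.5f    // 7.5 - 3*2.0
//     safe_fmod(7.5f, 0.0f) == 0.0f    // guarded b==0 case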
1625 
1626 #define OIIO_FMATH_HAS_SAFE_FMOD 1
1627 
1628 // (end of safe_* functions)
1629 ////////////////////////////////////////////////////////////////////////////
1630 
1631 
1632 
1633 ////////////////////////////////////////////////////////////////////////////
1634 ////////////////////////////////////////////////////////////////////////////
1635 //
1636 // FAST & APPROXIMATE MATH
1637 //
1638 // The functions named "fast_*" provide a set of replacements to libm that
1639 // are much faster at the expense of some accuracy and robust handling of
1640 // extreme values. One design goal for these approximation was to avoid
1641 // branches as much as possible and operate on single precision values only
1642 // so that SIMD versions should be straightforward ports. We also try to
1643 // implement "safe" semantics (ie: clamp to valid range where possible)
1644 // natively since wrapping these inline calls in another layer would be
1645 // wasteful.
1646 //
1647 // The "fast_*" functions are not only possibly differing in value
1648 // (slightly) from the std versions of these functions, but also we do not
1649 // guarantee that the results of "fast" will exactly match from platform to
1650 // platform. This is because if a particular platform (HW, library, or
1651 // compiler) provides an intrinsic that is even faster than our usual "fast"
1652 // implementation, we may substitute it.
1653 //
1654 // Some functions are fast_safe_*, which is both a faster approximation as
1655 // well as clamped input domain to ensure no NaN, Inf, or divide by zero.
1656 //
1657 // NB: When compiling for CUDA devices, selection of the 'fast' intrinsics
1658 // is influenced by compiler options (-ffast-math for clang/gcc,
1659 // --use-fast-math for NVCC).
1660 //
1661 // TODO: Quantify the performance and accuracy of the CUDA Math functions
1662 // relative to the definitions in this file. It may be better in some
1663 // cases to use the approximate versions defined below.
1664 
1665 
1666 /// Round to nearest integer, returning as an int.
1667 /// Note that this differs from std::rint, which returns a float; it's more
1668 /// akin to std::lrint, which returns a long (int). Sorry for the naming
1669 /// confusion.
1670 inline OIIO_HOSTDEVICE int fast_rint (float x) {
1671  // used by sin/cos/tan range reduction
1672 #if OIIO_SIMD_SSE >= 4
1673  // single roundps instruction on SSE4.1+ (for gcc/clang at least)
1674  return static_cast<int>(std::rint(x));
1675 #else
1676  // emulate rounding by adding/subtracting 0.5
1677  return static_cast<int>(x + copysignf(0.5f, x));
1678 #endif
1679 }
1680 
1681 #ifndef __CUDA_ARCH__
1682 inline simd::vint4 fast_rint (const simd::vfloat4& x) {
1683  return simd::rint (x);
1684 }
1685 #endif
1686 
1687 
1688 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin (float x) {
1689 #ifndef __CUDA_ARCH__
1690  // very accurate argument reduction from SLEEF
1691  // starts failing around x=262000
1692  // Results on: [-2pi,2pi]
1693  // Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp, 1.19209e-07 max error
1694  int q = fast_rint (x * float(M_1_PI));
1695  float qf = float(q);
1696  x = madd(qf, -0.78515625f*4, x);
1697  x = madd(qf, -0.00024187564849853515625f*4, x);
1698  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1699  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1700  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1701  float s = x * x;
1702  if ((q & 1) != 0) x = -x;
1703  // this polynomial approximation has very low error on [-pi/2,+pi/2]
1704  // 1.19209e-07 max error in total over [-2pi,+2pi]
1705  float u = 2.6083159809786593541503e-06f;
1706  u = madd(u, s, -0.0001981069071916863322258f);
1707  u = madd(u, s, +0.00833307858556509017944336f);
1708  u = madd(u, s, -0.166666597127914428710938f);
1709  u = madd(s, u * x, x);
1710  // For large x, the argument reduction can fail and the polynomial can
1711  // be evaluated with arguments outside the valid interval. Just clamp
1712  // the bad values away.
1713  return clamp(u, -1.0f, 1.0f);
1714 #else
1715  return __sinf(x);
1716 #endif
1717 }
1718 
1719 
1720 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos (float x) {
1721 #ifndef __CUDA_ARCH__
1722  // same argument reduction as fast_sin
1723  int q = fast_rint (x * float(M_1_PI));
1724  float qf = float(q);
1725  x = madd(qf, -0.78515625f*4, x);
1726  x = madd(qf, -0.00024187564849853515625f*4, x);
1727  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1728  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1729  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1730  float s = x * x;
1731  // polynomial from SLEEF's sincosf, max error is
1732  // 4.33127e-07 over [-2pi,2pi] (98% of values are "exact")
1733  float u = -2.71811842367242206819355e-07f;
1734  u = madd(u, s, +2.47990446951007470488548e-05f);
1735  u = madd(u, s, -0.00138888787478208541870117f);
1736  u = madd(u, s, +0.0416666641831398010253906f);
1737  u = madd(u, s, -0.5f);
1738  u = madd(u, s, +1.0f);
1739  if ((q & 1) != 0) u = -u;
1740  return clamp(u, -1.0f, 1.0f);
1741 #else
1742  return __cosf(x);
1743 #endif
1744 }
1745 
1746 
1747 OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos (float x, float* sine, float* cosine) {
1748 #ifndef __CUDA_ARCH__
1749  // same argument reduction as fast_sin
1750  int q = fast_rint (x * float(M_1_PI));
1751  float qf = float(q);
1752  x = madd(qf, -0.78515625f*4, x);
1753  x = madd(qf, -0.00024187564849853515625f*4, x);
1754  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1755  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1756  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1757  float s = x * x;
1758  // NOTE: same exact polynomials as fast_sin and fast_cos above
1759  if ((q & 1) != 0) x = -x;
1760  float su = 2.6083159809786593541503e-06f;
1761  su = madd(su, s, -0.0001981069071916863322258f);
1762  su = madd(su, s, +0.00833307858556509017944336f);
1763  su = madd(su, s, -0.166666597127914428710938f);
1764  su = madd(s, su * x, x);
1765  float cu = -2.71811842367242206819355e-07f;
1766  cu = madd(cu, s, +2.47990446951007470488548e-05f);
1767  cu = madd(cu, s, -0.00138888787478208541870117f);
1768  cu = madd(cu, s, +0.0416666641831398010253906f);
1769  cu = madd(cu, s, -0.5f);
1770  cu = madd(cu, s, +1.0f);
1771  if ((q & 1) != 0) cu = -cu;
1772  *sine = clamp(su, -1.0f, 1.0f);
1773  *cosine = clamp(cu, -1.0f, 1.0f);
1774 #else
1775  __sincosf(x, sine, cosine);
1776 #endif
1777 }
1778 
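// [Editor's note -- illustrative sketch, not part of the original header.]
// fast_sincos() shares a single argument reduction between both results,
// which is why it is preferable to separate fast_sin/fast_cos calls when
// you need both. A hypothetical polar-to-Cartesian helper:
inline OIIO_HOSTDEVICE void
example_polar_to_cartesian (float r, float theta, float* x, float* y)
{
    float s, c;
    fast_sincos (theta, &s, &c);   // one range reduction, two results
    *x = r * c;
    *y = r * s;
}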
1779 // NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts becoming
1780 // really poor outside of this range because the reciprocal amplifies errors
1781 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan (float x) {
1782 #ifndef __CUDA_ARCH__
1783  // derived from SLEEF implementation
1784  // note that we cannot apply the "denormal crush" trick everywhere because
1785  // we sometimes need to take the reciprocal of the polynomial
1786  int q = fast_rint (x * float(2 * M_1_PI));
1787  float qf = float(q);
1788  x = madd(qf, -0.78515625f*2, x);
1789  x = madd(qf, -0.00024187564849853515625f*2, x);
1790  x = madd(qf, -3.7747668102383613586e-08f*2, x);
1791  x = madd(qf, -1.2816720341285448015e-12f*2, x);
1792  if ((q & 1) == 0)
1793  x = float(M_PI_4) - (float(M_PI_4) - x); // crush denormals (only if we aren't inverting the result later)
1794  float s = x * x;
1795  float u = 0.00927245803177356719970703f;
1796  u = madd(u, s, 0.00331984995864331722259521f);
1797  u = madd(u, s, 0.0242998078465461730957031f);
1798  u = madd(u, s, 0.0534495301544666290283203f);
1799  u = madd(u, s, 0.133383005857467651367188f);
1800  u = madd(u, s, 0.333331853151321411132812f);
1801  u = madd(s, u * x, x);
1802  if ((q & 1) != 0)
1803  u = -1.0f / u;
1804  return u;
1805 #else
1806  return __tanf(x);
1807 #endif
1808 }
1809 
1810 /// Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
1811 /// Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
1812 /// Note that this is MUCH faster, but much less accurate than fast_sin.
1813 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi (float x)
1814 {
1815 #ifndef __CUDA_ARCH__
1816  // Fast trick to strip the integral part off, so our domain is [-1,1]
1817  const float z = x - ((x + 25165824.0f) - 25165824.0f);
1818  const float y = z - z * fabsf(z);
1819  const float Q = 3.10396624f;
1820  const float P = 3.584135056f; // P = 16-4*Q
1821  return y * (Q + P * fabsf(y));
1822  /* The original article used inferior constants for Q and P and
1823  * so had max error 1.091e-3.
1824  *
1825  * The optimal value for Q was determined by exhaustive search, minimizing
1826  * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
1827  * over the interval [0,2] (which is where most of the invocations happen).
1828  *
1829  * The basic idea of this approximation starts with the coarse approximation:
1830  * sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
1831  *
1832  * This approximation always _over_ estimates the target. On the other hand, the
1833  * curve:
1834  * sin(pi*x) ~= f(x) * abs(f(x)) / 4
1835  *
1836  * always lies _under_ the target. Thus we can simply numerically search for the
1837  * optimal constant to LERP these curves into a more precise approximation.
1838  * After folding the constants together and simplifying the resulting math, we
1839  * end up with the compact implementation below.
1840  *
1841  * NOTE: this function actually computes sin(x * pi) which avoids one or two
1842  * mults in many cases and guarantees exact values at integer periods.
1843  */
1844 #else
1845  return sinpif(x);
1846 #endif
1847 }
1848 
1849 /// Fast approximate cos(x*M_PI) with ~0.1% absolute error.
1850 /// Note that this is MUCH faster, but much less accurate than fast_cos.
1851 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi (float x)
1852 {
1853 #ifndef __CUDA_ARCH__
1854  return fast_sinpi (x+0.5f);
1855 #else
1856  return cospif(x);
1857 #endif
1858 }
1859 
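// [Editor's note -- illustrative sketch, not part of the original header.]
// Because fast_sinpi(x) evaluates sin(x*pi), a phase expressed in whole
// turns needs only a factor of 2 rather than 2*pi (hypothetical helper):
inline OIIO_HOSTDEVICE float
example_sin_turns (float turns)
{
    return fast_sinpi (2.0f * turns);   // approximates sinf(2.0f * float(M_PI) * turns)
}
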
1860 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos (float x) {
1861 #ifndef __CUDA_ARCH__
1862  const float f = fabsf(x);
1863  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1864  // based on http://www.pouet.net/topic.php?which=9132&page=2
1865  // 85% accurate (ulp 0)
1866  // Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
1867  // Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
1868  const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1869  return x < 0 ? float(M_PI) - a : a;
1870 #else
1871  return acosf(x);
1872 #endif
1873 }
1874 
1875 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin (float x) {
1876 #ifndef __CUDA_ARCH__
1877  // based on acosf approximation above
1878  // max error is 4.51133e-05 (ulps are higher because we are consistently off by a little amount)
1879  const float f = fabsf(x);
1880  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1881  const float a = float(M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1882  return copysignf(a, x);
1883 #else
1884  return asinf(x);
1885 #endif
1886 }
1887 
1888 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan (float x) {
1889 #ifndef __CUDA_ARCH__
1890  const float a = fabsf(x);
1891  const float k = a > 1.0f ? 1 / a : a;
1892  const float s = 1.0f - (1.0f - k); // crush denormals
1893  const float t = s * s;
1894  // http://mathforum.org/library/drmath/view/62672.html
1895  // the coefficients were tuned in Mathematica with the assumption that we want atan(1)=pi/4
1896  // (slightly higher error but no discontinuities)
1897  // Examined 4278190080 values of atan: 2.53989068 avg ulp diff, 315 max ulp, 9.17912e-06 max error // (with denormals)
1898  // Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 9.17912e-06 max error // (crush denormals)
1899  float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1900  if (a > 1.0f) r = 1.570796326794896557998982f - r;
1901  return copysignf(r, x);
1902 #else
1903  return atanf(x);
1904 #endif
1905 }
1906 
1907 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2 (float y, float x) {
1908 #ifndef __CUDA_ARCH__
1909  // based on atan approximation above
1910  // the special cases around 0 and infinity were tested explicitly
1911  // the only case not handled correctly is x=NaN,y=0 which returns 0 instead of nan
1912  const float a = fabsf(x);
1913  const float b = fabsf(y);
1914  bool b_is_greater_than_a = b > a;
1915 
1916 #if OIIO_FMATH_SIMD_FRIENDLY
1917  // When applying to all lanes in SIMD, we end up doing extra masking and
1918  // 2 divides. So let's just do 1 divide and swap the parameters instead.
1919  // And if we are going to do a divide anyway, when a == b the result
1920  // should be 1.0f anyway, so let's not bother special-casing it.
1921  float sa = b_is_greater_than_a ? b : a;
1922  float sb = b_is_greater_than_a ? a : b;
1923  const float k = (b == 0) ? 0.0f : sb/sa;
1924 #else
1925  const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
1926 #endif
1927 
1928  const float s = 1.0f - (1.0f - k); // crush denormals
1929  const float t = s * s;
1930 
1931  float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1932 
1933  if (b_is_greater_than_a)
1934  r = 1.570796326794896557998982f - r; // account for arg reduction
1935  // TODO: investigate if testing x < 0.0f is more efficient
1936  if (bitcast<unsigned int, float>(x) & 0x80000000u) // test sign bit of x
1937  r = float(M_PI) - r;
1938  return copysignf(r, y);
1939 #else
1940  return atan2f(y, x);
1941 #endif
1942 }
1943 
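// [Editor's note -- illustrative sketch, not part of the original header.]
// Like atan2f, fast_atan2 takes (y, x) and returns the signed angle of the
// 2D vector (x, y). A hypothetical helper converting a direction to a
// heading in degrees, using degrees() defined earlier in this header:
inline OIIO_HOSTDEVICE float
example_heading_degrees (float x, float y)
{
    return degrees (fast_atan2 (y, x));   // result in (-180, 180]
}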
1944 
1945 template<typename T>
1946 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2 (const T& xval) {
1947  using namespace simd;
1948  typedef typename T::vint_t intN;
1949  // See float fast_log2 for explanations
1950  T x = clamp (xval, T(std::numeric_limits<float>::min()), T(std::numeric_limits<float>::max()));
1951  intN bits = bitcast_to_int(x);
1952  intN exponent = srl (bits, 23) - intN(127);
1953  T f = bitcast_to_float ((bits & intN(0x007FFFFF)) | intN(0x3f800000)) - T(1.0f);
1954  T f2 = f * f;
1955  T f4 = f2 * f2;
1956  T hi = madd(f, T(-0.00931049621349f), T( 0.05206469089414f));
1957  T lo = madd(f, T( 0.47868480909345f), T(-0.72116591947498f));
1958  hi = madd(f, hi, T(-0.13753123777116f));
1959  hi = madd(f, hi, T( 0.24187369696082f));
1960  hi = madd(f, hi, T(-0.34730547155299f));
1961  lo = madd(f, lo, T( 1.442689881667200f));
1962  return ((f4 * hi) + (f * lo)) + T(exponent);
1963 }
1964 
1965 
1966 template<>
1967 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log2 (const float& xval) {
1968 #ifndef __CUDA_ARCH__
1969  // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
1970  float x = clamp (xval, std::numeric_limits<float>::min(), std::numeric_limits<float>::max());
1971  // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
1972  unsigned int bits = bitcast<unsigned int, float>(x);
1973  int exponent = int(bits >> 23) - 127;
1974  float f = bitcast<float, unsigned int>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1975  // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
1976  // ulp histogram:
1977  // 0 = 97.46%
1978  // 1 = 2.29%
1979  // 2 = 0.11%
1980  float f2 = f * f;
1981  float f4 = f2 * f2;
1982  float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
1983  float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
1984  hi = madd(f, hi, -0.13753123777116f);
1985  hi = madd(f, hi, 0.24187369696082f);
1986  hi = madd(f, hi, -0.34730547155299f);
1987  lo = madd(f, lo, 1.442689881667200f);
1988  return ((f4 * hi) + (f * lo)) + exponent;
1989 #else
1990  return __log2f(xval);
1991 #endif
1992 }
1993 
1994 
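// [Editor's note -- illustrative sketch, not part of the original header.]
// fast_log2 works by splitting its argument into an integer exponent and a
// mantissa in [1,2), then approximating log2 of the mantissa with a
// polynomial. The same decomposition, written with standard-library calls
// for comparison (hypothetical helper; <cmath> is already included):
inline float
example_log2_by_decomposition (float x)
{
    int e;
    float m = std::frexp (x, &e);                 // x == m * 2^e, m in [0.5, 1)
    return float(e - 1) + std::log2 (2.0f * m);   // == std::log2(x) for finite x > 0
}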
1995 
1996 template<typename T>
1997 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log (const T& x) {
1998  // Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]: 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error
1999  return fast_log2(x) * T(M_LN2);
2000 }
2001 
2002 #ifdef __CUDA_ARCH__
2003 template<>
2004 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log(const float& x)
2005 {
2006  return __logf(x);
2007 }
2008 #endif
2009 
2010 
2011 template<typename T>
2012 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10 (const T& x) {
2013  // Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]: 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error
2014  return fast_log2(x) * T(M_LN2 / M_LN10);
2015 }
2016 
2017 #ifdef __CUDA_ARCH__
2018 template<>
2019 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log10(const float& x)
2020 {
2021  return __log10f(x);
2022 }
2023 #endif
2024 
2025 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb (float x) {
2026 #ifndef __CUDA_ARCH__
2027  // don't bother with denormals
2028  x = fabsf(x);
2029  if (x < std::numeric_limits<float>::min()) x = std::numeric_limits<float>::min();
2030  if (x > std::numeric_limits<float>::max()) x = std::numeric_limits<float>::max();
2031  unsigned int bits = bitcast<unsigned int, float>(x);
2032  return float (int(bits >> 23) - 127);
2033 #else
2034  return logbf(x);
2035 #endif
2036 }
2037 
2038 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p (float x) {
2039 #ifndef __CUDA_ARCH__
2040  if (fabsf(x) < 0.01f) {
2041  float y = 1.0f - (1.0f - x); // crush denormals
2042  return copysignf(madd(-0.5f, y * y, y), x);
2043  } else {
2044  return fast_log(x + 1);
2045  }
2046 #else
2047  return log1pf(x);
2048 #endif
2049 }
2050 
2051 
2052 
2053 template<typename T>
2054 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2 (const T& xval) {
2055  using namespace simd;
2056  typedef typename T::vint_t intN;
2057 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
2058  // See float specialization for explanations
2059  T x = clamp (xval, T(-126.0f), T(126.0f));
2060  intN m (x); x -= T(m);
2061  T one (1.0f);
2062  x = one - (one - x); // crush denormals (does not affect max ulps!)
2063  const T kA (1.33336498402e-3f);
2064  const T kB (9.810352697968e-3f);
2065  const T kC (5.551834031939e-2f);
2066  const T kD (0.2401793301105f);
2067  const T kE (0.693144857883f);
2068  T r (kA);
2069  r = madd(x, r, kB);
2070  r = madd(x, r, kC);
2071  r = madd(x, r, kD);
2072  r = madd(x, r, kE);
2073  r = madd(x, r, one);
2074 
2075  // Original code was:
2076  // return bitcast_to_float (bitcast_to_int(r) + (m << 23));
2077  // This was producing the wrong results when called via `sRGB_to_linear()`
2078  // and `linear_to_sRGB()` on some Windows MSVC builds.
2079  // We found that it was fixed by using a temporary intN, as below.
2080  // Presumed to be an optimizer bug in MSVC versions 16.11.x
2081  // and earlier.
2082  // See PR #3804
2083 
2084  intN i = bitcast_to_int(r);
2085  T f = bitcast_to_float(i + (m << 23));
2086  return f;
2087 #else
2088  T r;
2089  for (int i = 0; i < r.elements; ++i)
2090  r[i] = fast_exp2(xval[i]);
2091  for (int i = r.elements; i < r.paddedelements; ++i)
2092  r[i] = 0.0f;
2093  return r;
2094 #endif
2095 }
2096 
2097 
2098 template<>
2099 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp2 (const float& xval) {
2100 #if OIIO_ANY_CLANG && !OIIO_INTEL_LLVM_COMPILER && OIIO_FMATH_SIMD_FRIENDLY
2101  // Clang was unhappy using the bitcast/memcpy/reinterpret_cast/union inside
2102  // an explicit SIMD loop, so revert to calling the standard version.
2103  return std::exp2(xval);
2104 #elif !defined(__CUDA_ARCH__)
2105  // clamp to safe range for final addition
2106  float x = clamp (xval, -126.0f, 126.0f);
2107  // range reduction
2108  int m = int(x); x -= m;
2109  x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
2110  // 5th degree polynomial generated with sollya
2111  // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
2112  // ulp histogram:
2113  // 0 = 87.81%
2114  // 1 = 4.18%
2115  float r = 1.33336498402e-3f;
2116  r = madd(x, r, 9.810352697968e-3f);
2117  r = madd(x, r, 5.551834031939e-2f);
2118  r = madd(x, r, 0.2401793301105f);
2119  r = madd(x, r, 0.693144857883f);
2120  r = madd(x, r, 1.0f);
2121  // multiply by 2 ^ m by adding in the exponent
2122  // NOTE: left-shift of negative number is undefined behavior
2123  return bitcast<float, unsigned int>(bitcast<unsigned int, float>(r) + (unsigned(m) << 23));
2124  // Clang: loop not vectorized: unsafe dependent memory operations in loop.
2125  // This is why we special case the OIIO_FMATH_SIMD_FRIENDLY above.
2126  // FIXME: as clang releases continue to improve, periodically check if
2127  // this is still the case.
2128 #else
2129  return exp2f(xval);
2130 #endif
2131 }
2132 
2133 
2134 
2135 
2136 template <typename T>
2137 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp (const T& x) {
2138  // Examined 2237485550 values of exp on [-87.3300018,87.3300018]: 2.6666452 avg ulp diff, 230 max ulp
2139  return fast_exp2(x * T(1 / M_LN2));
2140 }
2141 
2142 #ifdef __CUDA_ARCH__
2143 template<>
2144 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp (const float& x) {
2145  return __expf(x);
2146 }
2147 #endif
2148 
2149 
2150 
2151 /// Faster float exp than is in libm, but still 100% accurate
2152 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp (float x)
2153 {
2154 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2155  // On x86_64, versions of glibc < 2.16 have an issue where expf is
2156  // much slower than the double version. This was fixed in glibc 2.16.
2157  return static_cast<float>(std::exp(static_cast<double>(x)));
2158 #else
2159  return std::exp(x);
2160 #endif
2161 }
2162 
2163 
2164 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10 (float x) {
2165 #ifndef __CUDA_ARCH__
2166  // Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: 2.71732409 avg ulp diff, 232 max ulp
2167  return fast_exp2(x * float(M_LN10 / M_LN2));
2168 #else
2169  return __exp10f(x);
2170 #endif
2171 }
2172 
2173 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1 (float x) {
2174 #ifndef __CUDA_ARCH__
2175  if (fabsf(x) < 0.03f) {
2176  float y = 1.0f - (1.0f - x); // crush denormals
2177  return copysignf(madd(0.5f, y * y, y), x);
2178  } else
2179  return fast_exp(x) - 1.0f;
2180 #else
2181  return expm1f(x);
2182 #endif
2183 }
2184 
2185 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh (float x) {
2186 #ifndef __CUDA_ARCH__
2187  float a = fabsf(x);
2188  if (a > 1.0f) {
2189  // Examined 53389559 values of sinh on [1,87.3300018]: 33.6886442 avg ulp diff, 178 max ulp
2190  float e = fast_exp(a);
2191  return copysignf(0.5f * e - 0.5f / e, x);
2192  } else {
2193  a = 1.0f - (1.0f - a); // crush denorms
2194  float a2 = a * a;
2195  // degree 7 polynomial generated with sollya
2196  // Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error
2197  float r = 2.03945513931e-4f;
2198  r = madd(r, a2, 8.32990277558e-3f);
2199  r = madd(r, a2, 0.1666673421859f);
2200  r = madd(r * a, a2, a);
2201  return copysignf(r, x);
2202  }
2203 #else
2204  return sinhf(x);
2205 #endif
2206 }
2207 
2208 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh (float x) {
2209 #ifndef __CUDA_ARCH__
2210  // Examined 2237485550 values of cosh on [-87.3300018,87.3300018]: 1.78256726 avg ulp diff, 178 max ulp
2211  float e = fast_exp(fabsf(x));
2212  return 0.5f * e + 0.5f / e;
2213 #else
2214  return coshf(x);
2215 #endif
2216 }
2217 
2218 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh (float x) {
2219 #ifndef __CUDA_ARCH__
2220  // Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]: 3.12924e-06 max error
2221  // NOTE: ulp error is high because of sub-optimal handling around the origin
2222  float e = fast_exp(2.0f * fabsf(x));
2223  return copysignf(1 - 2 / (1 + e), x);
2224 #else
2225  return tanhf(x);
2226 #endif
2227 }
2228 
2229 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow (float x, float y) {
2230  if (y == 0) return 1.0f; // x^0=1
2231  if (x == 0) return 0.0f; // 0^y=0
2232  // be cheap & exact for special case of squaring and identity
2233  if (y == 1.0f)
2234  return x;
2235  if (y == 2.0f) {
2236 #ifndef __CUDA_ARCH__
2237  return std::min (x*x, std::numeric_limits<float>::max());
2238 #else
2239  return fminf (x*x, std::numeric_limits<float>::max());
2240 #endif
2241  }
2242  float sign = 1.0f;
2243  if (OIIO_UNLIKELY(x < 0.0f)) {
2244  // if x is negative, only deal with integer powers
2245  // powf returns NaN for non-integers, we will return 0 instead
2246  int ybits = bitcast<int, float>(y) & 0x7fffffff;
2247  if (ybits >= 0x4b800000) {
2248  // always even int, keep positive
2249  } else if (ybits >= 0x3f800000) {
2250  // bigger than 1, check
2251  int k = (ybits >> 23) - 127; // get exponent
2252  int j = ybits >> (23 - k); // shift out possible fractional bits
2253  if ((j << (23 - k)) == ybits) // rebuild number and check for a match
2254  sign = bitcast<float, int>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
2255  else
2256  return 0.0f; // not integer
2257  } else {
2258  return 0.0f; // not integer
2259  }
2260  }
2261  return sign * fast_exp2(y * fast_log2(fabsf(x)));
2262 }
2263 
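// [Editor's note -- illustrative sketch, not part of the original header.]
// fast_safe_pow's clamped domain handling (no NaN/Inf for finite inputs,
// negative bases only for integer exponents) makes it convenient for
// per-pixel transfer curves. A hypothetical gamma helper:
inline OIIO_HOSTDEVICE float
example_apply_gamma (float v, float g)
{
    return fast_safe_pow (v, g);   // v < 0 with non-integer g yields 0, not NaN
}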
2264 
2265 // Fast simd pow that only needs to work for positive x
2266 template<typename T, typename U>
2267 OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos (const T& x, const U& y) {
2268  return fast_exp2(y * fast_log2(x));
2269 }
2270 
2271 
2272 // Fast cube root (performs better than using fast_pow above with y=1/3)
2273 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt (float x) {
2274 #ifndef __CUDA_ARCH__
2275  float x0 = fabsf(x);
2276  // from hacker's delight
2277  float a = bitcast<float, int>(0x2a5137a0 + bitcast<int, float>(x0) / 3); // Initial guess.
2278  // Examined 14272478 values of cbrt on [-9.99999935e-39,9.99999935e-39]: 8.14687e-14 max error
2279  // Examined 2131958802 values of cbrt on [9.99999935e-39,3.40282347e+38]: 2.46930719 avg ulp diff, 12 max ulp
2280  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step.
2281  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step again.
2282  a = (x0 == 0) ? 0 : a; // Fix 0 case
2283  return copysignf(a, x);
2284 #else
2285  return cbrtf(x);
2286 #endif
2287 }
2288 
2289 
2290 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf (float x)
2291 {
2292 #ifndef __CUDA_ARCH__
2293  // Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error
2294  // Abramowitz and Stegun, 7.1.28
2295  const float a1 = 0.0705230784f;
2296  const float a2 = 0.0422820123f;
2297  const float a3 = 0.0092705272f;
2298  const float a4 = 0.0001520143f;
2299  const float a5 = 0.0002765672f;
2300  const float a6 = 0.0000430638f;
2301  const float a = fabsf(x);
2302  const float b = 1.0f - (1.0f - a); // crush denormals
2303  const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
2304  const float s = r * r; // ^2
2305  const float t = s * s; // ^4
2306  const float u = t * t; // ^8
2307  const float v = u * u; // ^16
2308  return copysignf(1.0f - 1.0f / v, x);
2309 #else
2310  return erff(x);
2311 #endif
2312 }
2313 
2314 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc (float x)
2315 {
2316 #ifndef __CUDA_ARCH__
2317  // Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error
2318  // ulp histogram:
2319  // 0 = 80.30%
2320  return 1.0f - fast_erf(x);
2321 #else
2322  return erfcf(x);
2323 #endif
2324 }
2325 
2326 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf (float x)
2327 {
2328  // from: Approximating the erfinv function by Mike Giles
2329  // to avoid trouble at the limit, clamp input to 1-eps
2330  float a = fabsf(x); if (a > 0.99999994f) a = 0.99999994f;
2331  float w = -fast_log((1.0f - a) * (1.0f + a)), p;
2332  if (w < 5.0f) {
2333  w = w - 2.5f;
2334  p = 2.81022636e-08f;
2335  p = madd(p, w, 3.43273939e-07f);
2336  p = madd(p, w, -3.5233877e-06f );
2337  p = madd(p, w, -4.39150654e-06f);
2338  p = madd(p, w, 0.00021858087f );
2339  p = madd(p, w, -0.00125372503f );
2340  p = madd(p, w, -0.00417768164f );
2341  p = madd(p, w, 0.246640727f );
2342  p = madd(p, w, 1.50140941f );
2343  } else {
2344  w = sqrtf(w) - 3.0f;
2345  p = -0.000200214257f;
2346  p = madd(p, w, 0.000100950558f);
2347  p = madd(p, w, 0.00134934322f );
2348  p = madd(p, w, -0.00367342844f );
2349  p = madd(p, w, 0.00573950773f );
2350  p = madd(p, w, -0.0076224613f );
2351  p = madd(p, w, 0.00943887047f );
2352  p = madd(p, w, 1.00167406f );
2353  p = madd(p, w, 2.83297682f );
2354  }
2355  return p * x;
2356 }
2357 
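// [Editor's note -- illustrative sketch, not part of the original header.]
// A common use of the inverse error function: mapping a uniform random
// number u in (0,1) to a standard normal deviate through the inverse CDF,
// N^-1(u) = sqrt(2) * erfinv(2u - 1). Hypothetical helper:
inline OIIO_HOSTDEVICE float
example_uniform_to_gaussian (float u)
{
    const float sqrt2 = 1.41421356237f;
    return sqrt2 * fast_ierf (2.0f * u - 1.0f);
}
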
2358 // (end of fast* functions)
2359 ////////////////////////////////////////////////////////////////////////////
2360 
2361 
2362 
2363 
2364 ////////////////////////////////////////////////////////////////////////////
2365 ////////////////////////////////////////////////////////////////////////////
2366 //
2367 // MISCELLANEOUS NUMERICAL METHODS
2368 //
2369 
2370 
2371 /// Solve for the x for which func(x) == y on the interval [xmin,xmax].
2372 /// Use a maximum of maxiters iterations, and stop any time the remaining
2373 /// search interval or the function evaluations <= eps. If brack is
2374 /// non-NULL, set it to true if y is in [f(xmin), f(xmax)], otherwise
2375 /// false (in which case the caller should know that the results may be
2376 /// unreliable). Results are undefined if the function is not monotonic
2377 /// on that interval or if there are multiple roots in the interval (it
2378 /// may not converge, or may converge to any of the roots without
2379 /// telling you that there is more than one).
2380 template<class T, class Func> OIIO_HOSTDEVICE
2381 T invert (Func &func, T y, T xmin=0.0, T xmax=1.0,
2382  int maxiters=32, T eps=1.0e-6, bool *brack=0)
2383 {
2384  // Use the Regula Falsi method, falling back to bisection if it
2385  // hasn't converged after 3/4 of the maximum number of iterations.
2386  // See, e.g., Numerical Recipes for the basic ideas behind both
2387  // methods.
2388  T v0 = func(xmin), v1 = func(xmax);
2389  T x = xmin, v = v0;
2390  bool increasing = (v0 < v1);
2391  T vmin = increasing ? v0 : v1;
2392  T vmax = increasing ? v1 : v0;
2393  bool bracketed = (y >= vmin && y <= vmax);
2394  if (brack)
2395  *brack = bracketed;
2396  if (! bracketed) {
2397  // If our bounds don't bracket the zero, just give up, and
2398  // return the appropriate "edge" of the interval
2399  return ((y < vmin) == increasing) ? xmin : xmax;
2400  }
2401  if (fabs(v0-v1) < eps) // already close enough
2402  return x;
2403  int rfiters = (3*maxiters)/4; // how many times to try regula falsi
2404  for (int iters = 0; iters < maxiters; ++iters) {
2405  T t; // interpolation factor
2406  if (iters < rfiters) {
2407  // Regula falsi
2408  t = (y-v0)/(v1-v0);
2409  if (t <= T(0) || t >= T(1))
2410  t = T(0.5); // RF convergence failure -- bisect instead
2411  } else {
2412  t = T(0.5); // bisection
2413  }
2414  x = lerp (xmin, xmax, t);
2415  v = func(x);
2416  if ((v < y) == increasing) {
2417  xmin = x; v0 = v;
2418  } else {
2419  xmax = x; v1 = v;
2420  }
2421  if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2422  return x; // converged
2423  }
2424  return x;
2425 }
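
// [Editor's note -- illustrative sketch, not part of the original header.]
// Using invert() to numerically invert a monotonic function: solve
// x^3 == y on [0,1], i.e. recover the cube root. The helper and lambda
// names are hypothetical.
inline float
example_invert_cube (float y)
{
    auto cube = [](float x) { return x * x * x; };
    return invert (cube, y);       // defaults: [0,1], 32 iterations, eps = 1e-6
}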
2426 
2427 
2428 
2429 /// Linearly interpolate a list of evenly-spaced knots y[0..len-1] with
2430 /// y[0] corresponding to the value at x==0.0 and y[len-1] corresponding to
2431 /// x==1.0.
2432 inline OIIO_HOSTDEVICE float
2433 interpolate_linear (float x, span_strided<const float> y)
2434 {
2435 #ifndef __CUDA_ARCH__
2436  OIIO_DASSERT_MSG (y.size() >= 2, "interpolate_linear needs at least 2 knot values (has %d)", int(y.size()));
2437 #endif
2438  x = clamp (x, float(0.0), float(1.0));
2439  int nsegs = int(y.size()) - 1;
2440  int segnum;
2441  x = floorfrac (x*nsegs, &segnum);
2442 #ifndef __CUDA_ARCH__
2443  int nextseg = std::min (segnum+1, nsegs);
2444 #else
2445  int nextseg = min (segnum+1, nsegs);
2446 #endif
2447  return lerp (y[segnum], y[nextseg], x);
2448 }
2449 
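// [Editor's note -- illustrative sketch, not part of the original header.]
// interpolate_linear() treats the knots as evenly spaced over x in [0,1].
// For example (values hypothetical), with knots {0, 2, 6}, x = 0.25 falls
// halfway through the first segment:
//
//     const float knots[3] = { 0.0f, 2.0f, 6.0f };
//     float v = OIIO::interpolate_linear (0.25f, knots);   // == 1.0f
//
// (This assumes span_strided<const float> can be constructed directly from
// a C array; otherwise construct it explicitly from a pointer and length.)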
2450 // (end miscellaneous numerical methods)
2451 ////////////////////////////////////////////////////////////////////////////
2452 
2453 
2454 