31 #define OIIO_FMATH_H 1
38 #include <type_traits>
53 #ifndef OIIO_FMATH_HEADER_ONLY
54 # define OIIO_FMATH_HEADER_ONLY 0
76 #ifndef OIIO_FMATH_SIMD_FRIENDLY
77 # define OIIO_FMATH_SIMD_FRIENDLY 0
98 # define M_PI 3.14159265358979323846264338327950288
101 # define M_PI_2 1.57079632679489661923132169163975144
104 # define M_PI_4 0.785398163397448309615660845819875721
107 # define M_TWO_PI (M_PI * 2.0)
110 # define M_1_PI 0.318309886183790671537767526745028724
113 # define M_2_PI 0.636619772367581343075535053490057448
116 # define M_SQRT2 1.41421356237309504880168872420969808
119 # define M_SQRT1_2 0.707106781186547524400844362104849039
122 # define M_LN2 0.69314718055994530941723212145817656
125 # define M_LN10 2.30258509299404568401799145468436421
128 # define M_E 2.71828182845904523536028747135266250
131 # define M_LOG2E 1.44269504088896340735992468100189214
153 return (
x & (
x - 1)) == 0 && (
x >= 0);
194 return x & ~(
x >> 1);
207 template <
typename V,
typename M>
211 value += V(multiple) - 1;
212 return value - (value % V(multiple));
226 return (x + m - 1) & (~(m - 1));
235 template <
typename V,
typename M>
239 value -= V(multiple - 1);
240 return value - (value % V(multiple));
251 uint64_t
r = (uint64_t)a * (uint64_t)
b;
252 return r < Err ? (uint32_t)r : Err;
263 if (b && ab / b != a)
284 "rotl only works for unsigned integer types");
285 return (
x <<
s) | (
x >> ((
sizeof(
T) * 8) -
s));
289 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
296 uint32_t
rotl(uint32_t
x,
int s) noexcept
298 return __funnelshift_lc(
x,
x,
s);
308 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
309 return __funnelshift_lc(x, x, k);
311 return (x << k) | (x >> (32 - k));
318 return (x << k) | (x >> (64 - k));
328 return b ? (a %
b) :
T(0);
348 clamp (
const T&
a,
const T& low,
const T& high)
366 return (a >= low) ? ((a <= high) ? a : high) : low;
371 #ifndef __CUDA_ARCH__
455 template <
class T,
class Q>
460 return v0*(Q(1)-
x) + v1*x;
468 template <
class T,
class Q>
474 return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
483 template <
class T,
class Q>
486 const T *
v2,
const T *
v3,
491 for (
int i = 0; i <
n; ++i)
492 result[i] =
T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
502 template <
class T,
class Q>
505 const T *
v2,
const T *
v3,
510 for (
int i = 0; i <
n; ++i)
511 result[i] +=
T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
512 t*(v2[i]*s1 + v3[i]*s)));
520 template <
class T,
class Q>
522 trilerp (T
v0, T
v1, T
v2, T
v3, T v4, T v5, T v6, T v7, Q
s, Q
t, Q
r)
528 return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
529 r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
538 template <
class T,
class Q>
541 const T *v4,
const T *v5,
const T *v6,
const T *v7,
547 for (
int i = 0; i <
n; ++i)
548 result[i] =
T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
549 r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
559 template <
class T,
class Q>
562 const T *v4,
const T *v5,
const T *v6,
const T *v7,
566 bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
567 bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
574 template <
typename T>
577 T one_frac = 1 - fraction;
578 w[0] =
T(1.0 / 6.0) * one_frac * one_frac * one_frac;
579 w[1] =
T(2.0 / 3.0) -
T(0.5) * fraction * fraction * (2 - fraction);
580 w[2] =
T(2.0 / 3.0) -
T(0.5) * one_frac * one_frac * (2 - one_frac);
581 w[3] =
T(1.0 / 6.0) * fraction * fraction * fraction;
588 template <
typename T>
591 T one_frac = 1 - fraction;
592 dw[0] = -
T(0.5) * one_frac * one_frac;
593 dw[1] =
T(0.5) * fraction * (3 * fraction - 4);
594 dw[2] = -
T(0.5) * one_frac * (3 * one_frac - 4);
595 dw[3] =
T(0.5) * fraction * fraction;
609 for (
int c = 0;
c <
n; ++
c)
613 for (
int j = 0;
j < 4; ++
j) {
614 for (
int i = 0; i < 4; ++i) {
616 for (
int c = 0;
c <
n; ++
c)
617 result[
c] += w * val[
j*4+i][
c];
628 return (
int)floorf(x);
647 return x -
static_cast<float>(i);
652 #ifndef __CUDA_ARCH__
676 template <
typename T>
680 template <
typename T>
702 template <
typename T>
713 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
714 __builtin_sincosf(x, sine, cosine);
715 #elif defined(__CUDA_ARCH__)
725 sincos (
double x,
double* sine,
double* cosine)
727 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
728 __builtin_sincos(x, sine, cosine);
729 #elif defined(__CUDA_ARCH__)
742 return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
764 template <
typename OUT_TYPE,
typename IN_TYPE>
768 static_assert(
sizeof(IN_TYPE) ==
sizeof(OUT_TYPE),
769 "bit_cast must be between objects of the same size");
771 memcpy ((
void *)&out, &
in,
sizeof(IN_TYPE));
775 #if OIIO_VERSION_LESS(3, 0, 0)
781 template <
typename IN_TYPE,
typename OUT_TYPE>
782 #if OIIO_VERSION_GREATER_EQUAL(2, 5, 0)
786 return bitcast<OUT_TYPE, IN_TYPE>(
in);
790 #if defined(__x86_64__) && !defined(__CUDA_ARCH__) && \
791 (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) \
792 || OIIO_CLANG_VERSION >= 100000 || OIIO_APPLE_CLANG_VERSION >= 130000)
801 return static_cast<uint32_t
>(_castf32_u32(
val));
804 return static_cast<int32_t
>(_castf32_u32(
val));
807 return _castu32_f32(
val);
810 return _castu32_f32(
val);
812 template<>
OIIO_FORCEINLINE uint64_t bitcast<uint64_t, double>(
const double&
val) noexcept {
813 return static_cast<uint64_t
>(_castf64_u64(
val));
816 return static_cast<int64_t
>(_castf64_u64(
val));
818 template<>
OIIO_FORCEINLINE double bitcast<double, uint64_t>(
const uint64_t&
val) noexcept {
819 return _castu64_f64(
val);
822 return _castu64_f64(
val);
828 return bitcast<int, float>(
x);
831 return bitcast<float, int>(
x);
843 for (
char *
c = (
char *) f; len--;
c +=
sizeof(
T)) {
844 if (
sizeof(
T) == 2) {
846 }
else if (
sizeof(
T) == 4) {
849 }
else if (
sizeof(
T) == 8) {
859 #if (OIIO_GNUC_VERSION || OIIO_ANY_CLANG || OIIO_INTEL_CLASSIC_COMPILER_VERSION) && !defined(__CUDACC__)
862 template<>
inline void swap_endian(uint16_t*
f,
int len) {
863 for (
int i = 0; i < len; ++i)
864 f[i] = __builtin_bswap16(f[i]);
867 template<>
inline void swap_endian(uint32_t* f,
int len) {
868 for (
int i = 0; i < len; ++i)
869 f[i] = __builtin_bswap32(f[i]);
872 template<>
inline void swap_endian(uint64_t* f,
int len) {
873 for (
int i = 0; i < len; ++i)
874 f[i] = __builtin_bswap64(f[i]);
877 template<>
inline void swap_endian(int16_t* f,
int len) {
878 for (
int i = 0; i < len; ++i)
879 f[i] = __builtin_bswap16(f[i]);
882 template<>
inline void swap_endian(int32_t* f,
int len) {
883 for (
int i = 0; i < len; ++i)
884 f[i] = __builtin_bswap32(f[i]);
887 template<>
inline void swap_endian(int64_t* f,
int len) {
888 for (
int i = 0; i < len; ++i)
889 f[i] = __builtin_bswap64(f[i]);
892 template<>
inline void swap_endian(
float* f,
int len) {
896 template<>
inline void swap_endian(
double* f,
int len) {
900 #elif defined(_MSC_VER) && !defined(__CUDACC__)
903 template<>
inline void swap_endian(uint16_t* f,
int len) {
904 for (
int i = 0; i < len; ++i)
905 f[i] = _byteswap_ushort(f[i]);
908 template<>
inline void swap_endian(uint32_t* f,
int len) {
909 for (
int i = 0; i < len; ++i)
910 f[i] = _byteswap_ulong(f[i]);
913 template<>
inline void swap_endian(uint64_t* f,
int len) {
914 for (
int i = 0; i < len; ++i)
915 f[i] = _byteswap_uint64(f[i]);
918 template<>
inline void swap_endian(int16_t* f,
int len) {
919 for (
int i = 0; i < len; ++i)
920 f[i] = _byteswap_ushort(f[i]);
923 template<>
inline void swap_endian(int32_t* f,
int len) {
924 for (
int i = 0; i < len; ++i)
925 f[i] = _byteswap_ulong(f[i]);
928 template<>
inline void swap_endian(int64_t* f,
int len) {
929 for (
int i = 0; i < len; ++i)
930 f[i] = _byteswap_uint64(f[i]);
949 template<
typename S,
typename D,
typename F>
955 s += (s < 0 ? (F)-0.5 : (F)0.5);
956 return (D)
clamp(s, min, max);
958 return (D)
clamp((F)src * scale + (F)0.5, min, max);
973 template<
typename S,
typename D>
978 memcpy (dst, src, n*
sizeof(D));
990 for ( ; n >= 16; n -= 16) {
991 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
992 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
993 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
994 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
995 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
996 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
997 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
998 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
999 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1000 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1001 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1002 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1003 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1004 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1005 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1006 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1009 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
1014 for ( ; n >= 16; n -= 16) {
1015 *dst++ = (D)((*src++) *
scale);
1016 *dst++ = (D)((*src++) *
scale);
1017 *dst++ = (D)((*src++) *
scale);
1018 *dst++ = (D)((*src++) *
scale);
1019 *dst++ = (D)((*src++) *
scale);
1020 *dst++ = (D)((*src++) *
scale);
1021 *dst++ = (D)((*src++) *
scale);
1022 *dst++ = (D)((*src++) *
scale);
1023 *dst++ = (D)((*src++) *
scale);
1024 *dst++ = (D)((*src++) *
scale);
1025 *dst++ = (D)((*src++) *
scale);
1026 *dst++ = (D)((*src++) *
scale);
1027 *dst++ = (D)((*src++) *
scale);
1028 *dst++ = (D)((*src++) *
scale);
1029 *dst++ = (D)((*src++) *
scale);
1030 *dst++ = (D)((*src++) *
scale);
1033 *dst++ = (D)((*src++) *
scale);
1038 #ifndef __CUDA_ARCH__
1041 float *
dst,
size_t n,
1046 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1052 *dst++ = (*src++) * scale;
1059 float *
dst,
size_t n,
1064 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1070 *dst++ = (*src++) * scale;
1077 uint16_t , uint16_t )
1084 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1091 *dst++ = scaled_conversion<float,uint16_t,float> (*src++,
scale,
min,
max);
1105 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1112 *dst++ = scaled_conversion<float,uint8_t,float> (*src++,
scale,
min,
max);
1116 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1119 void convert_type<half,float> (
const half *
src,
float *
dst,
size_t n,
1123 void convert_type<float,half> (
const float *
src,
half *
dst,
size_t n,
1126 #if OIIO_FMATH_HEADER_ONLY
1129 void convert_type<half,float> (
const half *
src,
float *
dst,
size_t n,
1132 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1134 for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1140 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1151 convert_type<float,half> (
const float *
src, half *
dst,
size_t n,
1154 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1156 for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1162 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1177 template<
typename S,
typename D>
1180 convert_type<S,D> (
src,
dst,
n,
1193 template<
typename S,
typename D>
1213 return (D)((F)src * scale);
1230 template<
unsigned int FROM_BITS,
unsigned int TO_BITS>
1232 static_assert(FROM_BITS > 0,
"FROM_BITS cannot be 0");
1233 unsigned int out = 0;
1234 int shift = TO_BITS - FROM_BITS;
1235 for (; shift > 0; shift -= FROM_BITS)
1237 out |= in >> -shift;
1247 unsigned int out = 0;
1249 int shift = TO_BITS - FROM_BITS;
1250 for (; shift > 0; shift -= FROM_BITS)
1252 out |= in >> -shift;
1263 template<
typename T>
1270 "bitstring_add_n_bits must be unsigned int 8/16/32");
1271 const int Tbits =
sizeof(
T) * 8;
1279 val &= ~(0xffffffff <<
n);
1285 int bits_left_in_out = Tbits - filled;
1288 if (n <= bits_left_in_out) {
1289 b = val << (bits_left_in_out -
n);
1292 b = val >> (n - bits_left_in_out);
1293 nb = bits_left_in_out;
1299 if (filled == Tbits) {
1311 template<
typename T>
1318 "bit_pack must be unsigned int 8/16/32");
1319 unsigned char* outbuffer = (
unsigned char*)out;
1321 for (
size_t i = 0, e = data.
size(); i < e; ++i)
1331 template<
typename T>
1338 "bit_unpack must be unsigned int 8/16/32");
1344 for (
int i = 0; i <
n; ++i) {
1347 while (valbits < inbits) {
1351 int out_left = inbits - valbits;
1352 int in_left = 8 -
b;
1353 if (in_left <= out_left) {
1358 val |= in[
B] & ~(0xffffffff << in_left);
1367 int extra = 8 -
b - out_left;
1368 val |= (in[
B] >> extra) & ~(0xffffffff << out_left);
1382 template<
typename I,
typename E>
1385 E
operator= (E newval) { m_data = convert_type<E,I>(newval);
return newval; }
1386 operator E ()
const {
return convert_type<I,E>(m_data); }
1397 template<
typename I,
typename E>
1400 operator E ()
const {
return convert_type<E,I>(*m_data); }
1410 template<
typename I,
typename E>
1414 E
operator[] (
int i)
const {
return convert_type<I,E>(m_data[i]); }
1417 I *
get ()
const {
return m_data; }
1419 m_data += i;
return *
this;
1430 template<
typename I,
typename E>
1434 E
operator[] (
int i)
const {
return convert_type<I,E>(m_data[i]); }
1436 const I *
get ()
const {
return m_data; }
1438 m_data += i;
return *
this;
1448 template <
class T=
float>
1455 void init () noexcept {
1456 float scale = 1.0f / 255.0f;
1459 for (
int i = 0; i < 256; ++i)
1477 }
else if ((
int)(1.0/f) == (1.0/f)) {
1483 while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1503 num = (f >= 0) ? (
int)n : -(
int)n;
1527 template <
typename T>
1533 template <
typename T>
1540 template <
typename T>
1548 template <
typename T>
1550 if (x <=
T(-1))
return T(
M_PI);
1551 if (x >=
T(+1))
return T(0);
1557 template <
typename T>
1562 return std::log2(x);
1566 template <
typename T>
1575 template <
typename T>
1584 template <
typename T>
1591 template <
typename T>
1593 if (y ==
T(0))
return T(1);
1594 if (x ==
T(0))
return T(0);
1596 if ((x <
T(0)) && (y !=
floor(y)))
return T(0);
1601 return clamp (r, -big, big);
1619 int N =
static_cast<int>(a/
b);
1626 #define OIIO_FMATH_HAS_SAFE_FMOD 1
1672 #if OIIO_SIMD_SSE >= 4
1677 return static_cast<int>(x +
copysignf(0.5f, x));
1681 #ifndef __CUDA_ARCH__
1689 #ifndef __CUDA_ARCH__
1695 float qf =
float(q);
1696 x =
madd(qf, -0.78515625f*4, x);
1697 x =
madd(qf, -0.00024187564849853515625f*4, x);
1698 x =
madd(qf, -3.7747668102383613586e-08f*4, x);
1699 x =
madd(qf, -1.2816720341285448015e-12f*4, x);
1702 if ((q & 1) != 0) x = -x;
1705 float u = 2.6083159809786593541503e-06
f;
1706 u =
madd(u, s, -0.0001981069071916863322258f);
1707 u =
madd(u, s, +0.00833307858556509017944336f);
1708 u =
madd(u, s, -0.166666597127914428710938f);
1709 u =
madd(s, u * x, x);
1713 return clamp(u, -1.0f, 1.0f);
1721 #ifndef __CUDA_ARCH__
1724 float qf =
float(q);
1725 x =
madd(qf, -0.78515625f*4, x);
1726 x =
madd(qf, -0.00024187564849853515625f*4, x);
1727 x =
madd(qf, -3.7747668102383613586e-08f*4, x);
1728 x =
madd(qf, -1.2816720341285448015e-12f*4, x);
1733 float u = -2.71811842367242206819355e-07
f;
1734 u =
madd(u, s, +2.47990446951007470488548e-05f);
1735 u =
madd(u, s, -0.00138888787478208541870117f);
1736 u =
madd(u, s, +0.0416666641831398010253906f);
1737 u =
madd(u, s, -0.5f);
1738 u =
madd(u, s, +1.0f);
1739 if ((q & 1) != 0) u = -u;
1740 return clamp(u, -1.0f, 1.0f);
1748 #ifndef __CUDA_ARCH__
1751 float qf =
float(q);
1752 x =
madd(qf, -0.78515625f*4, x);
1753 x =
madd(qf, -0.00024187564849853515625f*4, x);
1754 x =
madd(qf, -3.7747668102383613586e-08f*4, x);
1755 x =
madd(qf, -1.2816720341285448015e-12f*4, x);
1759 if ((q & 1) != 0) x = -x;
1760 float su = 2.6083159809786593541503e-06
f;
1761 su =
madd(su, s, -0.0001981069071916863322258f);
1762 su =
madd(su, s, +0.00833307858556509017944336f);
1763 su =
madd(su, s, -0.166666597127914428710938f);
1764 su =
madd(s, su * x, x);
1765 float cu = -2.71811842367242206819355e-07
f;
1766 cu =
madd(cu, s, +2.47990446951007470488548e-05f);
1767 cu =
madd(cu, s, -0.00138888787478208541870117f);
1768 cu =
madd(cu, s, +0.0416666641831398010253906f);
1769 cu =
madd(cu, s, -0.5f);
1770 cu =
madd(cu, s, +1.0f);
1771 if ((q & 1) != 0) cu = -cu;
1772 *sine =
clamp(su, -1.0f, 1.0f);;
1773 *cosine =
clamp(cu, -1.0f, 1.0f);;
1775 __sincosf(x, sine, cosine);
1782 #ifndef __CUDA_ARCH__
1787 float qf =
float(q);
1788 x =
madd(qf, -0.78515625f*2, x);
1789 x =
madd(qf, -0.00024187564849853515625f*2, x);
1790 x =
madd(qf, -3.7747668102383613586e-08f*2, x);
1791 x =
madd(qf, -1.2816720341285448015e-12f*2, x);
1795 float u = 0.00927245803177356719970703f;
1796 u =
madd(u, s, 0.00331984995864331722259521f);
1797 u =
madd(u, s, 0.0242998078465461730957031f);
1798 u =
madd(u, s, 0.0534495301544666290283203f);
1799 u =
madd(u, s, 0.133383005857467651367188f);
1800 u =
madd(u, s, 0.333331853151321411132812f);
1801 u =
madd(s, u * x, x);
1815 #ifndef __CUDA_ARCH__
1817 const float z = x - ((x + 25165824.0f) - 25165824.0f);
1818 const float y = z - z * fabsf(z);
1819 const float Q = 3.10396624f;
1820 const float P = 3.584135056f;
1821 return y * (Q + P * fabsf(y));
1853 #ifndef __CUDA_ARCH__
1861 #ifndef __CUDA_ARCH__
1862 const float f = fabsf(x);
1863 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
1868 const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1876 #ifndef __CUDA_ARCH__
1879 const float f = fabsf(x);
1880 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
1881 const float a =
float(
M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1889 #ifndef __CUDA_ARCH__
1890 const float a = fabsf(x);
1891 const float k = a > 1.0f ? 1 / a :
a;
1892 const float s = 1.0f - (1.0f - k);
1893 const float t = s *
s;
1899 float r = s *
madd(0.430165678f, t, 1.0f) /
madd(
madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1900 if (a > 1.0f) r = 1.570796326794896557998982f -
r;
1908 #ifndef __CUDA_ARCH__
1912 const float a = fabsf(x);
1913 const float b = fabsf(y);
1914 bool b_is_greater_than_a = b >
a;
1916 #if OIIO_FMATH_SIMD_FRIENDLY
1921 float sa = b_is_greater_than_a ? b :
a;
1922 float sb = b_is_greater_than_a ? a :
b;
1923 const float k = (b == 0) ? 0.0f : sb/sa;
1925 const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b /
a));
1928 const float s = 1.0f - (1.0f - k);
1929 const float t = s *
s;
1931 float r = s *
madd(0.430165678f, t, 1.0f) /
madd(
madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1933 if (b_is_greater_than_a)
1934 r = 1.570796326794896557998982f -
r;
1936 if (bitcast<unsigned int, float>(x) & 0x80000000u)
1945 template<
typename T>
1947 using namespace simd;
1948 typedef typename T::vint_t intN;
1952 intN exponent =
srl (bits, 23) - intN(127);
1956 T hi =
madd(f,
T(-0.00931049621349f),
T( 0.05206469089414f));
1957 T lo =
madd(f,
T( 0.47868480909345f),
T(-0.72116591947498f));
1958 hi =
madd(f, hi,
T(-0.13753123777116f));
1959 hi =
madd(f, hi,
T( 0.24187369696082f));
1960 hi =
madd(f, hi,
T(-0.34730547155299f));
1961 lo =
madd(f, lo,
T( 1.442689881667200f));
1962 return ((f4 * hi) + (f * lo)) +
T(exponent);
1968 #ifndef __CUDA_ARCH__
1972 unsigned int bits = bitcast<unsigned int, float>(
x);
1973 int exponent =
int(bits >> 23) - 127;
1974 float f = bitcast<float, unsigned int>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1982 float hi =
madd(f, -0.00931049621349f, 0.05206469089414f);
1983 float lo =
madd(f, 0.47868480909345f, -0.72116591947498f);
1984 hi =
madd(f, hi, -0.13753123777116f);
1985 hi =
madd(f, hi, 0.24187369696082f);
1986 hi =
madd(f, hi, -0.34730547155299f);
1987 lo =
madd(f, lo, 1.442689881667200f);
1988 return ((f4 * hi) + (f * lo)) + exponent;
1990 return __log2f(xval);
1996 template<
typename T>
2002 #ifdef __CUDA_ARCH__
2011 template<
typename T>
2017 #ifdef __CUDA_ARCH__
2026 #ifndef __CUDA_ARCH__
2031 unsigned int bits = bitcast<unsigned int, float>(
x);
2032 return float (
int(bits >> 23) - 127);
2039 #ifndef __CUDA_ARCH__
2040 if (fabsf(x) < 0.01f) {
2041 float y = 1.0f - (1.0f -
x);
2053 template<
typename T>
2055 using namespace simd;
2056 typedef typename T::vint_t intN;
2057 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
2059 T x =
clamp (xval,
T(-126.0f),
T(126.0f));
2060 intN m (
x);
x -=
T(m);
2062 x = one - (one -
x);
2063 const T kA (1.33336498402e-3f);
2064 const T kB (9.810352697968e-3f);
2065 const T kC (5.551834031939e-2f);
2066 const T kD (0.2401793301105f);
2067 const T kE (0.693144857883f);
2073 r =
madd(
x, r, one);
2089 for (
int i = 0; i < r.elements; ++i)
2091 for (
int i = r.elements; i < r.paddedelements; ++i)
2100 #if OIIO_ANY_CLANG && !OIIO_INTEL_LLVM_COMPILER && OIIO_FMATH_SIMD_FRIENDLY
2103 return std::exp2(xval);
2104 #elif !defined(__CUDA_ARCH__)
2106 float x =
clamp (xval, -126.0f, 126.0f);
2108 int m =
int(x); x -= m;
2109 x = 1.0f - (1.0f -
x);
2115 float r = 1.33336498402e-3
f;
2116 r =
madd(x, r, 9.810352697968e-3f);
2117 r =
madd(x, r, 5.551834031939e-2f);
2118 r =
madd(x, r, 0.2401793301105f);
2119 r =
madd(x, r, 0.693144857883f);
2120 r =
madd(x, r, 1.0f);
2123 return bitcast<float, unsigned int>(bitcast<unsigned int, float>(
r) + (
unsigned(m) << 23));
2136 template <
typename T>
2142 #ifdef __CUDA_ARCH__
2154 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2157 return static_cast<float>(std::exp(static_cast<double>(x)));
2165 #ifndef __CUDA_ARCH__
2174 #ifndef __CUDA_ARCH__
2175 if (fabsf(x) < 0.03f) {
2176 float y = 1.0f - (1.0f -
x);
2186 #ifndef __CUDA_ARCH__
2191 return copysignf(0.5f * e - 0.5f / e, x);
2193 a = 1.0f - (1.0f -
a);
2197 float r = 2.03945513931e-4
f;
2198 r =
madd(r, a2, 8.32990277558e-3f);
2199 r =
madd(r, a2, 0.1666673421859f);
2200 r =
madd(r * a, a2, a);
2209 #ifndef __CUDA_ARCH__
2212 return 0.5f * e + 0.5f / e;
2219 #ifndef __CUDA_ARCH__
2222 float e =
fast_exp(2.0f * fabsf(x));
2230 if (y == 0)
return 1.0f;
2231 if (x == 0)
return 0.0f;
2236 #ifndef __CUDA_ARCH__
2246 int ybits = bitcast<int, float>(
y) & 0x7fffffff;
2247 if (ybits >= 0x4b800000) {
2249 }
else if (ybits >= 0x3f800000) {
2251 int k = (ybits >> 23) - 127;
2252 int j = ybits >> (23 - k);
2253 if ((j << (23 - k)) == ybits)
2254 sign = bitcast<float, int>(0x3f800000 | (j << 31));
2266 template<
typename T,
typename U>
2274 #ifndef __CUDA_ARCH__
2275 float x0 = fabsf(x);
2277 float a = bitcast<float, int>(0x2a5137a0 + bitcast<int, float>(x0) / 3);
2280 a = 0.333333333f * (2.0f * a + x0 / (a *
a));
2281 a = 0.333333333f * (2.0f * a + x0 / (a *
a));
2282 a = (x0 == 0) ? 0 : a;
2292 #ifndef __CUDA_ARCH__
2295 const float a1 = 0.0705230784f;
2296 const float a2 = 0.0422820123f;
2297 const float a3 = 0.0092705272f;
2298 const float a4 = 0.0001520143f;
2299 const float a5 = 0.0002765672f;
2300 const float a6 = 0.0000430638f;
2301 const float a = fabsf(x);
2302 const float b = 1.0f - (1.0f -
a);
2303 const float r =
madd(
madd(
madd(
madd(
madd(
madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
2304 const float s = r *
r;
2305 const float t = s *
s;
2306 const float u = t *
t;
2307 const float v = u * u;
2316 #ifndef __CUDA_ARCH__
2330 float a = fabsf(x);
if (a > 0.99999994f) a = 0.99999994f;
2331 float w = -
fast_log((1.0f - a) * (1.0f + a)), p;
2334 p = 2.81022636e-08
f;
2335 p =
madd(p, w, 3.43273939e-07f);
2336 p =
madd(p, w, -3.5233877e-06f );
2337 p =
madd(p, w, -4.39150654e-06f);
2338 p =
madd(p, w, 0.00021858087f );
2339 p =
madd(p, w, -0.00125372503f );
2340 p =
madd(p, w, -0.00417768164f );
2341 p =
madd(p, w, 0.246640727f );
2342 p =
madd(p, w, 1.50140941f );
2344 w = sqrtf(w) - 3.0f;
2345 p = -0.000200214257f;
2346 p =
madd(p, w, 0.000100950558f);
2347 p =
madd(p, w, 0.00134934322f );
2348 p =
madd(p, w, -0.00367342844f );
2349 p =
madd(p, w, 0.00573950773f );
2350 p =
madd(p, w, -0.0076224613f );
2351 p =
madd(p, w, 0.00943887047f );
2352 p =
madd(p, w, 1.00167406f );
2353 p =
madd(p, w, 2.83297682f );
2382 int maxiters=32, T eps=1.0e-6,
bool *brack=0)
2390 bool increasing = (v0 <
v1);
2391 T vmin = increasing ? v0 :
v1;
2392 T vmax = increasing ?
v1 :
v0;
2393 bool bracketed = (y >= vmin && y <= vmax);
2399 return ((y < vmin) == increasing) ? xmin : xmax;
2401 if (fabs(v0-
v1) < eps)
2403 int rfiters = (3*maxiters)/4;
2404 for (
int iters = 0; iters < maxiters; ++iters) {
2406 if (iters < rfiters) {
2409 if (t <=
T(0) || t >=
T(1))
2414 x =
lerp (xmin, xmax, t);
2416 if ((v < y) == increasing) {
2421 if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2435 #ifndef __CUDA_ARCH__
2438 x =
clamp (x,
float(0.0),
float(1.0));
2439 int nsegs =
int(y.
size()) - 1;
2442 #ifndef __CUDA_ARCH__
2443 int nextseg =
std::min (segnum+1, nsegs);
2445 int nextseg =
min (segnum+1, nsegs);
2447 return lerp (y[segnum], y[nextseg], x);
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow(float x, float y)
SYS_API float acosf(float x)
vint4 max(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp(float x)
Faster float exp than is in libm, but still 100% accurate.
SYS_API double cos(double x)
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos(const T &x, const U &y)
SYS_API float coshf(float x)
SYS_API double fmod(double x, double y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log(const T &x)
bool_constant< is_integral< T >::value &&!std::is_same< T, bool >::value &&!std::is_same< T, char >::value &&!std::is_same< T, wchar_t >::value > is_integer
EightBitConverter() noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2(float y, float x)
SYS_API float log1pf(float x)
vfloat4 bitcast_to_float(const vint4 &x)
OIIO_HOSTDEVICE void evalBSplineWeights(T w[4], T fraction)
OIIO_HOSTDEVICE V round_down_to_multiple(V value, M multiple)
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
vint4 srl(const vint4 &val, const unsigned int bits)
void bit_unpack(int n, const unsigned char *in, int inbits, T *out)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan(float x)
IMATH_HOSTDEVICE constexpr int floor(T x) IMATH_NOEXCEPT
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin(T x)
Safe (clamping) arcsine: clamp to the valid domain.
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
const DataArrayProxy< I, E > & operator+=(int i)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub(float a, float b, float c)
Negative fused multiply and subtract: -(a*b) - c.
GLsizei const GLfloat * value
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd(float a, float b, float c)
Fused negative multiply and add: -(a*b) + c.
vfloat4 sqrt(const vfloat4 &a)
GLdouble GLdouble GLdouble z
OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod(float a, float b)
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
GLboolean GLboolean GLboolean GLboolean a
SYS_API float atan2f(float y, float x)
ImageBuf OIIO_API min(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: (a*b - c)
OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in)
OIIO_HOSTDEVICE V round_to_multiple(V value, M multiple)
**But if you need a result
GLfloat GLfloat GLfloat v2
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int floor2(int x) noexcept
GLdouble GLdouble GLdouble q
Integer 8-vector, accelerated by SIMD instructions when available.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int ifloor(float x)
Return floor(x) cast to an int.
GLfloat GLfloat GLfloat GLfloat v3
OIIO_HOSTDEVICE int pow2roundup(int x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf(float x)
void convert_type< float, uint8_t >(const float *src, uint8_t *dst, size_t n, uint8_t, uint8_t)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos(float x)
constexpr size_type size() const noexcept
OIIO_HOSTDEVICE float floorfrac(float x, int *xint)
constexpr auto in(type t, int set) -> bool
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh(float x)
SYS_API float log10f(float x)
ImageBuf OIIO_API pow(const ImageBuf &A, cspan< float > B, ROI roi={}, int nthreads=0)
vfloat4 floor(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2(T x)
Safe log2: clamp to valid domain.
OIIO_HOSTDEVICE int pow2rounddown(int x)
GA_API const UT_StringHolder scale
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE T rotl(T x, int s) noexcept
void convert_type(const S *src, D *dst, size_t n, D _min, D _max)
SYS_API double asin(double x)
DataArrayProxy(I *data=NULL)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos(float x)
Integer 4-vector, accelerated by SIMD instructions when available.
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int ceil2(int x) noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf(float x)
SYS_API float atanf(float x)
OIIO_HOSTDEVICE void trilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, const T *v4, const T *v5, const T *v6, const T *v7, Q s, Q t, Q r, Q scale, int n, T *result)
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T clamp(const T &a, const T &low, const T &high)
clamp a to bounds [low,high].
void bit_pack(cspan< T > data, void *out, int outbits)
SYS_API void sincosf(float x, float *s, float *c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh(float x)
OIIO_HOSTDEVICE void bilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, Q s, Q t, Q scale, int n, T *result)
OIIO_HOSTDEVICE T invert(Func &func, T y, T xmin=0.0, T xmax=1.0, int maxiters=32, T eps=1.0e-6, bool *brack=0)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
T operator()(unsigned char c) const noexcept
void store(float *values) const
OIIO_HOSTDEVICE T round_to_multiple_of_pow2(T x, T m)
constexpr size_type size() const noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bitcast(const IN_TYPE &in) noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10(T x)
Safe log10: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t rotl32(uint32_t x, int k)
SYS_API double acos(double x)
ConstDataProxy(const I &data)
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool ispow2(T x) noexcept
OIIO_HOSTDEVICE float sign(float x)
vfloat4 round(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log(T x)
Safe log: clamp to valid domain.
GLboolean GLboolean GLboolean b
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2(const T &xval)
OIIO_HOSTDEVICE float interpolate_linear(float x, span_strided< const float > y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow(T x, T y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos(float x, float *sine, float *cosine)
ConstDataArrayProxy(const I *data=NULL)
Integer 16-vector, accelerated by SIMD instructions when available.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi(float x)
std::integral_constant< bool, std::numeric_limits< T >::is_signed||std::is_same< T, int128_opt >::value > is_signed
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb(float x)
IMATH_NAMESPACE::V2f IMATH_NAMESPACE::Box2i std::string this attribute is obsolete as of OpenEXR v3 float
OIIO_FORCEINLINE OIIO_HOSTDEVICE T lerp(const T &v0, const T &v1, const Q &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T bilerp(const T &v0, const T &v1, const T &v2, const T &v3, const Q &s, const Q &t)
void store(int *values) const
Store the values into memory.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint(float x)
FMT_CONSTEXPR20 auto bit_cast(const From &from) -> To
SYS_API float sinhf(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt(T x)
Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1(float x)
SYS_API float copysignf(float x, float y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float(int x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_neg(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt(float x)
OIIO_HOSTDEVICE void float_to_rational(float f, unsigned int &num, unsigned int &den)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_mod(T a, T b)
void convert_type< uint8_t, float >(const uint8_t *src, float *dst, size_t n, float, float)
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
GA_API const UT_StringHolder N
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos(T x)
Safe (clamping) arccosine: clamp to the valid domain.
SYS_API float asinf(float x)
GLubyte GLubyte GLubyte GLubyte w
SYS_API float expm1f(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb(T x)
Safe logb: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p(float x)
OIIO_HOSTDEVICE D scaled_conversion(const S &src, F scale, F min, F max)
E operator[](int i) const
void convert_type< uint16_t, float >(const uint16_t *src, float *dst, size_t n, float, float)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T trilerp(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
OIIO_HOSTDEVICE uint32_t clamped_mult32(uint32_t a, uint32_t b)
#define OIIO_NAMESPACE_END
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t rotl64(uint64_t x, int k)
const ConstDataArrayProxy< I, E > & operator+=(int i)
vint4 min(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE T log(const T &v)
OIIO_HOSTDEVICE void swap_endian(T *f, int len=1)
void bitstring_add_n_bits(T *&out, int &filled, uint32_t val, int n)
Classes for SIMD processing.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin(float x)
vint4 rint(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians(T deg)
Convert degrees to radians.
void convert_type< float, uint16_t >(const float *src, uint16_t *dst, size_t n, uint16_t, uint16_t)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc(float x)
SYS_API float tanhf(float x)
OIIO_HOSTDEVICE void bicubic_interp(const T **val, T s, T t, int n, T *result)
OIIO_HOSTDEVICE uint64_t clamped_mult64(uint64_t a, uint64_t b)
SYS_API double sin(double x)
E operator[](int i) const
OIIO_HOSTDEVICE void evalBSplineWeightDerivs(T dw[4], T fraction)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt(T x)
Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2(const T &xval)
#define OIIO_NAMESPACE_BEGIN
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees(T rad)
Convert radians to degrees.