31 #define OIIO_FMATH_H 1
38 #include <type_traits>
68 #ifndef OIIO_FMATH_SIMD_FRIENDLY
69 # define OIIO_FMATH_SIMD_FRIENDLY 0
90 # define M_PI 3.14159265358979323846264338327950288
93 # define M_PI_2 1.57079632679489661923132169163975144
96 # define M_PI_4 0.785398163397448309615660845819875721
99 # define M_TWO_PI (M_PI * 2.0)
102 # define M_1_PI 0.318309886183790671537767526745028724
105 # define M_2_PI 0.636619772367581343075535053490057448
108 # define M_SQRT2 1.41421356237309504880168872420969808
111 # define M_SQRT1_2 0.707106781186547524400844362104849039
114 # define M_LN2 0.69314718055994530941723212145817656
117 # define M_LN10 2.30258509299404568401799145468436421
120 # define M_E 2.71828182845904523536028747135266250
123 # define M_LOG2E 1.44269504088896340735992468100189214
145 return (
x & (
x - 1)) == 0 && (
x >= 0);
186 return x & ~(
x >> 1);
198 template <
typename V,
typename M>
201 return V (((value + V(multiple) - 1) / V(multiple)) * V(multiple));
215 return (x + m - 1) & (~(m - 1));
226 uint64_t
r = (uint64_t)a * (uint64_t)
b;
227 return r < Err ? (uint32_t)r : Err;
238 if (b && ab / b != a)
259 "rotl only works for unsigned integer types");
260 return (
x <<
s) | (
x >> ((
sizeof(
T) * 8) -
s));
264 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
271 uint32_t
rotl(uint32_t
x,
int s) noexcept
273 return __funnelshift_lc(
x,
x,
s);
283 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
284 return __funnelshift_lc(x, x, k);
286 return (x << k) | (x >> (32 - k));
293 return (x << k) | (x >> (64 - k));
303 return b ? (a %
b) :
T(0);
323 clamp (
const T&
a,
const T& low,
const T& high)
341 return (a >= low) ? ((a <= high) ? a : high) : low;
346 #ifndef __CUDA_ARCH__
430 template <
class T,
class Q>
435 return v0*(Q(1)-
x) + v1*x;
443 template <
class T,
class Q>
449 return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
458 template <
class T,
class Q>
461 const T *
v2,
const T *
v3,
466 for (
int i = 0; i <
n; ++i)
467 result[i] =
T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
477 template <
class T,
class Q>
480 const T *
v2,
const T *
v3,
485 for (
int i = 0; i <
n; ++i)
486 result[i] +=
T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
487 t*(v2[i]*s1 + v3[i]*s)));
495 template <
class T,
class Q>
497 trilerp (T
v0, T
v1, T
v2, T
v3, T v4, T v5, T v6, T v7, Q
s, Q
t, Q
r)
503 return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
504 r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
513 template <
class T,
class Q>
516 const T *v4,
const T *v5,
const T *v6,
const T *v7,
522 for (
int i = 0; i <
n; ++i)
523 result[i] =
T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
524 r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
534 template <
class T,
class Q>
537 const T *v4,
const T *v5,
const T *v6,
const T *v7,
541 bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
542 bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
549 template <
typename T>
552 T one_frac = 1 - fraction;
553 w[0] =
T(1.0 / 6.0) * one_frac * one_frac * one_frac;
554 w[1] =
T(2.0 / 3.0) -
T(0.5) * fraction * fraction * (2 - fraction);
555 w[2] =
T(2.0 / 3.0) -
T(0.5) * one_frac * one_frac * (2 - one_frac);
556 w[3] =
T(1.0 / 6.0) * fraction * fraction * fraction;
563 template <
typename T>
566 T one_frac = 1 - fraction;
567 dw[0] = -
T(0.5) * one_frac * one_frac;
568 dw[1] =
T(0.5) * fraction * (3 * fraction - 4);
569 dw[2] = -
T(0.5) * one_frac * (3 * one_frac - 4);
570 dw[3] =
T(0.5) * fraction * fraction;
584 for (
int c = 0;
c <
n; ++
c)
588 for (
int j = 0; j < 4; ++j) {
589 for (
int i = 0; i < 4; ++i) {
591 for (
int c = 0;
c <
n; ++
c)
592 result[
c] += w * val[j*4+i][
c];
603 return (
int)floorf(x);
622 return x -
static_cast<float>(i);
627 #ifndef __CUDA_ARCH__
651 template <
typename T>
655 template <
typename T>
677 template <
typename T>
688 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
689 __builtin_sincosf(x, sine, cosine);
690 #elif defined(__CUDA_ARCH__)
700 sincos (
double x,
double* sine,
double* cosine)
702 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
703 __builtin_sincos(x, sine, cosine);
704 #elif defined(__CUDA_ARCH__)
717 return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
736 template <
typename IN_TYPE,
typename OUT_TYPE>
740 static_assert(
sizeof(IN_TYPE) ==
sizeof(OUT_TYPE),
741 "bit_cast must be between objects of the same size");
743 memcpy ((
void *)&out, &in,
sizeof(IN_TYPE));
747 #if defined(__INTEL_COMPILER)
755 return static_cast<uint32_t
>(_castf32_u32(
val));
758 return static_cast<int32_t
>(_castf32_u32(
val));
761 return _castu32_f32(
val);
764 return _castu32_f32(
val);
767 return static_cast<uint64_t
>(_castf64_u64(
val));
770 return static_cast<int64_t
>(_castf64_u64(
val));
773 return _castu64_f64(
val);
776 return _castu64_f64(
val);
797 for (
char *
c = (
char *) f;
len--;
c +=
sizeof(
T)) {
798 if (
sizeof(
T) == 2) {
800 }
else if (
sizeof(
T) == 4) {
803 }
else if (
sizeof(
T) == 8) {
827 template<
typename S,
typename D,
typename F>
833 s += (s < 0 ? (F)-0.5 : (F)0.5);
834 return (D)
clamp(s, min, max);
836 return (D)
clamp((F)src * scale + (F)0.5, min, max);
851 template<
typename S,
typename D>
856 memcpy (dst, src, n*
sizeof(D));
868 for ( ; n >= 16; n -= 16) {
869 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
870 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
871 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
872 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
873 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
874 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
875 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
876 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
877 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
878 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
879 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
880 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
881 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
882 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
883 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
884 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
887 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
892 for ( ; n >= 16; n -= 16) {
893 *dst++ = (D)((*src++) *
scale);
894 *dst++ = (D)((*src++) *
scale);
895 *dst++ = (D)((*src++) *
scale);
896 *dst++ = (D)((*src++) *
scale);
897 *dst++ = (D)((*src++) *
scale);
898 *dst++ = (D)((*src++) *
scale);
899 *dst++ = (D)((*src++) *
scale);
900 *dst++ = (D)((*src++) *
scale);
901 *dst++ = (D)((*src++) *
scale);
902 *dst++ = (D)((*src++) *
scale);
903 *dst++ = (D)((*src++) *
scale);
904 *dst++ = (D)((*src++) *
scale);
905 *dst++ = (D)((*src++) *
scale);
906 *dst++ = (D)((*src++) *
scale);
907 *dst++ = (D)((*src++) *
scale);
908 *dst++ = (D)((*src++) *
scale);
911 *dst++ = (D)((*src++) *
scale);
916 #ifndef __CUDA_ARCH__
919 float *
dst,
size_t n,
924 for ( ;
n >= 4;
n -= 4,
src += 4,
dst += 4) {
930 *
dst++ = (*
src++) * scale;
937 float *
dst,
size_t n,
942 for ( ;
n >= 4;
n -= 4,
src += 4,
dst += 4) {
948 *
dst++ = (*
src++) * scale;
954 inline void convert_type<half,float> (
const half *
src,
955 float *
dst,
size_t n,
958 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
960 for ( ;
n >= 8;
n -= 8,
src += 8,
dst += 8) {
966 for ( ;
n >= 4;
n -= 4,
src += 4,
dst += 4) {
981 uint16_t , uint16_t )
988 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
995 *dst++ = scaled_conversion<float,uint16_t,float> (*src++,
scale,
min,
max);
1009 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1016 *dst++ = scaled_conversion<float,uint8_t,float> (*src++,
scale,
min,
max);
1023 convert_type<float,half> (
const float *
src,
half *
dst,
size_t n,
1026 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1028 for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1034 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1047 template<
typename S,
typename D>
1050 convert_type<S,D> (
src,
dst,
n,
1063 template<
typename S,
typename D>
1083 return (D)((F)src * scale);
1100 template<
unsigned int FROM_BITS,
unsigned int TO_BITS>
1102 unsigned int out = 0;
1103 int shift = TO_BITS - FROM_BITS;
1104 for (; shift > 0; shift -= FROM_BITS)
1106 out |= in >> -shift;
1116 unsigned int out = 0;
1117 int shift = TO_BITS - FROM_BITS;
1118 for (; shift > 0; shift -= FROM_BITS)
1120 out |= in >> -shift;
1130 template<
typename T>
1137 "bitstring_add_n_bits must be unsigned int 8/16/32");
1138 const int Tbits =
sizeof(
T) * 8;
1146 val &= ~(0xffffffff <<
n);
1152 int bits_left_in_out = Tbits - filled;
1155 if (n <= bits_left_in_out) {
1156 b = val << (bits_left_in_out -
n);
1159 b = val >> (n - bits_left_in_out);
1160 nb = bits_left_in_out;
1166 if (filled == Tbits) {
1178 template<
typename T>
1185 "bit_pack must be unsigned int 8/16/32");
1186 unsigned char* outbuffer = (
unsigned char*)out;
1188 for (
size_t i = 0, e = data.
size(); i < e; ++i)
1198 template<
typename T>
1205 "bit_unpack must be unsigned int 8/16/32");
1211 for (
int i = 0; i <
n; ++i) {
1214 while (valbits < inbits) {
1218 int out_left = inbits - valbits;
1219 int in_left = 8 -
b;
1220 if (in_left <= out_left) {
1225 val |= in[B] & ~(0xffffffff << in_left);
1234 int extra = 8 -
b - out_left;
1235 val |= (in[B] >> extra) & ~(0xffffffff << out_left);
1249 template<
typename I,
typename E>
1252 E
operator= (E newval) { m_data = convert_type<E,I>(newval);
return newval; }
1253 operator E ()
const {
return convert_type<I,E>(m_data); }
1264 template<
typename I,
typename E>
1267 operator E ()
const {
return convert_type<E,I>(*m_data); }
1277 template<
typename I,
typename E>
1281 E
operator[] (
int i)
const {
return convert_type<I,E>(m_data[i]); }
1284 I *
get ()
const {
return m_data; }
1286 m_data += i;
return *
this;
1297 template<
typename I,
typename E>
1301 E
operator[] (
int i)
const {
return convert_type<I,E>(m_data[i]); }
1303 const I *
get ()
const {
return m_data; }
1305 m_data += i;
return *
this;
1315 template <
class T=
float>
1322 void init () noexcept {
1323 float scale = 1.0f / 255.0f;
1326 for (
int i = 0; i < 256; ++i)
1344 }
else if ((
int)(1.0/f) == (1.0/f)) {
1350 while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1370 num = (f >= 0) ? (
int)n : -(
int)n;
1394 template <
typename T>
1400 template <
typename T>
1407 template <
typename T>
1415 template <
typename T>
1417 if (x <=
T(-1))
return T(
M_PI);
1418 if (x >=
T(+1))
return T(0);
1424 template <
typename T>
1429 return std::log2(x);
1433 template <
typename T>
1442 template <
typename T>
1451 template <
typename T>
1458 template <
typename T>
1460 if (y ==
T(0))
return T(1);
1461 if (x ==
T(0))
return T(0);
1463 if ((x <
T(0)) && (y !=
floor(y)))
return T(0);
1468 return clamp (r, -big, big);
1486 int N =
static_cast<int>(a/
b);
1493 #define OIIO_FMATH_HAS_SAFE_FMOD 1
1539 #if OIIO_SIMD_SSE >= 4
1544 return static_cast<int>(x +
copysignf(0.5
f, x));
1548 #ifndef __CUDA_ARCH__
1556 #ifndef __CUDA_ARCH__
1562 float qf = float(q);
1563 x =
madd(qf, -0.78515625
f*4, x);
1564 x =
madd(qf, -0.00024187564849853515625
f*4, x);
1565 x =
madd(qf, -3.7747668102383613586e-08
f*4, x);
1566 x =
madd(qf, -1.2816720341285448015e-12
f*4, x);
1569 if ((q & 1) != 0) x = -x;
1572 float u = 2.6083159809786593541503e-06
f;
1573 u =
madd(u, s, -0.0001981069071916863322258
f);
1574 u =
madd(u, s, +0.00833307858556509017944336
f);
1575 u =
madd(u, s, -0.166666597127914428710938
f);
1576 u =
madd(s, u * x, x);
1588 #ifndef __CUDA_ARCH__
1591 float qf = float(q);
1592 x =
madd(qf, -0.78515625
f*4, x);
1593 x =
madd(qf, -0.00024187564849853515625
f*4, x);
1594 x =
madd(qf, -3.7747668102383613586e-08
f*4, x);
1595 x =
madd(qf, -1.2816720341285448015e-12
f*4, x);
1600 float u = -2.71811842367242206819355e-07
f;
1601 u =
madd(u, s, +2.47990446951007470488548e-05
f);
1602 u =
madd(u, s, -0.00138888787478208541870117
f);
1603 u =
madd(u, s, +0.0416666641831398010253906
f);
1604 u =
madd(u, s, -0.5
f);
1605 u =
madd(u, s, +1.0
f);
1606 if ((q & 1) != 0) u = -u;
1615 #ifndef __CUDA_ARCH__
1618 float qf = float(q);
1619 x =
madd(qf, -0.78515625
f*4, x);
1620 x =
madd(qf, -0.00024187564849853515625
f*4, x);
1621 x =
madd(qf, -3.7747668102383613586e-08
f*4, x);
1622 x =
madd(qf, -1.2816720341285448015e-12
f*4, x);
1626 if ((q & 1) != 0) x = -x;
1627 float su = 2.6083159809786593541503e-06
f;
1628 su =
madd(su, s, -0.0001981069071916863322258
f);
1629 su =
madd(su, s, +0.00833307858556509017944336
f);
1630 su =
madd(su, s, -0.166666597127914428710938
f);
1631 su =
madd(s, su * x, x);
1632 float cu = -2.71811842367242206819355e-07
f;
1633 cu =
madd(cu, s, +2.47990446951007470488548e-05
f);
1634 cu =
madd(cu, s, -0.00138888787478208541870117
f);
1635 cu =
madd(cu, s, +0.0416666641831398010253906
f);
1636 cu =
madd(cu, s, -0.5
f);
1637 cu =
madd(cu, s, +1.0
f);
1638 if ((q & 1) != 0) cu = -cu;
1639 *sine =
clamp(su, -1.0
f, 1.0
f);;
1640 *cosine =
clamp(cu, -1.0
f, 1.0
f);;
1642 __sincosf(x, sine, cosine);
1649 #ifndef __CUDA_ARCH__
1654 float qf = float(q);
1655 x =
madd(qf, -0.78515625
f*2, x);
1656 x =
madd(qf, -0.00024187564849853515625
f*2, x);
1657 x =
madd(qf, -3.7747668102383613586e-08
f*2, x);
1658 x =
madd(qf, -1.2816720341285448015e-12
f*2, x);
1662 float u = 0.00927245803177356719970703f;
1663 u =
madd(u, s, 0.00331984995864331722259521
f);
1664 u =
madd(u, s, 0.0242998078465461730957031
f);
1665 u =
madd(u, s, 0.0534495301544666290283203
f);
1666 u =
madd(u, s, 0.133383005857467651367188
f);
1667 u =
madd(u, s, 0.333331853151321411132812
f);
1668 u =
madd(s, u * x, x);
1682 #ifndef __CUDA_ARCH__
1684 const float z = x - ((x + 25165824.0f) - 25165824.0
f);
1685 const float y = z - z * fabsf(z);
1686 const float Q = 3.10396624f;
1687 const float P = 3.584135056f;
1688 return y * (Q + P * fabsf(y));
1720 #ifndef __CUDA_ARCH__
1728 #ifndef __CUDA_ARCH__
1729 const float f = fabsf(x);
1730 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
1735 const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1736 return x < 0 ? float(
M_PI) - a :
a;
1743 #ifndef __CUDA_ARCH__
1746 const float f = fabsf(x);
1747 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
1748 const float a = float(
M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1756 #ifndef __CUDA_ARCH__
1757 const float a = fabsf(x);
1758 const float k = a > 1.0f ? 1 / a :
a;
1759 const float s = 1.0f - (1.0f - k);
1760 const float t = s *
s;
1766 float r = s *
madd(0.430165678
f, t, 1.0
f) /
madd(
madd(0.0579354987
f, t, 0.763007998
f), t, 1.0
f);
1767 if (a > 1.0
f) r = 1.570796326794896557998982f -
r;
1775 #ifndef __CUDA_ARCH__
1779 const float a = fabsf(x);
1780 const float b = fabsf(y);
1781 bool b_is_greater_than_a = b >
a;
1783 #if OIIO_FMATH_SIMD_FRIENDLY
1788 float sa = b_is_greater_than_a ? b :
a;
1789 float sb = b_is_greater_than_a ? a :
b;
1790 const float k = (b == 0) ? 0.0
f : sb/sa;
1792 const float k = (b == 0) ? 0.0
f : ((a == b) ? 1.0f : (b > a ? a / b : b /
a));
1795 const float s = 1.0f - (1.0f - k);
1796 const float t = s *
s;
1798 float r = s *
madd(0.430165678
f, t, 1.0
f) /
madd(
madd(0.0579354987
f, t, 0.763007998
f), t, 1.0
f);
1800 if (b_is_greater_than_a)
1801 r = 1.570796326794896557998982f -
r;
1803 if (bit_cast<float, unsigned>(x) & 0x80000000u)
1804 r = float(
M_PI) -
r;
1812 template<
typename T>
1814 using namespace simd;
1815 typedef typename T::int_t intN;
1819 intN exponent =
srl (bits, 23) - intN(127);
1823 T hi =
madd(
f,
T(-0.00931049621349
f),
T( 0.05206469089414
f));
1824 T lo =
madd(
f,
T( 0.47868480909345
f),
T(-0.72116591947498
f));
1825 hi =
madd(
f, hi,
T(-0.13753123777116
f));
1826 hi =
madd(
f, hi,
T( 0.24187369696082
f));
1827 hi =
madd(
f, hi,
T(-0.34730547155299
f));
1828 lo =
madd(
f, lo,
T( 1.442689881667200
f));
1829 return ((f4 * hi) + (
f * lo)) +
T(exponent);
1835 #ifndef __CUDA_ARCH__
1839 unsigned bits =
bit_cast<float,
unsigned>(
x);
1840 int exponent =
int(bits >> 23) - 127;
1841 float f =
bit_cast<unsigned,
float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1849 float hi =
madd(f, -0.00931049621349f, 0.05206469089414f);
1850 float lo =
madd(f, 0.47868480909345f, -0.72116591947498f);
1851 hi =
madd(f, hi, -0.13753123777116f);
1852 hi =
madd(f, hi, 0.24187369696082f);
1853 hi =
madd(f, hi, -0.34730547155299f);
1854 lo =
madd(f, lo, 1.442689881667200f);
1855 return ((f4 * hi) + (f * lo)) + exponent;
1857 return __log2f(xval);
1863 template<
typename T>
1869 #ifdef __CUDA_ARCH__
1878 template<
typename T>
1884 #ifdef __CUDA_ARCH__
1893 #ifndef __CUDA_ARCH__
1898 unsigned bits =
bit_cast<float,
unsigned>(
x);
1899 return float (
int(bits >> 23) - 127);
1906 #ifndef __CUDA_ARCH__
1907 if (fabsf(x) < 0.01
f) {
1908 float y = 1.0f - (1.0f -
x);
1920 template<
typename T>
1922 using namespace simd;
1923 typedef typename T::int_t intN;
1927 intN
m (
x);
x -=
T(m);
1929 x = one - (one -
x);
1930 const T kA (1.33336498402e-3
f);
1931 const T kB (9.810352697968e-3
f);
1932 const T kC (5.551834031939e-2
f);
1933 const T kD (0.2401793301105
f);
1934 const T kE (0.693144857883
f);
1940 r =
madd(
x, r, one);
1944 for (
int i = 0; i < r.elements; ++i)
1946 for (
int i = r.elements; i < r.paddedelements; ++i)
1955 #if OIIO_NON_INTEL_CLANG && OIIO_FMATH_SIMD_FRIENDLY
1958 return std::exp2(xval);
1959 #elif !defined(__CUDA_ARCH__)
1961 float x =
clamp (xval, -126.0
f, 126.0
f);
1963 int m =
int(x); x -=
m;
1964 x = 1.0f - (1.0f -
x);
1970 float r = 1.33336498402e-3
f;
1971 r =
madd(x, r, 9.810352697968e-3
f);
1972 r =
madd(x, r, 5.551834031939e-2
f);
1973 r =
madd(x, r, 0.2401793301105
f);
1974 r =
madd(x, r, 0.693144857883
f);
1975 r =
madd(x, r, 1.0
f);
1978 return bit_cast<unsigned,
float>(
bit_cast<float,
unsigned>(
r) + (
unsigned(m) << 23));
1991 template <
typename T>
1997 #ifdef __CUDA_ARCH__
2009 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2012 return static_cast<float>(std::exp(static_cast<double>(x)));
2020 #ifndef __CUDA_ARCH__
2029 #ifndef __CUDA_ARCH__
2030 if (fabsf(x) < 0.03
f) {
2031 float y = 1.0f - (1.0f -
x);
2041 #ifndef __CUDA_ARCH__
2048 a = 1.0f - (1.0f -
a);
2052 float r = 2.03945513931e-4
f;
2053 r =
madd(r, a2, 8.32990277558e-3
f);
2054 r =
madd(r, a2, 0.1666673421859
f);
2055 r =
madd(r * a, a2, a);
2064 #ifndef __CUDA_ARCH__
2067 return 0.5f * e + 0.5f / e;
2074 #ifndef __CUDA_ARCH__
2085 if (y == 0)
return 1.0f;
2086 if (x == 0)
return 0.0f;
2091 #ifndef __CUDA_ARCH__
2102 if (ybits >= 0x4b800000) {
2104 }
else if (ybits >= 0x3f800000) {
2106 int k = (ybits >> 23) - 127;
2107 int j = ybits >> (23 - k);
2108 if ((j << (23 - k)) == ybits)
2109 sign =
bit_cast<
int,
float>(0x3f800000 | (j << 31));
2121 template<
typename T,
typename U>
2129 #ifndef __CUDA_ARCH__
2130 float x0 = fabsf(x);
2135 a = 0.333333333f * (2.0f * a + x0 / (a *
a));
2136 a = 0.333333333f * (2.0f * a + x0 / (a *
a));
2137 a = (x0 == 0) ? 0 : a;
2147 #ifndef __CUDA_ARCH__
2150 const float a1 = 0.0705230784f;
2151 const float a2 = 0.0422820123f;
2152 const float a3 = 0.0092705272f;
2153 const float a4 = 0.0001520143f;
2154 const float a5 = 0.0002765672f;
2155 const float a6 = 0.0000430638f;
2156 const float a = fabsf(x);
2157 const float b = 1.0f - (1.0f -
a);
2158 const float r =
madd(
madd(
madd(
madd(
madd(
madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0
f);
2159 const float s = r *
r;
2160 const float t = s *
s;
2161 const float u = t *
t;
2162 const float v = u * u;
2171 #ifndef __CUDA_ARCH__
2185 float a = fabsf(x);
if (a > 0.99999994
f) a = 0.99999994f;
2189 p = 2.81022636e-08
f;
2190 p =
madd(
p, w, 3.43273939e-07
f);
2191 p =
madd(
p, w, -3.5233877e-06
f );
2192 p =
madd(
p, w, -4.39150654e-06
f);
2193 p =
madd(
p, w, 0.00021858087
f );
2194 p =
madd(
p, w, -0.00125372503
f );
2195 p =
madd(
p, w, -0.00417768164
f );
2196 p =
madd(
p, w, 0.246640727
f );
2199 w = sqrtf(w) - 3.0f;
2200 p = -0.000200214257f;
2201 p =
madd(
p, w, 0.000100950558
f);
2202 p =
madd(
p, w, 0.00134934322
f );
2203 p =
madd(
p, w, -0.00367342844
f );
2204 p =
madd(
p, w, 0.00573950773
f );
2205 p =
madd(
p, w, -0.0076224613
f );
2206 p =
madd(
p, w, 0.00943887047
f );
2237 int maxiters=32, T eps=1.0e-6,
bool *brack=0)
2245 bool increasing = (v0 <
v1);
2246 T vmin = increasing ? v0 :
v1;
2247 T vmax = increasing ?
v1 :
v0;
2248 bool bracketed = (y >= vmin && y <= vmax);
2254 return ((y < vmin) == increasing) ? xmin : xmax;
2256 if (fabs(v0-
v1) < eps)
2258 int rfiters = (3*maxiters)/4;
2259 for (
int iters = 0; iters < maxiters; ++iters) {
2261 if (iters < rfiters) {
2264 if (t <=
T(0) || t >=
T(1))
2269 x =
lerp (xmin, xmax, t);
2271 if ((v < y) == increasing) {
2276 if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2290 #ifndef __CUDA_ARCH__
2291 DASSERT_MSG (y.
size() >= 2,
"interpolate_linear needs at least 2 knot values (%zd)", y.
size());
2293 x =
clamp (x,
float(0.0),
float(1.0));
2294 int nsegs =
int(y.
size()) - 1;
2297 #ifndef __CUDA_ARCH__
2298 int nextseg =
std::min (segnum+1, nsegs);
2300 int nextseg =
min (segnum+1, nsegs);
2302 return lerp (y[segnum], y[nextseg], x);
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow(float x, float y)
SYS_API float acosf(float x)
vint4 max(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp(float x)
Faster float exp than is in libm, but still 100% accurate.
SYS_API double cos(double x)
GLboolean GLboolean GLboolean b
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos(const T &x, const U &y)
SYS_API float coshf(float x)
SYS_API double fmod(double x, double y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log(const T &x)
bool_constant< is_integral< T >::value &&!std::is_same< T, bool >::value &&!std::is_same< T, char >::value &&!std::is_same< T, wchar_t >::value > is_integer
EightBitConverter() noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2(float y, float x)
GLenum GLenum GLenum GLenum GLenum scale
SYS_API float log1pf(float x)
vfloat4 bitcast_to_float(const vint4 &x)
OIIO_HOSTDEVICE void evalBSplineWeights(T w[4], T fraction)
vint4 srl(const vint4 &val, const unsigned int bits)
void bit_unpack(int n, const unsigned char *in, int inbits, T *out)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin(T x)
Safe (clamping) arcsine: clamp to the valid domain.
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
const DataArrayProxy< I, E > & operator+=(int i)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub(float a, float b, float c)
Negative fused multiply and subtract: -(a*b) - c.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd(float a, float b, float c)
Fused negative multiply and add: -(a*b) + c.
vfloat4 sqrt(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod(float a, float b)
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
SYS_API float atan2f(float y, float x)
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat t1
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: (a*b - c)
OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in)
OIIO_HOSTDEVICE V round_to_multiple(V value, M multiple)
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int floor2(int x) noexcept
Integer 8-vector, accelerated by SIMD instructions when available.
std::integral_constant< bool, std::numeric_limits< T >::is_signed||std::is_same< T, int128_t >::value > is_signed
OIIO_FORCEINLINE OIIO_HOSTDEVICE int ifloor(float x)
Return floor(x) cast to an int.
OIIO_HOSTDEVICE int pow2roundup(int x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf(float x)
void convert_type< float, uint8_t >(const float *src, uint8_t *dst, size_t n, uint8_t, uint8_t)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos(float x)
constexpr size_type size() const noexcept
OIIO_HOSTDEVICE float floorfrac(float x, int *xint)
GLuint GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat GLfloat s1
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh(float x)
SYS_API float log10f(float x)
GLubyte GLubyte GLubyte GLubyte w
ImageBuf OIIO_API pow(const ImageBuf &A, cspan< float > B, ROI roi={}, int nthreads=0)
GLdouble GLdouble GLdouble GLdouble q
vfloat4 floor(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2(T x)
Safe log2: clamp to valid domain.
OIIO_HOSTDEVICE int pow2rounddown(int x)
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE T rotl(T x, int s) noexcept
void convert_type(const S *src, D *dst, size_t n, D _min, D _max)
SYS_API double asin(double x)
DataArrayProxy(I *data=NULL)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos(float x)
Integer 4-vector, accelerated by SIMD instructions when available.
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int ceil2(int x) noexcept
ImageBuf OIIO_API min(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf(float x)
GLfloat GLfloat GLfloat v2
SYS_API float atanf(float x)
OIIO_HOSTDEVICE void trilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, const T *v4, const T *v5, const T *v6, const T *v7, Q s, Q t, Q r, Q scale, int n, T *result)
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast(const IN_TYPE &in)
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T clamp(const T &a, const T &low, const T &high)
clamp a to bounds [low,high].
GLboolean GLboolean GLboolean GLboolean a
void bit_pack(cspan< T > data, void *out, int outbits)
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
SYS_API void sincosf(float x, float *s, float *c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh(float x)
OIIO_HOSTDEVICE void bilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, Q s, Q t, Q scale, int n, T *result)
GLdouble GLdouble GLdouble z
OIIO_HOSTDEVICE T invert(Func &func, T y, T xmin=0.0, T xmax=1.0, int maxiters=32, T eps=1.0e-6, bool *brack=0)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
T operator()(unsigned char c) const noexcept
void store(float *values) const
typedef int(WINAPI *PFNWGLRELEASEPBUFFERDCARBPROC)(HPBUFFERARB hPbuffer
OIIO_HOSTDEVICE T round_to_multiple_of_pow2(T x, T m)
constexpr size_type size() const noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10(T x)
Safe log10: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t rotl32(uint32_t x, int k)
GLfloat GLfloat GLfloat GLfloat v3
SYS_API double acos(double x)
ConstDataProxy(const I &data)
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool ispow2(T x) noexcept
OIIO_HOSTDEVICE float sign(float x)
vfloat4 round(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log(T x)
Safe log: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2(const T &xval)
OIIO_HOSTDEVICE float interpolate_linear(float x, span_strided< const float > y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow(T x, T y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos(float x, float *sine, float *cosine)
ConstDataArrayProxy(const I *data=NULL)
Integer 16-vector, accelerated by SIMD instructions when available.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T lerp(const T &v0, const T &v1, const Q &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T bilerp(const T &v0, const T &v1, const T &v2, const T &v3, const Q &s, const Q &t)
void store(int *values) const
Store the values into memory.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint(float x)
SYS_API float sinhf(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt(T x)
Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1(float x)
SYS_API float copysignf(float x, float y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float(int x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_neg(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt(float x)
OIIO_HOSTDEVICE void float_to_rational(float f, unsigned int &num, unsigned int &den)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_mod(T a, T b)
void convert_type< uint8_t, float >(const uint8_t *src, float *dst, size_t n, float, float)
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
GA_API const UT_StringHolder N
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos(T x)
Safe (clamping) arccosine: clamp to the valid domain.
GLsizei const GLfloat * value
SYS_API float asinf(float x)
SYS_API float expm1f(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb(T x)
Safe logb: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p(float x)
OIIO_HOSTDEVICE D scaled_conversion(const S &src, F scale, F min, F max)
E operator[](int i) const
void convert_type< uint16_t, float >(const uint16_t *src, float *dst, size_t n, float, float)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T trilerp(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
OIIO_HOSTDEVICE uint32_t clamped_mult32(uint32_t a, uint32_t b)
#define OIIO_NAMESPACE_END
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t rotl64(uint64_t x, int k)
const ConstDataArrayProxy< I, E > & operator+=(int i)
vint4 min(const vint4 &a, const vint4 &b)
OIIO_HOSTDEVICE void swap_endian(T *f, int len=1)
void bitstring_add_n_bits(T *&out, int &filled, uint32_t val, int n)
Classes for SIMD processing.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin(float x)
vint4 rint(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians(T deg)
Convert degrees to radians.
void convert_type< float, uint16_t >(const float *src, uint16_t *dst, size_t n, uint16_t, uint16_t)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc(float x)
SYS_API float tanhf(float x)
OIIO_HOSTDEVICE void bicubic_interp(const T **val, T s, T t, int n, T *result)
OIIO_HOSTDEVICE uint64_t clamped_mult64(uint64_t a, uint64_t b)
SYS_API double sin(double x)
E operator[](int i) const
OIIO_HOSTDEVICE void evalBSplineWeightDerivs(T dw[4], T fraction)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt(T x)
Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2(const T &xval)
#define OIIO_NAMESPACE_BEGIN
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees(T rad)
Convert radians to degrees.