00001 /////////////////////////////////////////////////////////////////////////// 00002 // 00003 // Copyright (c) 2002, Industrial Light & Magic, a division of Lucas 00004 // Digital Ltd. LLC 00005 // 00006 // All rights reserved. 00007 // 00008 // Redistribution and use in source and binary forms, with or without 00009 // modification, are permitted provided that the following conditions are 00010 // met: 00011 // * Redistributions of source code must retain the above copyright 00012 // notice, this list of conditions and the following disclaimer. 00013 // * Redistributions in binary form must reproduce the above 00014 // copyright notice, this list of conditions and the following disclaimer 00015 // in the documentation and/or other materials provided with the 00016 // distribution. 00017 // * Neither the name of Industrial Light & Magic nor the names of 00018 // its contributors may be used to endorse or promote products derived 00019 // from this software without specific prior written permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00022 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00023 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 00024 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 00025 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00026 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00027 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00028 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00029 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00030 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00031 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00032 // 00033 /////////////////////////////////////////////////////////////////////////// 00034 00035 // Primary authors: 00036 // Florian Kainz <kainz@ilm.com> 00037 // Rod Bogart <rgb@ilm.com> 00038 00039 //--------------------------------------------------------------------------- 00040 // 00041 // fpreal16 -- a 16-bit floating point number class: 00042 // 00043 // Type fpreal16 can represent positive and negative numbers, whose 00044 // magnitude is between roughly 6.1e-5 and 6.5e+4, with a relative 00045 // error of 9.8e-4; numbers smaller than 6.1e-5 can be represented 00046 // with an absolute error of 6.0e-8. All integers from -2048 to 00047 // +2048 can be represented exactly. 00048 // 00049 // Type fpreal16 behaves (almost) like the built-in C++ floating point 00050 // types. In arithmetic expressions, fpreal16, float and double can be 00051 // mixed freely. Here are a few examples: 00052 // 00053 // fpreal16 a (3.5); 00054 // float b (a + sqrt (a)); 00055 // a += b; 00056 // b += a; 00057 // b = a + 7; 00058 // 00059 // Conversions from fpreal16 to float are lossless; all fpreal16 numbers 00060 // are exactly representable as floats. 00061 // 00062 // Conversions from float to fpreal16 may not preserve the float's 00063 // value exactly. If a float is not representable as a fpreal16, the 00064 // float value is rounded to the nearest representable fpreal16. If 00065 // a float value is exactly in the middle between the two closest 00066 // representable fpreal16 values, then the float value is rounded to 00067 // the fpreal16 with the greater magnitude. 00068 // 00069 // Overflows during float-to-fpreal16 conversions cause arithmetic 00070 // exceptions. An overflow occurs when the float value to be 00071 // converted is too large to be represented as a fpreal16, or if the 00072 // float value is an infinity or a NAN. 00073 // 00074 // The implementation of type fpreal16 makes the following assumptions 00075 // about the implementation of the built-in C++ types: 00076 // 00077 // float is an IEEE 754 single-precision number 00078 // sizeof (float) == 4 00079 // sizeof (unsigned int) == sizeof (float) 00080 // alignof (unsigned int) == alignof (float) 00081 // sizeof (unsigned short) == 2 00082 // 00083 //--------------------------------------------------------------------------- 00084 00085 #ifndef _H_REAL16_H_ 00086 #define _H_REAL16_H_ 00087 00088 #include "SYS_API.h" 00089 #ifndef __SYS_Types__ 00090 #error This file must be included from SYS_Types.h 00091 #endif 00092 00093 #include <iostream.h> 00094 00095 class SYS_API fpreal16 00096 { 00097 public: 00098 00099 //------------- 00100 // Constructors 00101 //------------- 00102 00103 fpreal16 (); // no initialization 00104 fpreal16 (const fpreal16 &h); 00105 fpreal16 (fpreal32 f); 00106 00107 00108 //-------------------- 00109 // Conversion to float 00110 //-------------------- 00111 00112 operator fpreal32 () const; 00113 00114 00115 //------------ 00116 // Unary minus 00117 //------------ 00118 00119 fpreal16 operator - () const; 00120 00121 00122 //----------- 00123 // Assignment 00124 //----------- 00125 00126 fpreal16 operator = (fpreal16 h); 00127 fpreal16 operator = (fpreal32 f); 00128 00129 fpreal16 operator += (fpreal16 h); 00130 fpreal16 operator += (fpreal32 f); 00131 00132 fpreal16 operator -= (fpreal16 h); 00133 fpreal16 operator -= (fpreal32 f); 00134 00135 fpreal16 operator *= (fpreal16 h); 00136 fpreal16 operator *= (fpreal32 f); 00137 00138 fpreal16 operator /= (fpreal16 h); 00139 fpreal16 operator /= (fpreal32 f); 00140 00141 00142 //--------------------------------------------------------- 00143 // Round to n-bit precision (n should be between 0 and 10). 00144 // After rounding, the significand's 10-n least significant 00145 // bits will be zero. 00146 //--------------------------------------------------------- 00147 00148 fpreal16 round (unsigned int n) const; 00149 00150 00151 //-------------------------------------------------------------------- 00152 // Classification: 00153 // 00154 // h.isFinite() returns true if h is a normalized number, 00155 // a denormalized number or zero 00156 // 00157 // h.isNormalized() returns true if h is a normalized number 00158 // 00159 // h.isDenormalized() returns true if h is a denormalized number 00160 // 00161 // h.isZero() returns true if h is zero 00162 // 00163 // h.isNan() returns true if h is a NAN 00164 // 00165 // h.isInfinity() returns true if h is a positive 00166 // or a negative infinity 00167 // 00168 // h.isNegative() returns true if the sign bit of h 00169 // is set (negative) 00170 //-------------------------------------------------------------------- 00171 00172 bool isFinite () const; 00173 bool isNormalized () const; 00174 bool isDenormalized () const; 00175 bool isZero () const; 00176 bool isNan () const; 00177 bool isInfinity () const; 00178 bool isNegative () const; 00179 00180 00181 //-------------------------------------------- 00182 // Special values 00183 // 00184 // posInf() returns +infinity 00185 // 00186 // negInf() returns +infinity 00187 // 00188 // qNan() returns a NAN with the bit 00189 // pattern 0111111111111111 00190 // 00191 // sNan() returns a NAN with the bit 00192 // pattern 0111110111111111 00193 //-------------------------------------------- 00194 00195 static fpreal16 posInf (); 00196 static fpreal16 negInf (); 00197 static fpreal16 qNan (); 00198 static fpreal16 sNan (); 00199 00200 00201 //-------------------------------------- 00202 // Access to the internal representation 00203 //-------------------------------------- 00204 00205 unsigned short bits () const; 00206 void setBits (unsigned short bits); 00207 00208 00209 public: 00210 00211 union uif 00212 { 00213 uint32 i; 00214 fpreal32 f; 00215 }; 00216 00217 private: 00218 00219 static int16 convert (int i); 00220 static fpreal32 overflow (); 00221 00222 unsigned short _h; 00223 00224 protected: 00225 static bool selftest (); 00226 static const uif *_toFloat; 00227 static const unsigned short *_eLut; 00228 static bool _itWorks; 00229 }; 00230 00231 00232 //----------- 00233 // Stream I/O 00234 //----------- 00235 00236 ostream & operator << (ostream &os, fpreal16 h); 00237 istream & operator >> (istream &is, fpreal16 &h); 00238 00239 00240 //---------- 00241 // Debugging 00242 //---------- 00243 00244 void SYSprintBits (ostream &os, fpreal16 h); 00245 void SYSprintBits (ostream &os, fpreal32 f); 00246 void SYSprintBits (char c[19], fpreal16 h); 00247 void SYSprintBits (char c[35], fpreal32 f); 00248 00249 00250 //------- 00251 // Limits 00252 //------- 00253 00254 #define H_REAL16_MIN 5.96046448e-08 // Smallest +ve fpreal16 00255 #define H_REAL16_NRM_MIN 6.10351562e-05 // Smallest +ve normalized fpreal16 00256 00257 #define H_REAL16_MAX 65504.0 // Largest positive fpreal16 00258 00259 // Smallest positive e for which fpreal16 (1.0 + e) != fpreal16 (1.0) 00260 #define H_REAL16_EPSILON 0.00097656 00261 00262 #define H_REAL16_MANT_DIG 11 // Number of digits in mantissa 00263 // (significand + hidden leading 1) 00264 00265 #define H_REAL16_DIG 2 // Number of base 10 digits that 00266 // can be represented without change 00267 00268 #define H_REAL16_RADIX 2 // Base of the exponent 00269 00270 #define H_REAL16_MIN_EXP -13 // Minimum negative integer such that 00271 // H_REAL16_RADIX raised to the power of 00272 // one less than that integer is a 00273 // normalized fpreal16 00274 00275 #define H_REAL16_MAX_EXP 16 // Maximum positive integer such that 00276 // H_REAL16_RADIX raised to the power of 00277 // one less than that integer is a 00278 // normalized fpreal16 00279 00280 #define H_REAL16_MIN_10_EXP -4 // Minimum positive integer such 00281 // that 10 raised to that power is 00282 // a normalized fpreal16 00283 00284 #define H_REAL16_MAX_10_EXP 4 // Maximum positive integer such 00285 // that 10 raised to that power is 00286 // a normalized fpreal16 00287 00288 00289 //--------------------------------------------------------------------------- 00290 // 00291 // Implementation -- 00292 // 00293 // Representation of a float: 00294 // 00295 // We assume that a float, f, is an IEEE 754 single-precision 00296 // floating point number, whose bits are arranged as follows: 00297 // 00298 // 31 (msb) 00299 // | 00300 // | 30 23 00301 // | | | 00302 // | | | 22 0 (lsb) 00303 // | | | | | 00304 // X XXXXXXXX XXXXXXXXXXXXXXXXXXXXXXX 00305 // 00306 // s e m 00307 // 00308 // S is the sign-bit, e is the exponent and m is the significand. 00309 // 00310 // If e is between 1 and 254, f is a normalized number: 00311 // 00312 // s e-127 00313 // f = (-1) * 2 * 1.m 00314 // 00315 // If e is 0, and m is not zero, f is a denormalized number: 00316 // 00317 // s -126 00318 // f = (-1) * 2 * 0.m 00319 // 00320 // If e and m are both zero, f is zero: 00321 // 00322 // f = 0.0 00323 // 00324 // If e is 255, f is an "infinity" or "not a number" (NAN), 00325 // depending on whether m is zero or not. 00326 // 00327 // Examples: 00328 // 00329 // 0 00000000 00000000000000000000000 = 0.0 00330 // 0 01111110 00000000000000000000000 = 0.5 00331 // 0 01111111 00000000000000000000000 = 1.0 00332 // 0 10000000 00000000000000000000000 = 2.0 00333 // 0 10000000 10000000000000000000000 = 3.0 00334 // 1 10000101 11110000010000000000000 = -124.0625 00335 // 0 11111111 00000000000000000000000 = +infinity 00336 // 1 11111111 00000000000000000000000 = -infinity 00337 // 0 11111111 10000000000000000000000 = NAN 00338 // 1 11111111 11111111111111111111111 = NAN 00339 // 00340 // Representation of a fpreal16: 00341 // 00342 // Here is the bit-layout for a fpreal16 number, h: 00343 // 00344 // 15 (msb) 00345 // | 00346 // | 14 10 00347 // | | | 00348 // | | | 9 0 (lsb) 00349 // | | | | | 00350 // X XXXXX XXXXXXXXXX 00351 // 00352 // s e m 00353 // 00354 // S is the sign-bit, e is the exponent and m is the significand. 00355 // 00356 // If e is between 1 and 30, h is a normalized number: 00357 // 00358 // s e-15 00359 // h = (-1) * 2 * 1.m 00360 // 00361 // If e is 0, and m is not zero, h is a denormalized number: 00362 // 00363 // S -14 00364 // h = (-1) * 2 * 0.m 00365 // 00366 // If e and m are both zero, h is zero: 00367 // 00368 // h = 0.0 00369 // 00370 // If e is 31, h is an "infinity" or "not a number" (NAN), 00371 // depending on whether m is zero or not. 00372 // 00373 // Examples: 00374 // 00375 // 0 00000 0000000000 = 0.0 00376 // 0 01110 0000000000 = 0.5 00377 // 0 01111 0000000000 = 1.0 00378 // 0 10000 0000000000 = 2.0 00379 // 0 10000 1000000000 = 3.0 00380 // 1 10101 1111000001 = -124.0625 00381 // 0 11111 0000000000 = +infinity 00382 // 1 11111 0000000000 = -infinity 00383 // 0 11111 1000000000 = NAN 00384 // 1 11111 1111111111 = NAN 00385 // 00386 // Conversion: 00387 // 00388 // Converting from a float to a fpreal16 requires some non-trivial bit 00389 // manipulations. In some cases, this makes conversion relatively 00390 // slow, but the most common case is accelerated via table lookups. 00391 // 00392 // Converting back from a fpreal16 to a float is easier because we don't 00393 // have to do any rounding. In addition, there are only 65536 00394 // different fpreal16 numbers; we can convert each of those numbers once 00395 // and store the results in a table. Later, all conversions can be 00396 // done using only simple table lookups. 00397 // 00398 //--------------------------------------------------------------------------- 00399 00400 00401 //-------------------- 00402 // Simple constructors 00403 //-------------------- 00404 00405 inline 00406 fpreal16::fpreal16 () 00407 { 00408 // no initialization 00409 } 00410 00411 00412 inline 00413 fpreal16::fpreal16 (const fpreal16 &h) 00414 { 00415 _h = h._h; 00416 } 00417 00418 00419 //---------------------------- 00420 // fpreal16-from-float constructor 00421 //---------------------------- 00422 00423 inline 00424 fpreal16::fpreal16(fpreal32 f) 00425 { 00426 if (f == 0) 00427 { 00428 // 00429 // Common special case - zero. 00430 // For speed, we don't preserve the zero's sign. 00431 // 00432 00433 _h = 0; 00434 } 00435 else 00436 { 00437 // 00438 // We extract the combined sign and exponent, e, from our 00439 // floating-point number, f. Then we convert e to the sign 00440 // and exponent of the fpreal16 number via a table lookup. 00441 // 00442 // For the most common case, where a normalized fpreal16 is produced, 00443 // the table lookup returns a non-zero value; in this case, all 00444 // we have to do, is round f's significand to 10 bits and combine 00445 // the result with e. 00446 // 00447 // For all other cases (overflow, zeroes, denormalized numbers 00448 // resulting from underflow, infinities and NANs), the table 00449 // lookup returns zero, and we call a longer, non-inline function 00450 // to do the float-to-fpreal16 conversion. 00451 // 00452 00453 uif x; 00454 00455 x.f = f; 00456 00457 register int e = (x.i >> 23) & 0x000001ff; 00458 00459 e = _eLut[e]; 00460 00461 if (e) 00462 { 00463 // 00464 // Simple case - round the significand and 00465 // combine it with the sign and exponent. 00466 // 00467 00468 _h = e + (((x.i & 0x007fffff) + 0x00001000) >> 13); 00469 } 00470 else 00471 { 00472 // 00473 // Difficult case - call a function. 00474 // 00475 00476 _h = convert (x.i); 00477 } 00478 } 00479 } 00480 00481 00482 //------------------------------------------ 00483 // fpreal16-to-float conversion via table lookup 00484 //------------------------------------------ 00485 00486 inline 00487 fpreal16::operator fpreal32 () const 00488 { 00489 return _toFloat[_h].f; 00490 } 00491 00492 //------------------------- 00493 // Round to n-bit precision 00494 //------------------------- 00495 00496 inline fpreal16 00497 fpreal16::round (unsigned int n) const 00498 { 00499 // 00500 // Parameter check. 00501 // 00502 00503 if (n >= 10) 00504 return *this; 00505 00506 // 00507 // Disassemble h into the sign, s, 00508 // and the combined exponent and significand, e. 00509 // 00510 00511 unsigned short s = _h & 0x8000; 00512 unsigned short e = _h & 0x7fff; 00513 00514 // 00515 // Round the exponent and significand to the nearest value 00516 // where ones occur only in the (10-n) most significant bits. 00517 // Note that the exponent adjusts automatically if rounding 00518 // up causes the significand to overflow. 00519 // 00520 00521 e >>= 9 - n; 00522 e += e & 1; 00523 e <<= 9 - n; 00524 00525 // 00526 // Check for exponent overflow. 00527 // 00528 00529 if (e >= 0x7c00) 00530 { 00531 // 00532 // Overflow occurred -- truncate instead of rounding. 00533 // 00534 00535 e = _h; 00536 e >>= 10 - n; 00537 e <<= 10 - n; 00538 } 00539 00540 // 00541 // Put the original sign bit back. 00542 // 00543 00544 fpreal16 h; 00545 h._h = s | e; 00546 00547 return h; 00548 } 00549 00550 00551 //----------------------- 00552 // Other inline functions 00553 //----------------------- 00554 00555 inline fpreal16 00556 fpreal16::operator - () const 00557 { 00558 fpreal16 h; 00559 h._h = _h ^ 0x8000; 00560 return h; 00561 } 00562 00563 00564 inline fpreal16 00565 fpreal16::operator = (fpreal16 h) 00566 { 00567 _h = h._h; 00568 return *this; 00569 } 00570 00571 00572 inline fpreal16 00573 fpreal16::operator = (fpreal32 f) 00574 { 00575 *this = fpreal16 (f); 00576 return *this; 00577 } 00578 00579 00580 inline fpreal16 00581 fpreal16::operator += (fpreal16 h) 00582 { 00583 *this = fpreal16 (fpreal32(*this) + fpreal32(h)); 00584 return *this; 00585 } 00586 00587 00588 inline fpreal16 00589 fpreal16::operator += (fpreal32 f) 00590 { 00591 *this = fpreal16 (fpreal32(*this) + f); 00592 return *this; 00593 } 00594 00595 00596 inline fpreal16 00597 fpreal16::operator -= (fpreal16 h) 00598 { 00599 *this = fpreal16 (fpreal32(*this) - fpreal32(h)); 00600 return *this; 00601 } 00602 00603 00604 inline fpreal16 00605 fpreal16::operator -= (fpreal32 f) 00606 { 00607 *this = fpreal16 (fpreal32 (*this) - f); 00608 return *this; 00609 } 00610 00611 00612 inline fpreal16 00613 fpreal16::operator *= (fpreal16 h) 00614 { 00615 *this = fpreal16 (fpreal32 (*this) * fpreal32 (h)); 00616 return *this; 00617 } 00618 00619 00620 inline fpreal16 00621 fpreal16::operator *= (fpreal32 f) 00622 { 00623 *this = fpreal16 (fpreal32 (*this) * f); 00624 return *this; 00625 } 00626 00627 00628 inline fpreal16 00629 fpreal16::operator /= (fpreal16 h) 00630 { 00631 *this = fpreal16 (fpreal32 (*this) / fpreal32 (h)); 00632 return *this; 00633 } 00634 00635 00636 inline fpreal16 00637 fpreal16::operator /= (fpreal32 f) 00638 { 00639 *this = fpreal16 (fpreal32 (*this) / f); 00640 return *this; 00641 } 00642 00643 00644 inline bool 00645 fpreal16::isFinite () const 00646 { 00647 unsigned short e = (_h >> 10) & 0x001f; 00648 return e < 31; 00649 } 00650 00651 00652 inline bool 00653 fpreal16::isNormalized () const 00654 { 00655 unsigned short e = (_h >> 10) & 0x001f; 00656 return e > 0 && e < 31; 00657 } 00658 00659 00660 inline bool 00661 fpreal16::isDenormalized () const 00662 { 00663 unsigned short e = (_h >> 10) & 0x001f; 00664 unsigned short m = _h & 0x3ff; 00665 return e == 0 && m != 0; 00666 } 00667 00668 00669 inline bool 00670 fpreal16::isZero () const 00671 { 00672 return (_h & 0x7fff) == 0; 00673 } 00674 00675 00676 inline bool 00677 fpreal16::isNan () const 00678 { 00679 unsigned short e = (_h >> 10) & 0x001f; 00680 unsigned short m = _h & 0x3ff; 00681 return e == 31 && m != 0; 00682 } 00683 00684 00685 inline bool 00686 fpreal16::isInfinity () const 00687 { 00688 unsigned short e = (_h >> 10) & 0x001f; 00689 unsigned short m = _h & 0x3ff; 00690 return e == 31 && m == 0; 00691 } 00692 00693 00694 inline bool 00695 fpreal16::isNegative () const 00696 { 00697 return (_h & 0x8000) != 0; 00698 } 00699 00700 00701 inline fpreal16 00702 fpreal16::posInf () 00703 { 00704 fpreal16 h; 00705 h._h = 0x7c00; 00706 return h; 00707 } 00708 00709 00710 inline fpreal16 00711 fpreal16::negInf () 00712 { 00713 fpreal16 h; 00714 h._h = 0xfc00; 00715 return h; 00716 } 00717 00718 00719 inline fpreal16 00720 fpreal16::qNan () 00721 { 00722 fpreal16 h; 00723 h._h = 0x7fff; 00724 return h; 00725 } 00726 00727 00728 inline fpreal16 00729 fpreal16::sNan () 00730 { 00731 fpreal16 h; 00732 h._h = 0x7dff; 00733 return h; 00734 } 00735 00736 00737 inline unsigned short 00738 fpreal16::bits () const 00739 { 00740 return _h; 00741 } 00742 00743 00744 inline void 00745 fpreal16::setBits (unsigned short b) 00746 { 00747 _h = b; 00748 } 00749 00750 #endif
1.5.9