1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "fpu/softfloat.h" 87 88 /* We only need stdlib for abort() */ 89 90 /*---------------------------------------------------------------------------- 91 | Primitive arithmetic functions, including multi-word arithmetic, and 92 | division and square root approximations. (Can be specialized to target if 93 | desired.) 94 *----------------------------------------------------------------------------*/ 95 #include "softfloat-macros.h" 96 97 /*---------------------------------------------------------------------------- 98 | Functions and definitions to determine: (1) whether tininess for underflow 99 | is detected before or after rounding by default, (2) what (if anything) 100 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 101 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 102 | are propagated from function inputs to output. These details are target- 103 | specific. 
104 *----------------------------------------------------------------------------*/ 105 #include "softfloat-specialize.h" 106 107 /*---------------------------------------------------------------------------- 108 | Returns the fraction bits of the half-precision floating-point value `a'. 109 *----------------------------------------------------------------------------*/ 110 111 static inline uint32_t extractFloat16Frac(float16 a) 112 { 113 return float16_val(a) & 0x3ff; 114 } 115 116 /*---------------------------------------------------------------------------- 117 | Returns the exponent bits of the half-precision floating-point value `a'. 118 *----------------------------------------------------------------------------*/ 119 120 static inline int extractFloat16Exp(float16 a) 121 { 122 return (float16_val(a) >> 10) & 0x1f; 123 } 124 125 /*---------------------------------------------------------------------------- 126 | Returns the sign bit of the single-precision floating-point value `a'. 127 *----------------------------------------------------------------------------*/ 128 129 static inline flag extractFloat16Sign(float16 a) 130 { 131 return float16_val(a)>>15; 132 } 133 134 /*---------------------------------------------------------------------------- 135 | Returns the fraction bits of the single-precision floating-point value `a'. 136 *----------------------------------------------------------------------------*/ 137 138 static inline uint32_t extractFloat32Frac(float32 a) 139 { 140 return float32_val(a) & 0x007FFFFF; 141 } 142 143 /*---------------------------------------------------------------------------- 144 | Returns the exponent bits of the single-precision floating-point value `a'. 
145 *----------------------------------------------------------------------------*/ 146 147 static inline int extractFloat32Exp(float32 a) 148 { 149 return (float32_val(a) >> 23) & 0xFF; 150 } 151 152 /*---------------------------------------------------------------------------- 153 | Returns the sign bit of the single-precision floating-point value `a'. 154 *----------------------------------------------------------------------------*/ 155 156 static inline flag extractFloat32Sign(float32 a) 157 { 158 return float32_val(a) >> 31; 159 } 160 161 /*---------------------------------------------------------------------------- 162 | Returns the fraction bits of the double-precision floating-point value `a'. 163 *----------------------------------------------------------------------------*/ 164 165 static inline uint64_t extractFloat64Frac(float64 a) 166 { 167 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 168 } 169 170 /*---------------------------------------------------------------------------- 171 | Returns the exponent bits of the double-precision floating-point value `a'. 172 *----------------------------------------------------------------------------*/ 173 174 static inline int extractFloat64Exp(float64 a) 175 { 176 return (float64_val(a) >> 52) & 0x7FF; 177 } 178 179 /*---------------------------------------------------------------------------- 180 | Returns the sign bit of the double-precision floating-point value `a'. 181 *----------------------------------------------------------------------------*/ 182 183 static inline flag extractFloat64Sign(float64 a) 184 { 185 return float64_val(a) >> 63; 186 } 187 188 /* 189 * Classify a floating point number. Everything above float_class_qnan 190 * is a NaN so cls >= float_class_qnan is any NaN. 
 */

typedef enum __attribute__ ((__packed__)) {
    /* Enumerator order matters: NaN tests rely on cls >= float_class_qnan. */
    float_class_unclassified,
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
    float_class_dnan,
    float_class_msnan, /* maybe silenced */
} FloatClass;

/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */

typedef struct {
    uint64_t frac;      /* normalized 64 bit fraction */
    int32_t  exp;       /* unbiased exponent */
    FloatClass cls;     /* zero / normal / inf / one of the NaN classes */
    bool sign;          /* sign of the value */
} FloatParts;

/*
 * Position of the binary point inside FloatParts.frac: bit 62.
 * DECOMPOSED_IMPLICIT_BIT is that single bit (1 << 62) and
 * DECOMPOSED_OVERFLOW_BIT is the bit directly above it (1 << 63).
 */
#define DECOMPOSED_BINARY_POINT (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
227 * exp_size: the size of the exponent field 228 * exp_bias: the offset applied to the exponent field 229 * exp_max: the maximum normalised exponent 230 * frac_size: the size of the fraction field 231 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 232 * The following are computed based the size of fraction 233 * frac_lsb: least significant bit of fraction 234 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 235 * round_mask/roundeven_mask: masks used for rounding 236 */ 237 typedef struct { 238 int exp_size; 239 int exp_bias; 240 int exp_max; 241 int frac_size; 242 int frac_shift; 243 uint64_t frac_lsb; 244 uint64_t frac_lsbm1; 245 uint64_t round_mask; 246 uint64_t roundeven_mask; 247 } FloatFmt; 248 249 /* Expand fields based on the size of exponent and fraction */ 250 #define FLOAT_PARAMS(E, F) \ 251 .exp_size = E, \ 252 .exp_bias = ((1 << E) - 1) >> 1, \ 253 .exp_max = (1 << E) - 1, \ 254 .frac_size = F, \ 255 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 256 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 257 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 258 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 259 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 260 261 static const FloatFmt float16_params = { 262 FLOAT_PARAMS(5, 10) 263 }; 264 265 static const FloatFmt float32_params = { 266 FLOAT_PARAMS(8, 23) 267 }; 268 269 static const FloatFmt float64_params = { 270 FLOAT_PARAMS(11, 52) 271 }; 272 273 /*---------------------------------------------------------------------------- 274 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 275 | and 7, and returns the properly rounded 32-bit integer corresponding to the 276 | input. If `zSign' is 1, the input is negated before being converted to an 277 | integer. Bit 63 of `absZ' must be zero. 
Ordinarily, the fixed-point input 278 | is simply rounded to an integer, with the inexact exception raised if the 279 | input cannot be represented exactly as an integer. However, if the fixed- 280 | point input is too large, the invalid exception is raised and the largest 281 | positive or negative integer is returned. 282 *----------------------------------------------------------------------------*/ 283 284 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 285 { 286 int8_t roundingMode; 287 flag roundNearestEven; 288 int8_t roundIncrement, roundBits; 289 int32_t z; 290 291 roundingMode = status->float_rounding_mode; 292 roundNearestEven = ( roundingMode == float_round_nearest_even ); 293 switch (roundingMode) { 294 case float_round_nearest_even: 295 case float_round_ties_away: 296 roundIncrement = 0x40; 297 break; 298 case float_round_to_zero: 299 roundIncrement = 0; 300 break; 301 case float_round_up: 302 roundIncrement = zSign ? 0 : 0x7f; 303 break; 304 case float_round_down: 305 roundIncrement = zSign ? 0x7f : 0; 306 break; 307 default: 308 abort(); 309 } 310 roundBits = absZ & 0x7F; 311 absZ = ( absZ + roundIncrement )>>7; 312 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 313 z = absZ; 314 if ( zSign ) z = - z; 315 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 316 float_raise(float_flag_invalid, status); 317 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 318 } 319 if (roundBits) { 320 status->float_exception_flags |= float_flag_inexact; 321 } 322 return z; 323 324 } 325 326 /*---------------------------------------------------------------------------- 327 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 328 | `absZ1', with binary point between bits 63 and 64 (between the input words), 329 | and returns the properly rounded 64-bit integer corresponding to the input. 330 | If `zSign' is 1, the input is negated before being converted to an integer. 
331 | Ordinarily, the fixed-point input is simply rounded to an integer, with 332 | the inexact exception raised if the input cannot be represented exactly as 333 | an integer. However, if the fixed-point input is too large, the invalid 334 | exception is raised and the largest positive or negative integer is 335 | returned. 336 *----------------------------------------------------------------------------*/ 337 338 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 339 float_status *status) 340 { 341 int8_t roundingMode; 342 flag roundNearestEven, increment; 343 int64_t z; 344 345 roundingMode = status->float_rounding_mode; 346 roundNearestEven = ( roundingMode == float_round_nearest_even ); 347 switch (roundingMode) { 348 case float_round_nearest_even: 349 case float_round_ties_away: 350 increment = ((int64_t) absZ1 < 0); 351 break; 352 case float_round_to_zero: 353 increment = 0; 354 break; 355 case float_round_up: 356 increment = !zSign && absZ1; 357 break; 358 case float_round_down: 359 increment = zSign && absZ1; 360 break; 361 default: 362 abort(); 363 } 364 if ( increment ) { 365 ++absZ0; 366 if ( absZ0 == 0 ) goto overflow; 367 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 368 } 369 z = absZ0; 370 if ( zSign ) z = - z; 371 if ( z && ( ( z < 0 ) ^ zSign ) ) { 372 overflow: 373 float_raise(float_flag_invalid, status); 374 return 375 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 376 : LIT64( 0x7FFFFFFFFFFFFFFF ); 377 } 378 if (absZ1) { 379 status->float_exception_flags |= float_flag_inexact; 380 } 381 return z; 382 383 } 384 385 /*---------------------------------------------------------------------------- 386 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 387 | `absZ1', with binary point between bits 63 and 64 (between the input words), 388 | and returns the properly rounded 64-bit unsigned integer corresponding to the 389 | input. 
Ordinarily, the fixed-point input is simply rounded to an integer, 390 | with the inexact exception raised if the input cannot be represented exactly 391 | as an integer. However, if the fixed-point input is too large, the invalid 392 | exception is raised and the largest unsigned integer is returned. 393 *----------------------------------------------------------------------------*/ 394 395 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 396 uint64_t absZ1, float_status *status) 397 { 398 int8_t roundingMode; 399 flag roundNearestEven, increment; 400 401 roundingMode = status->float_rounding_mode; 402 roundNearestEven = (roundingMode == float_round_nearest_even); 403 switch (roundingMode) { 404 case float_round_nearest_even: 405 case float_round_ties_away: 406 increment = ((int64_t)absZ1 < 0); 407 break; 408 case float_round_to_zero: 409 increment = 0; 410 break; 411 case float_round_up: 412 increment = !zSign && absZ1; 413 break; 414 case float_round_down: 415 increment = zSign && absZ1; 416 break; 417 default: 418 abort(); 419 } 420 if (increment) { 421 ++absZ0; 422 if (absZ0 == 0) { 423 float_raise(float_flag_invalid, status); 424 return LIT64(0xFFFFFFFFFFFFFFFF); 425 } 426 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 427 } 428 429 if (zSign && absZ0) { 430 float_raise(float_flag_invalid, status); 431 return 0; 432 } 433 434 if (absZ1) { 435 status->float_exception_flags |= float_flag_inexact; 436 } 437 return absZ0; 438 } 439 440 /*---------------------------------------------------------------------------- 441 | If `a' is denormal and we are in flush-to-zero mode then set the 442 | input-denormal exception and return zero. Otherwise just return the value. 
443 *----------------------------------------------------------------------------*/ 444 float32 float32_squash_input_denormal(float32 a, float_status *status) 445 { 446 if (status->flush_inputs_to_zero) { 447 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 448 float_raise(float_flag_input_denormal, status); 449 return make_float32(float32_val(a) & 0x80000000); 450 } 451 } 452 return a; 453 } 454 455 /*---------------------------------------------------------------------------- 456 | Normalizes the subnormal single-precision floating-point value represented 457 | by the denormalized significand `aSig'. The normalized exponent and 458 | significand are stored at the locations pointed to by `zExpPtr' and 459 | `zSigPtr', respectively. 460 *----------------------------------------------------------------------------*/ 461 462 static void 463 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 464 { 465 int8_t shiftCount; 466 467 shiftCount = countLeadingZeros32( aSig ) - 8; 468 *zSigPtr = aSig<<shiftCount; 469 *zExpPtr = 1 - shiftCount; 470 471 } 472 473 /*---------------------------------------------------------------------------- 474 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 475 | single-precision floating-point value, returning the result. After being 476 | shifted into the proper positions, the three fields are simply added 477 | together to form the result. This means that any integer portion of `zSig' 478 | will be added into the exponent. Since a properly normalized significand 479 | will have an integer portion equal to 1, the `zExp' input should be 1 less 480 | than the desired result exponent whenever `zSig' is a complete, normalized 481 | significand. 
482 *----------------------------------------------------------------------------*/ 483 484 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 485 { 486 487 return make_float32( 488 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 489 490 } 491 492 /*---------------------------------------------------------------------------- 493 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 494 | and significand `zSig', and returns the proper single-precision floating- 495 | point value corresponding to the abstract input. Ordinarily, the abstract 496 | value is simply rounded and packed into the single-precision format, with 497 | the inexact exception raised if the abstract input cannot be represented 498 | exactly. However, if the abstract value is too large, the overflow and 499 | inexact exceptions are raised and an infinity or maximal finite value is 500 | returned. If the abstract value is too small, the input value is rounded to 501 | a subnormal number, and the underflow and inexact exceptions are raised if 502 | the abstract input cannot be represented exactly as a subnormal single- 503 | precision floating-point number. 504 | The input significand `zSig' has its binary point between bits 30 505 | and 29, which is 7 bits to the left of the usual location. This shifted 506 | significand must be normalized or smaller. If `zSig' is not normalized, 507 | `zExp' must be 0; in that case, the result returned is a subnormal number, 508 | and it must not require rounding. In the usual case that `zSig' is 509 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 510 | The handling of underflow and overflow follows the IEC/IEEE Standard for 511 | Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the 7 extra low bits before they are dropped. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        /* Any other rounding mode is not handled here. */
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /*
     * The cast makes a negative zExp compare as a large unsigned value, so
     * this one test selects both the large-exponent and the negative-exponent
     * cases handled below.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * packFloat32 adds its fields: a significand of 0 yields the
             * exponent-0xFF encoding, while the all-ones value (used when
             * roundIncrement is 0) borrows one from it.
             */
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess must be decided before zSig is shifted below. */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the low bits, so re-read the round bits. */
            roundBits = zSig & 0x7F;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie (round bits == 0x40) in nearest-even mode: clear bit 0. */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp', 581 | and significand `zSig', and returns the proper single-precision floating- 582 | point value corresponding to the abstract input. This routine is just like 583 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 584 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 585 | floating-point exponent. 586 *----------------------------------------------------------------------------*/ 587 588 static float32 589 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 590 float_status *status) 591 { 592 int8_t shiftCount; 593 594 shiftCount = countLeadingZeros32( zSig ) - 1; 595 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 596 status); 597 598 } 599 600 /*---------------------------------------------------------------------------- 601 | If `a' is denormal and we are in flush-to-zero mode then set the 602 | input-denormal exception and return zero. Otherwise just return the value. 603 *----------------------------------------------------------------------------*/ 604 float64 float64_squash_input_denormal(float64 a, float_status *status) 605 { 606 if (status->flush_inputs_to_zero) { 607 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 608 float_raise(float_flag_input_denormal, status); 609 return make_float64(float64_val(a) & (1ULL << 63)); 610 } 611 } 612 return a; 613 } 614 615 /*---------------------------------------------------------------------------- 616 | Normalizes the subnormal double-precision floating-point value represented 617 | by the denormalized significand `aSig'. The normalized exponent and 618 | significand are stored at the locations pointed to by `zExpPtr' and 619 | `zSigPtr', respectively. 
620 *----------------------------------------------------------------------------*/ 621 622 static void 623 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 624 { 625 int8_t shiftCount; 626 627 shiftCount = countLeadingZeros64( aSig ) - 11; 628 *zSigPtr = aSig<<shiftCount; 629 *zExpPtr = 1 - shiftCount; 630 631 } 632 633 /*---------------------------------------------------------------------------- 634 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 635 | double-precision floating-point value, returning the result. After being 636 | shifted into the proper positions, the three fields are simply added 637 | together to form the result. This means that any integer portion of `zSig' 638 | will be added into the exponent. Since a properly normalized significand 639 | will have an integer portion equal to 1, the `zExp' input should be 1 less 640 | than the desired result exponent whenever `zSig' is a complete, normalized 641 | significand. 642 *----------------------------------------------------------------------------*/ 643 644 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 645 { 646 647 return make_float64( 648 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 649 650 } 651 652 /*---------------------------------------------------------------------------- 653 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 654 | and significand `zSig', and returns the proper double-precision floating- 655 | point value corresponding to the abstract input. Ordinarily, the abstract 656 | value is simply rounded and packed into the double-precision format, with 657 | the inexact exception raised if the abstract input cannot be represented 658 | exactly. However, if the abstract value is too large, the overflow and 659 | inexact exceptions are raised and an infinity or maximal finite value is 660 | returned. 
| If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
| The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location. This shifted
| significand must be normalized or smaller. If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding. In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Pick the value added to the 10 extra low bits before they are dropped. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /*
         * Bit 10 becomes bit 0 of the result after the final shift.  If it
         * is already set nothing is added; otherwise 0x3ff is added so that
         * any nonzero round bits carry into it.
         */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        /* Any other rounding mode is not handled here. */
        abort();
    }
    roundBits = zSig & 0x3FF;
    /*
     * The cast makes a negative zExp compare as a large unsigned value, so
     * this one test selects both the large-exponent and the negative-exponent
     * cases handled below.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * packFloat64 adds its fields: a significand of 0 yields the
             * exponent-0x7FF encoding, while the all-ones value (used when
             * not overflowing to infinity) borrows one from it.
             */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess must be decided before zSig is shifted below. */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            /* The shift changed the low bits, so re-read the round bits. */
            roundBits = zSig & 0x3FF;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie (round bits == 0x200) in nearest-even mode: clear bit 0. */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input. This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
757 *----------------------------------------------------------------------------*/ 758 759 static float64 760 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 761 float_status *status) 762 { 763 int8_t shiftCount; 764 765 shiftCount = countLeadingZeros64( zSig ) - 1; 766 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 767 status); 768 769 } 770 771 /*---------------------------------------------------------------------------- 772 | Returns the fraction bits of the extended double-precision floating-point 773 | value `a'. 774 *----------------------------------------------------------------------------*/ 775 776 static inline uint64_t extractFloatx80Frac( floatx80 a ) 777 { 778 779 return a.low; 780 781 } 782 783 /*---------------------------------------------------------------------------- 784 | Returns the exponent bits of the extended double-precision floating-point 785 | value `a'. 786 *----------------------------------------------------------------------------*/ 787 788 static inline int32_t extractFloatx80Exp( floatx80 a ) 789 { 790 791 return a.high & 0x7FFF; 792 793 } 794 795 /*---------------------------------------------------------------------------- 796 | Returns the sign bit of the extended double-precision floating-point value 797 | `a'. 798 *----------------------------------------------------------------------------*/ 799 800 static inline flag extractFloatx80Sign( floatx80 a ) 801 { 802 803 return a.high>>15; 804 805 } 806 807 /*---------------------------------------------------------------------------- 808 | Normalizes the subnormal extended double-precision floating-point value 809 | represented by the denormalized significand `aSig'. The normalized exponent 810 | and significand are stored at the locations pointed to by `zExpPtr' and 811 | `zSigPtr', respectively. 
812 *----------------------------------------------------------------------------*/ 813 814 static void 815 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 816 { 817 int8_t shiftCount; 818 819 shiftCount = countLeadingZeros64( aSig ); 820 *zSigPtr = aSig<<shiftCount; 821 *zExpPtr = 1 - shiftCount; 822 823 } 824 825 /*---------------------------------------------------------------------------- 826 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 827 | extended double-precision floating-point value, returning the result. 828 *----------------------------------------------------------------------------*/ 829 830 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 831 { 832 floatx80 z; 833 834 z.low = zSig; 835 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 836 return z; 837 838 } 839 840 /*---------------------------------------------------------------------------- 841 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 842 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 843 | and returns the proper extended double-precision floating-point value 844 | corresponding to the abstract input. Ordinarily, the abstract value is 845 | rounded and packed into the extended double-precision format, with the 846 | inexact exception raised if the abstract input cannot be represented 847 | exactly. However, if the abstract value is too large, the overflow and 848 | inexact exceptions are raised and an infinity or maximal finite value is 849 | returned. If the abstract value is too small, the input value is rounded to 850 | a subnormal number, and the underflow and inexact exceptions are raised if 851 | the abstract input cannot be represented exactly as a subnormal extended 852 | double-precision floating-point number. 
| If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively. Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
| The input significand must be normalized or smaller. If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding. The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
| The rounding mode is read from `status'. Only nearest-even, ties-away,
| to-zero, up and down are handled; any other mode reaches abort().
*----------------------------------------------------------------------------*/

static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Anything other than 32 or 64 is treated as full 80-bit precision. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Low 11 bits of zSig0 are discarded; 0x400 is half of that range. */
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        /* Low 40 bits of zSig0 are discarded; the increment is half of that. */
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Fold any nonzero bits of zSig1 into the lowest bit of zSig0 (sticky). */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Keep the half-way increment chosen above. */
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * The unsigned comparison is true when zExp >= 0x7FFE or zExp <= 0
     * (zExp - 1 wraps to a large unsigned value in the latter case).
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Second test: adding the increment carries out of bit 63. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits after the denormalizing shift. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Bit 63 set after rounding: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            /* Exact tie under nearest-even: also clear the lowest kept bit. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped past bit 63: renormalize and bump the exponent. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    /* Exact tie under nearest-even: also clear the lowest kept bit. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the result significand. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the top bit of zSig1 is set. */
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same range test as above: true when zExp >= 0x7FFE or zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            roundMask = 0;
            /*
             * Also entered by goto from the reduced-precision path, in which
             * case roundMask still holds that precision's mask and
             * ~roundMask is the significand with all kept bits set.
             */
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path above, this
             * branch does not consult status->flush_to_zero -- confirm that
             * this is intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so the increment is recomputed. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /*
                 * (zSig1 << 1) == 0 with increment set under nearest-even
                 * means an exact tie: clear the lowest bit of the result.
                 */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped: renormalize and bump the exponent. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie under nearest-even: clear the lowest bit. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input. This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                              flag zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;

    /* An all-zero high word is handled as a whole 64-bit shift up front. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift the leading set bit of zSig0 up to bit 63. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a' (the `low' member).
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
1095 *----------------------------------------------------------------------------*/ 1096 1097 static inline uint64_t extractFloat128Frac0( float128 a ) 1098 { 1099 1100 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1101 1102 } 1103 1104 /*---------------------------------------------------------------------------- 1105 | Returns the exponent bits of the quadruple-precision floating-point value 1106 | `a'. 1107 *----------------------------------------------------------------------------*/ 1108 1109 static inline int32_t extractFloat128Exp( float128 a ) 1110 { 1111 1112 return ( a.high>>48 ) & 0x7FFF; 1113 1114 } 1115 1116 /*---------------------------------------------------------------------------- 1117 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1118 *----------------------------------------------------------------------------*/ 1119 1120 static inline flag extractFloat128Sign( float128 a ) 1121 { 1122 1123 return a.high>>63; 1124 1125 } 1126 1127 /*---------------------------------------------------------------------------- 1128 | Normalizes the subnormal quadruple-precision floating-point value 1129 | represented by the denormalized significand formed by the concatenation of 1130 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1131 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1132 | significand are stored at the location pointed to by `zSig0Ptr', and the 1133 | least significant 64 bits of the normalized significand are stored at the 1134 | location pointed to by `zSig1Ptr'. 
1135 *----------------------------------------------------------------------------*/ 1136 1137 static void 1138 normalizeFloat128Subnormal( 1139 uint64_t aSig0, 1140 uint64_t aSig1, 1141 int32_t *zExpPtr, 1142 uint64_t *zSig0Ptr, 1143 uint64_t *zSig1Ptr 1144 ) 1145 { 1146 int8_t shiftCount; 1147 1148 if ( aSig0 == 0 ) { 1149 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1150 if ( shiftCount < 0 ) { 1151 *zSig0Ptr = aSig1>>( - shiftCount ); 1152 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1153 } 1154 else { 1155 *zSig0Ptr = aSig1<<shiftCount; 1156 *zSig1Ptr = 0; 1157 } 1158 *zExpPtr = - shiftCount - 63; 1159 } 1160 else { 1161 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1162 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1163 *zExpPtr = 1 - shiftCount; 1164 } 1165 1166 } 1167 1168 /*---------------------------------------------------------------------------- 1169 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1170 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1171 | floating-point value, returning the result. After being shifted into the 1172 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1173 | added together to form the most significant 32 bits of the result. This 1174 | means that any integer portion of `zSig0' will be added into the exponent. 1175 | Since a properly normalized significand will have an integer portion equal 1176 | to 1, the `zExp' input should be 1 less than the desired result exponent 1177 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1178 | significand. 
1179 *----------------------------------------------------------------------------*/ 1180 1181 static inline float128 1182 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1183 { 1184 float128 z; 1185 1186 z.low = zSig1; 1187 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1188 return z; 1189 1190 } 1191 1192 /*---------------------------------------------------------------------------- 1193 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1194 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1195 | and `zSig2', and returns the proper quadruple-precision floating-point value 1196 | corresponding to the abstract input. Ordinarily, the abstract value is 1197 | simply rounded and packed into the quadruple-precision format, with the 1198 | inexact exception raised if the abstract input cannot be represented 1199 | exactly. However, if the abstract value is too large, the overflow and 1200 | inexact exceptions are raised and an infinity or maximal finite value is 1201 | returned. If the abstract value is too small, the input value is rounded to 1202 | a subnormal number, and the underflow and inexact exceptions are raised if 1203 | the abstract input cannot be represented exactly as a subnormal quadruple- 1204 | precision floating-point number. 1205 | The input significand must be normalized or smaller. If the input 1206 | significand is not normalized, `zExp' must be 0; in that case, the result 1207 | returned is a subnormal number, and it must not require rounding. In the 1208 | usual case that the input significand is normalized, `zExp' must be 1 less 1209 | than the ``true'' floating-point exponent. The handling of underflow and 1210 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the bits below the result significand. Decide whether the
     * 128-bit significand zSig0:zSig1 must be incremented by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the top bit of zSig2 is set. */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /* Inexact and lowest kept bit clear: increment to make it odd. */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /*
     * The unsigned comparison is true when zExp >= 0x7FFD or zExp < 0
     * (a negative zExp converts to a large unsigned value).
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /*
         * Overflow: exponent too large, or at 0x7FFD with an all-ones
         * significand that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* These modes return the packed value below instead of 0x7FFF. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* zSig1/zSig2 changed in the shift, so recompute the increment. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /*
         * zSig2 + zSig2 == 0 holds when zSig2 is 0 or exactly
         * 0x8000000000000000; combined with nearest-even the lowest result
         * bit is cleared (round half to even).
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input. This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
1336 *----------------------------------------------------------------------------*/ 1337 1338 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1339 uint64_t zSig0, uint64_t zSig1, 1340 float_status *status) 1341 { 1342 int8_t shiftCount; 1343 uint64_t zSig2; 1344 1345 if ( zSig0 == 0 ) { 1346 zSig0 = zSig1; 1347 zSig1 = 0; 1348 zExp -= 64; 1349 } 1350 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1351 if ( 0 <= shiftCount ) { 1352 zSig2 = 0; 1353 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1354 } 1355 else { 1356 shift128ExtraRightJamming( 1357 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1358 } 1359 zExp -= shiftCount; 1360 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1361 1362 } 1363 1364 /*---------------------------------------------------------------------------- 1365 | Returns the result of converting the 32-bit two's complement integer `a' 1366 | to the single-precision floating-point format. The conversion is performed 1367 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1368 *----------------------------------------------------------------------------*/ 1369 1370 float32 int32_to_float32(int32_t a, float_status *status) 1371 { 1372 flag zSign; 1373 1374 if ( a == 0 ) return float32_zero; 1375 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1376 zSign = ( a < 0 ); 1377 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1378 } 1379 1380 /*---------------------------------------------------------------------------- 1381 | Returns the result of converting the 32-bit two's complement integer `a' 1382 | to the double-precision floating-point format. The conversion is performed 1383 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1384 *----------------------------------------------------------------------------*/ 1385 1386 float64 int32_to_float64(int32_t a, float_status *status) 1387 { 1388 flag zSign; 1389 uint32_t absA; 1390 int8_t shiftCount; 1391 uint64_t zSig; 1392 1393 if ( a == 0 ) return float64_zero; 1394 zSign = ( a < 0 ); 1395 absA = zSign ? - a : a; 1396 shiftCount = countLeadingZeros32( absA ) + 21; 1397 zSig = absA; 1398 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1399 1400 } 1401 1402 /*---------------------------------------------------------------------------- 1403 | Returns the result of converting the 32-bit two's complement integer `a' 1404 | to the extended double-precision floating-point format. The conversion 1405 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1406 | Arithmetic. 1407 *----------------------------------------------------------------------------*/ 1408 1409 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1410 { 1411 flag zSign; 1412 uint32_t absA; 1413 int8_t shiftCount; 1414 uint64_t zSig; 1415 1416 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1417 zSign = ( a < 0 ); 1418 absA = zSign ? - a : a; 1419 shiftCount = countLeadingZeros32( absA ) + 32; 1420 zSig = absA; 1421 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1422 1423 } 1424 1425 /*---------------------------------------------------------------------------- 1426 | Returns the result of converting the 32-bit two's complement integer `a' to 1427 | the quadruple-precision floating-point format. The conversion is performed 1428 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1429 *----------------------------------------------------------------------------*/ 1430 1431 float128 int32_to_float128(int32_t a, float_status *status) 1432 { 1433 flag zSign; 1434 uint32_t absA; 1435 int8_t shiftCount; 1436 uint64_t zSig0; 1437 1438 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1439 zSign = ( a < 0 ); 1440 absA = zSign ? - a : a; 1441 shiftCount = countLeadingZeros32( absA ) + 17; 1442 zSig0 = absA; 1443 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1444 1445 } 1446 1447 /*---------------------------------------------------------------------------- 1448 | Returns the result of converting the 64-bit two's complement integer `a' 1449 | to the single-precision floating-point format. The conversion is performed 1450 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1451 *----------------------------------------------------------------------------*/ 1452 1453 float32 int64_to_float32(int64_t a, float_status *status) 1454 { 1455 flag zSign; 1456 uint64_t absA; 1457 int8_t shiftCount; 1458 1459 if ( a == 0 ) return float32_zero; 1460 zSign = ( a < 0 ); 1461 absA = zSign ? - a : a; 1462 shiftCount = countLeadingZeros64( absA ) - 40; 1463 if ( 0 <= shiftCount ) { 1464 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1465 } 1466 else { 1467 shiftCount += 7; 1468 if ( shiftCount < 0 ) { 1469 shift64RightJamming( absA, - shiftCount, &absA ); 1470 } 1471 else { 1472 absA <<= shiftCount; 1473 } 1474 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1475 } 1476 1477 } 1478 1479 /*---------------------------------------------------------------------------- 1480 | Returns the result of converting the 64-bit two's complement integer `a' 1481 | to the double-precision floating-point format. The conversion is performed 1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1483 *----------------------------------------------------------------------------*/ 1484 1485 float64 int64_to_float64(int64_t a, float_status *status) 1486 { 1487 flag zSign; 1488 1489 if ( a == 0 ) return float64_zero; 1490 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1491 return packFloat64( 1, 0x43E, 0 ); 1492 } 1493 zSign = ( a < 0 ); 1494 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1495 } 1496 1497 /*---------------------------------------------------------------------------- 1498 | Returns the result of converting the 64-bit two's complement integer `a' 1499 | to the extended double-precision floating-point format. The conversion 1500 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1501 | Arithmetic. 1502 *----------------------------------------------------------------------------*/ 1503 1504 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1505 { 1506 flag zSign; 1507 uint64_t absA; 1508 int8_t shiftCount; 1509 1510 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1511 zSign = ( a < 0 ); 1512 absA = zSign ? - a : a; 1513 shiftCount = countLeadingZeros64( absA ); 1514 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1515 1516 } 1517 1518 /*---------------------------------------------------------------------------- 1519 | Returns the result of converting the 64-bit two's complement integer `a' to 1520 | the quadruple-precision floating-point format. The conversion is performed 1521 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1522 *----------------------------------------------------------------------------*/ 1523 1524 float128 int64_to_float128(int64_t a, float_status *status) 1525 { 1526 flag zSign; 1527 uint64_t absA; 1528 int8_t shiftCount; 1529 int32_t zExp; 1530 uint64_t zSig0, zSig1; 1531 1532 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1533 zSign = ( a < 0 ); 1534 absA = zSign ? 
- a : a; 1535 shiftCount = countLeadingZeros64( absA ) + 49; 1536 zExp = 0x406E - shiftCount; 1537 if ( 64 <= shiftCount ) { 1538 zSig1 = 0; 1539 zSig0 = absA; 1540 shiftCount -= 64; 1541 } 1542 else { 1543 zSig1 = absA; 1544 zSig0 = 0; 1545 } 1546 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1547 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1548 1549 } 1550 1551 /*---------------------------------------------------------------------------- 1552 | Returns the result of converting the 64-bit unsigned integer `a' 1553 | to the single-precision floating-point format. The conversion is performed 1554 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1555 *----------------------------------------------------------------------------*/ 1556 1557 float32 uint64_to_float32(uint64_t a, float_status *status) 1558 { 1559 int shiftcount; 1560 1561 if (a == 0) { 1562 return float32_zero; 1563 } 1564 1565 /* Determine (left) shift needed to put first set bit into bit posn 23 1566 * (since packFloat32() expects the binary point between bits 23 and 22); 1567 * this is the fast case for smallish numbers. 1568 */ 1569 shiftcount = countLeadingZeros64(a) - 40; 1570 if (shiftcount >= 0) { 1571 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1572 } 1573 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1574 * expects the binary point between bits 30 and 29, hence the + 7. 1575 */ 1576 shiftcount += 7; 1577 if (shiftcount < 0) { 1578 shift64RightJamming(a, -shiftcount, &a); 1579 } else { 1580 a <<= shiftcount; 1581 } 1582 1583 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1584 } 1585 1586 /*---------------------------------------------------------------------------- 1587 | Returns the result of converting the 64-bit unsigned integer `a' 1588 | to the double-precision floating-point format. The conversion is performed 1589 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1590 *----------------------------------------------------------------------------*/ 1591 1592 float64 uint64_to_float64(uint64_t a, float_status *status) 1593 { 1594 int exp = 0x43C; 1595 int shiftcount; 1596 1597 if (a == 0) { 1598 return float64_zero; 1599 } 1600 1601 shiftcount = countLeadingZeros64(a) - 1; 1602 if (shiftcount < 0) { 1603 shift64RightJamming(a, -shiftcount, &a); 1604 } else { 1605 a <<= shiftcount; 1606 } 1607 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1608 } 1609 1610 /*---------------------------------------------------------------------------- 1611 | Returns the result of converting the 64-bit unsigned integer `a' 1612 | to the quadruple-precision floating-point format. The conversion is performed 1613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1614 *----------------------------------------------------------------------------*/ 1615 1616 float128 uint64_to_float128(uint64_t a, float_status *status) 1617 { 1618 if (a == 0) { 1619 return float128_zero; 1620 } 1621 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1622 } 1623 1624 /*---------------------------------------------------------------------------- 1625 | Returns the result of converting the single-precision floating-point value 1626 | `a' to the 32-bit two's complement integer format. The conversion is 1627 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1628 | Arithmetic---which means in particular that the conversion is rounded 1629 | according to the current rounding mode. If `a' is a NaN, the largest 1630 | positive integer is returned. Otherwise, if the conversion overflows, the 1631 | largest integer with the same sign as `a' is returned. 
1632 *----------------------------------------------------------------------------*/ 1633 1634 int32_t float32_to_int32(float32 a, float_status *status) 1635 { 1636 flag aSign; 1637 int aExp; 1638 int shiftCount; 1639 uint32_t aSig; 1640 uint64_t aSig64; 1641 1642 a = float32_squash_input_denormal(a, status); 1643 aSig = extractFloat32Frac( a ); 1644 aExp = extractFloat32Exp( a ); 1645 aSign = extractFloat32Sign( a ); 1646 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1647 if ( aExp ) aSig |= 0x00800000; 1648 shiftCount = 0xAF - aExp; 1649 aSig64 = aSig; 1650 aSig64 <<= 32; 1651 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1652 return roundAndPackInt32(aSign, aSig64, status); 1653 1654 } 1655 1656 /*---------------------------------------------------------------------------- 1657 | Returns the result of converting the single-precision floating-point value 1658 | `a' to the 32-bit two's complement integer format. The conversion is 1659 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1660 | Arithmetic, except that the conversion is always rounded toward zero. 1661 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1662 | the conversion overflows, the largest integer with the same sign as `a' is 1663 | returned. 1664 *----------------------------------------------------------------------------*/ 1665 1666 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 1667 { 1668 flag aSign; 1669 int aExp; 1670 int shiftCount; 1671 uint32_t aSig; 1672 int32_t z; 1673 a = float32_squash_input_denormal(a, status); 1674 1675 aSig = extractFloat32Frac( a ); 1676 aExp = extractFloat32Exp( a ); 1677 aSign = extractFloat32Sign( a ); 1678 shiftCount = aExp - 0x9E; 1679 if ( 0 <= shiftCount ) { 1680 if ( float32_val(a) != 0xCF000000 ) { 1681 float_raise(float_flag_invalid, status); 1682 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1683 } 1684 return (int32_t) 0x80000000; 1685 } 1686 else if ( aExp <= 0x7E ) { 1687 if (aExp | aSig) { 1688 status->float_exception_flags |= float_flag_inexact; 1689 } 1690 return 0; 1691 } 1692 aSig = ( aSig | 0x00800000 )<<8; 1693 z = aSig>>( - shiftCount ); 1694 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1695 status->float_exception_flags |= float_flag_inexact; 1696 } 1697 if ( aSign ) z = - z; 1698 return z; 1699 1700 } 1701 1702 /*---------------------------------------------------------------------------- 1703 | Returns the result of converting the single-precision floating-point value 1704 | `a' to the 16-bit two's complement integer format. The conversion is 1705 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1706 | Arithmetic, except that the conversion is always rounded toward zero. 1707 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1708 | the conversion overflows, the largest integer with the same sign as `a' is 1709 | returned. 1710 *----------------------------------------------------------------------------*/ 1711 1712 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1713 { 1714 flag aSign; 1715 int aExp; 1716 int shiftCount; 1717 uint32_t aSig; 1718 int32_t z; 1719 1720 aSig = extractFloat32Frac( a ); 1721 aExp = extractFloat32Exp( a ); 1722 aSign = extractFloat32Sign( a ); 1723 shiftCount = aExp - 0x8E; 1724 if ( 0 <= shiftCount ) { 1725 if ( float32_val(a) != 0xC7000000 ) { 1726 float_raise(float_flag_invalid, status); 1727 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1728 return 0x7FFF; 1729 } 1730 } 1731 return (int32_t) 0xffff8000; 1732 } 1733 else if ( aExp <= 0x7E ) { 1734 if ( aExp | aSig ) { 1735 status->float_exception_flags |= float_flag_inexact; 1736 } 1737 return 0; 1738 } 1739 shiftCount -= 0x10; 1740 aSig = ( aSig | 0x00800000 )<<8; 1741 z = aSig>>( - shiftCount ); 1742 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1743 status->float_exception_flags |= float_flag_inexact; 1744 } 1745 if ( aSign ) { 1746 z = - z; 1747 } 1748 return z; 1749 1750 } 1751 1752 /*---------------------------------------------------------------------------- 1753 | Returns the result of converting the single-precision floating-point value 1754 | `a' to the 64-bit two's complement integer format. The conversion is 1755 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1756 | Arithmetic---which means in particular that the conversion is rounded 1757 | according to the current rounding mode. If `a' is a NaN, the largest 1758 | positive integer is returned. Otherwise, if the conversion overflows, the 1759 | largest integer with the same sign as `a' is returned. 1760 *----------------------------------------------------------------------------*/ 1761 1762 int64_t float32_to_int64(float32 a, float_status *status) 1763 { 1764 flag aSign; 1765 int aExp; 1766 int shiftCount; 1767 uint32_t aSig; 1768 uint64_t aSig64, aSigExtra; 1769 a = float32_squash_input_denormal(a, status); 1770 1771 aSig = extractFloat32Frac( a ); 1772 aExp = extractFloat32Exp( a ); 1773 aSign = extractFloat32Sign( a ); 1774 shiftCount = 0xBE - aExp; 1775 if ( shiftCount < 0 ) { 1776 float_raise(float_flag_invalid, status); 1777 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1778 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1779 } 1780 return (int64_t) LIT64( 0x8000000000000000 ); 1781 } 1782 if ( aExp ) aSig |= 0x00800000; 1783 aSig64 = aSig; 1784 aSig64 <<= 40; 1785 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1786 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1787 1788 } 1789 1790 /*---------------------------------------------------------------------------- 1791 | Returns the result of converting the single-precision floating-point value 1792 | `a' to the 64-bit unsigned integer format. The conversion is 1793 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1794 | Arithmetic---which means in particular that the conversion is rounded 1795 | according to the current rounding mode. If `a' is a NaN, the largest 1796 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1797 | largest unsigned integer is returned. If the 'a' is negative, the result 1798 | is rounded and zero is returned; values that do not round to zero will 1799 | raise the inexact exception flag. 
1800 *----------------------------------------------------------------------------*/ 1801 1802 uint64_t float32_to_uint64(float32 a, float_status *status) 1803 { 1804 flag aSign; 1805 int aExp; 1806 int shiftCount; 1807 uint32_t aSig; 1808 uint64_t aSig64, aSigExtra; 1809 a = float32_squash_input_denormal(a, status); 1810 1811 aSig = extractFloat32Frac(a); 1812 aExp = extractFloat32Exp(a); 1813 aSign = extractFloat32Sign(a); 1814 if ((aSign) && (aExp > 126)) { 1815 float_raise(float_flag_invalid, status); 1816 if (float32_is_any_nan(a)) { 1817 return LIT64(0xFFFFFFFFFFFFFFFF); 1818 } else { 1819 return 0; 1820 } 1821 } 1822 shiftCount = 0xBE - aExp; 1823 if (aExp) { 1824 aSig |= 0x00800000; 1825 } 1826 if (shiftCount < 0) { 1827 float_raise(float_flag_invalid, status); 1828 return LIT64(0xFFFFFFFFFFFFFFFF); 1829 } 1830 1831 aSig64 = aSig; 1832 aSig64 <<= 40; 1833 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1834 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1835 } 1836 1837 /*---------------------------------------------------------------------------- 1838 | Returns the result of converting the single-precision floating-point value 1839 | `a' to the 64-bit unsigned integer format. The conversion is 1840 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1841 | Arithmetic, except that the conversion is always rounded toward zero. If 1842 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1843 | conversion overflows, the largest unsigned integer is returned. If the 1844 | 'a' is negative, the result is rounded and zero is returned; values that do 1845 | not round to zero will raise the inexact flag. 
1846 *----------------------------------------------------------------------------*/ 1847 1848 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 1849 { 1850 signed char current_rounding_mode = status->float_rounding_mode; 1851 set_float_rounding_mode(float_round_to_zero, status); 1852 int64_t v = float32_to_uint64(a, status); 1853 set_float_rounding_mode(current_rounding_mode, status); 1854 return v; 1855 } 1856 1857 /*---------------------------------------------------------------------------- 1858 | Returns the result of converting the single-precision floating-point value 1859 | `a' to the 64-bit two's complement integer format. The conversion is 1860 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1861 | Arithmetic, except that the conversion is always rounded toward zero. If 1862 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1863 | conversion overflows, the largest integer with the same sign as `a' is 1864 | returned. 1865 *----------------------------------------------------------------------------*/ 1866 1867 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 1868 { 1869 flag aSign; 1870 int aExp; 1871 int shiftCount; 1872 uint32_t aSig; 1873 uint64_t aSig64; 1874 int64_t z; 1875 a = float32_squash_input_denormal(a, status); 1876 1877 aSig = extractFloat32Frac( a ); 1878 aExp = extractFloat32Exp( a ); 1879 aSign = extractFloat32Sign( a ); 1880 shiftCount = aExp - 0xBE; 1881 if ( 0 <= shiftCount ) { 1882 if ( float32_val(a) != 0xDF000000 ) { 1883 float_raise(float_flag_invalid, status); 1884 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1885 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1886 } 1887 } 1888 return (int64_t) LIT64( 0x8000000000000000 ); 1889 } 1890 else if ( aExp <= 0x7E ) { 1891 if (aExp | aSig) { 1892 status->float_exception_flags |= float_flag_inexact; 1893 } 1894 return 0; 1895 } 1896 aSig64 = aSig | 0x00800000; 1897 aSig64 <<= 40; 1898 z = aSig64>>( - shiftCount ); 1899 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1900 status->float_exception_flags |= float_flag_inexact; 1901 } 1902 if ( aSign ) z = - z; 1903 return z; 1904 1905 } 1906 1907 /*---------------------------------------------------------------------------- 1908 | Returns the result of converting the single-precision floating-point value 1909 | `a' to the double-precision floating-point format. The conversion is 1910 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1911 | Arithmetic. 1912 *----------------------------------------------------------------------------*/ 1913 1914 float64 float32_to_float64(float32 a, float_status *status) 1915 { 1916 flag aSign; 1917 int aExp; 1918 uint32_t aSig; 1919 a = float32_squash_input_denormal(a, status); 1920 1921 aSig = extractFloat32Frac( a ); 1922 aExp = extractFloat32Exp( a ); 1923 aSign = extractFloat32Sign( a ); 1924 if ( aExp == 0xFF ) { 1925 if (aSig) { 1926 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1927 } 1928 return packFloat64( aSign, 0x7FF, 0 ); 1929 } 1930 if ( aExp == 0 ) { 1931 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1932 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1933 --aExp; 1934 } 1935 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1936 1937 } 1938 1939 /*---------------------------------------------------------------------------- 1940 | Returns the result of converting the single-precision floating-point value 1941 | `a' to the extended double-precision floating-point format. 
The conversion 1942 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1943 | Arithmetic. 1944 *----------------------------------------------------------------------------*/ 1945 1946 floatx80 float32_to_floatx80(float32 a, float_status *status) 1947 { 1948 flag aSign; 1949 int aExp; 1950 uint32_t aSig; 1951 1952 a = float32_squash_input_denormal(a, status); 1953 aSig = extractFloat32Frac( a ); 1954 aExp = extractFloat32Exp( a ); 1955 aSign = extractFloat32Sign( a ); 1956 if ( aExp == 0xFF ) { 1957 if (aSig) { 1958 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1959 } 1960 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1961 } 1962 if ( aExp == 0 ) { 1963 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1964 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1965 } 1966 aSig |= 0x00800000; 1967 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1968 1969 } 1970 1971 /*---------------------------------------------------------------------------- 1972 | Returns the result of converting the single-precision floating-point value 1973 | `a' to the double-precision floating-point format. The conversion is 1974 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1975 | Arithmetic. 
1976 *----------------------------------------------------------------------------*/ 1977 1978 float128 float32_to_float128(float32 a, float_status *status) 1979 { 1980 flag aSign; 1981 int aExp; 1982 uint32_t aSig; 1983 1984 a = float32_squash_input_denormal(a, status); 1985 aSig = extractFloat32Frac( a ); 1986 aExp = extractFloat32Exp( a ); 1987 aSign = extractFloat32Sign( a ); 1988 if ( aExp == 0xFF ) { 1989 if (aSig) { 1990 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1991 } 1992 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1993 } 1994 if ( aExp == 0 ) { 1995 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1996 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1997 --aExp; 1998 } 1999 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 2000 2001 } 2002 2003 /*---------------------------------------------------------------------------- 2004 | Rounds the single-precision floating-point value `a' to an integer, and 2005 | returns the result as a single-precision floating-point value. The 2006 | operation is performed according to the IEC/IEEE Standard for Binary 2007 | Floating-Point Arithmetic. 
2008 *----------------------------------------------------------------------------*/ 2009 2010 float32 float32_round_to_int(float32 a, float_status *status) 2011 { 2012 flag aSign; 2013 int aExp; 2014 uint32_t lastBitMask, roundBitsMask; 2015 uint32_t z; 2016 a = float32_squash_input_denormal(a, status); 2017 2018 aExp = extractFloat32Exp( a ); 2019 if ( 0x96 <= aExp ) { 2020 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 2021 return propagateFloat32NaN(a, a, status); 2022 } 2023 return a; 2024 } 2025 if ( aExp <= 0x7E ) { 2026 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 2027 status->float_exception_flags |= float_flag_inexact; 2028 aSign = extractFloat32Sign( a ); 2029 switch (status->float_rounding_mode) { 2030 case float_round_nearest_even: 2031 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 2032 return packFloat32( aSign, 0x7F, 0 ); 2033 } 2034 break; 2035 case float_round_ties_away: 2036 if (aExp == 0x7E) { 2037 return packFloat32(aSign, 0x7F, 0); 2038 } 2039 break; 2040 case float_round_down: 2041 return make_float32(aSign ? 0xBF800000 : 0); 2042 case float_round_up: 2043 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 2044 } 2045 return packFloat32( aSign, 0, 0 ); 2046 } 2047 lastBitMask = 1; 2048 lastBitMask <<= 0x96 - aExp; 2049 roundBitsMask = lastBitMask - 1; 2050 z = float32_val(a); 2051 switch (status->float_rounding_mode) { 2052 case float_round_nearest_even: 2053 z += lastBitMask>>1; 2054 if ((z & roundBitsMask) == 0) { 2055 z &= ~lastBitMask; 2056 } 2057 break; 2058 case float_round_ties_away: 2059 z += lastBitMask >> 1; 2060 break; 2061 case float_round_to_zero: 2062 break; 2063 case float_round_up: 2064 if (!extractFloat32Sign(make_float32(z))) { 2065 z += roundBitsMask; 2066 } 2067 break; 2068 case float_round_down: 2069 if (extractFloat32Sign(make_float32(z))) { 2070 z += roundBitsMask; 2071 } 2072 break; 2073 default: 2074 abort(); 2075 } 2076 z &= ~ roundBitsMask; 2077 if (z != float32_val(a)) { 2078 status->float_exception_flags |= float_flag_inexact; 2079 } 2080 return make_float32(z); 2081 2082 } 2083 2084 /*---------------------------------------------------------------------------- 2085 | Returns the result of adding the absolute values of the single-precision 2086 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2087 | before being returned. `zSign' is ignored if the result is a NaN. 2088 | The addition is performed according to the IEC/IEEE Standard for Binary 2089 | Floating-Point Arithmetic. 
2090 *----------------------------------------------------------------------------*/ 2091 2092 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, 2093 float_status *status) 2094 { 2095 int aExp, bExp, zExp; 2096 uint32_t aSig, bSig, zSig; 2097 int expDiff; 2098 2099 aSig = extractFloat32Frac( a ); 2100 aExp = extractFloat32Exp( a ); 2101 bSig = extractFloat32Frac( b ); 2102 bExp = extractFloat32Exp( b ); 2103 expDiff = aExp - bExp; 2104 aSig <<= 6; 2105 bSig <<= 6; 2106 if ( 0 < expDiff ) { 2107 if ( aExp == 0xFF ) { 2108 if (aSig) { 2109 return propagateFloat32NaN(a, b, status); 2110 } 2111 return a; 2112 } 2113 if ( bExp == 0 ) { 2114 --expDiff; 2115 } 2116 else { 2117 bSig |= 0x20000000; 2118 } 2119 shift32RightJamming( bSig, expDiff, &bSig ); 2120 zExp = aExp; 2121 } 2122 else if ( expDiff < 0 ) { 2123 if ( bExp == 0xFF ) { 2124 if (bSig) { 2125 return propagateFloat32NaN(a, b, status); 2126 } 2127 return packFloat32( zSign, 0xFF, 0 ); 2128 } 2129 if ( aExp == 0 ) { 2130 ++expDiff; 2131 } 2132 else { 2133 aSig |= 0x20000000; 2134 } 2135 shift32RightJamming( aSig, - expDiff, &aSig ); 2136 zExp = bExp; 2137 } 2138 else { 2139 if ( aExp == 0xFF ) { 2140 if (aSig | bSig) { 2141 return propagateFloat32NaN(a, b, status); 2142 } 2143 return a; 2144 } 2145 if ( aExp == 0 ) { 2146 if (status->flush_to_zero) { 2147 if (aSig | bSig) { 2148 float_raise(float_flag_output_denormal, status); 2149 } 2150 return packFloat32(zSign, 0, 0); 2151 } 2152 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 2153 } 2154 zSig = 0x40000000 + aSig + bSig; 2155 zExp = aExp; 2156 goto roundAndPack; 2157 } 2158 aSig |= 0x20000000; 2159 zSig = ( aSig + bSig )<<1; 2160 --zExp; 2161 if ( (int32_t) zSig < 0 ) { 2162 zSig = aSig + bSig; 2163 ++zExp; 2164 } 2165 roundAndPack: 2166 return roundAndPackFloat32(zSign, zExp, zSig, status); 2167 2168 } 2169 2170 /*---------------------------------------------------------------------------- 2171 | Returns the result of subtracting the 
absolute values of the single- 2172 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2173 | difference is negated before being returned. `zSign' is ignored if the 2174 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2175 | Standard for Binary Floating-Point Arithmetic. 2176 *----------------------------------------------------------------------------*/ 2177 2178 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, 2179 float_status *status) 2180 { 2181 int aExp, bExp, zExp; 2182 uint32_t aSig, bSig, zSig; 2183 int expDiff; 2184 2185 aSig = extractFloat32Frac( a ); 2186 aExp = extractFloat32Exp( a ); 2187 bSig = extractFloat32Frac( b ); 2188 bExp = extractFloat32Exp( b ); 2189 expDiff = aExp - bExp; 2190 aSig <<= 7; 2191 bSig <<= 7; 2192 if ( 0 < expDiff ) goto aExpBigger; 2193 if ( expDiff < 0 ) goto bExpBigger; 2194 if ( aExp == 0xFF ) { 2195 if (aSig | bSig) { 2196 return propagateFloat32NaN(a, b, status); 2197 } 2198 float_raise(float_flag_invalid, status); 2199 return float32_default_nan(status); 2200 } 2201 if ( aExp == 0 ) { 2202 aExp = 1; 2203 bExp = 1; 2204 } 2205 if ( bSig < aSig ) goto aBigger; 2206 if ( aSig < bSig ) goto bBigger; 2207 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); 2208 bExpBigger: 2209 if ( bExp == 0xFF ) { 2210 if (bSig) { 2211 return propagateFloat32NaN(a, b, status); 2212 } 2213 return packFloat32( zSign ^ 1, 0xFF, 0 ); 2214 } 2215 if ( aExp == 0 ) { 2216 ++expDiff; 2217 } 2218 else { 2219 aSig |= 0x40000000; 2220 } 2221 shift32RightJamming( aSig, - expDiff, &aSig ); 2222 bSig |= 0x40000000; 2223 bBigger: 2224 zSig = bSig - aSig; 2225 zExp = bExp; 2226 zSign ^= 1; 2227 goto normalizeRoundAndPack; 2228 aExpBigger: 2229 if ( aExp == 0xFF ) { 2230 if (aSig) { 2231 return propagateFloat32NaN(a, b, status); 2232 } 2233 return a; 2234 } 2235 if ( bExp == 0 ) { 2236 --expDiff; 2237 } 2238 else { 2239 bSig |= 0x40000000; 2240 } 2241 shift32RightJamming( bSig, 
expDiff, &bSig ); 2242 aSig |= 0x40000000; 2243 aBigger: 2244 zSig = aSig - bSig; 2245 zExp = aExp; 2246 normalizeRoundAndPack: 2247 --zExp; 2248 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); 2249 2250 } 2251 2252 /*---------------------------------------------------------------------------- 2253 | Returns the result of adding the single-precision floating-point values `a' 2254 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2255 | Binary Floating-Point Arithmetic. 2256 *----------------------------------------------------------------------------*/ 2257 2258 float32 float32_add(float32 a, float32 b, float_status *status) 2259 { 2260 flag aSign, bSign; 2261 a = float32_squash_input_denormal(a, status); 2262 b = float32_squash_input_denormal(b, status); 2263 2264 aSign = extractFloat32Sign( a ); 2265 bSign = extractFloat32Sign( b ); 2266 if ( aSign == bSign ) { 2267 return addFloat32Sigs(a, b, aSign, status); 2268 } 2269 else { 2270 return subFloat32Sigs(a, b, aSign, status); 2271 } 2272 2273 } 2274 2275 /*---------------------------------------------------------------------------- 2276 | Returns the result of subtracting the single-precision floating-point values 2277 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2278 | for Binary Floating-Point Arithmetic. 
2279 *----------------------------------------------------------------------------*/ 2280 2281 float32 float32_sub(float32 a, float32 b, float_status *status) 2282 { 2283 flag aSign, bSign; 2284 a = float32_squash_input_denormal(a, status); 2285 b = float32_squash_input_denormal(b, status); 2286 2287 aSign = extractFloat32Sign( a ); 2288 bSign = extractFloat32Sign( b ); 2289 if ( aSign == bSign ) { 2290 return subFloat32Sigs(a, b, aSign, status); 2291 } 2292 else { 2293 return addFloat32Sigs(a, b, aSign, status); 2294 } 2295 2296 } 2297 2298 /*---------------------------------------------------------------------------- 2299 | Returns the result of multiplying the single-precision floating-point values 2300 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2301 | for Binary Floating-Point Arithmetic. 2302 *----------------------------------------------------------------------------*/ 2303 2304 float32 float32_mul(float32 a, float32 b, float_status *status) 2305 { 2306 flag aSign, bSign, zSign; 2307 int aExp, bExp, zExp; 2308 uint32_t aSig, bSig; 2309 uint64_t zSig64; 2310 uint32_t zSig; 2311 2312 a = float32_squash_input_denormal(a, status); 2313 b = float32_squash_input_denormal(b, status); 2314 2315 aSig = extractFloat32Frac( a ); 2316 aExp = extractFloat32Exp( a ); 2317 aSign = extractFloat32Sign( a ); 2318 bSig = extractFloat32Frac( b ); 2319 bExp = extractFloat32Exp( b ); 2320 bSign = extractFloat32Sign( b ); 2321 zSign = aSign ^ bSign; 2322 if ( aExp == 0xFF ) { 2323 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2324 return propagateFloat32NaN(a, b, status); 2325 } 2326 if ( ( bExp | bSig ) == 0 ) { 2327 float_raise(float_flag_invalid, status); 2328 return float32_default_nan(status); 2329 } 2330 return packFloat32( zSign, 0xFF, 0 ); 2331 } 2332 if ( bExp == 0xFF ) { 2333 if (bSig) { 2334 return propagateFloat32NaN(a, b, status); 2335 } 2336 if ( ( aExp | aSig ) == 0 ) { 2337 float_raise(float_flag_invalid, status); 2338 return 
float32_default_nan(status); 2339 } 2340 return packFloat32( zSign, 0xFF, 0 ); 2341 } 2342 if ( aExp == 0 ) { 2343 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2344 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2345 } 2346 if ( bExp == 0 ) { 2347 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2348 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2349 } 2350 zExp = aExp + bExp - 0x7F; 2351 aSig = ( aSig | 0x00800000 )<<7; 2352 bSig = ( bSig | 0x00800000 )<<8; 2353 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2354 zSig = zSig64; 2355 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2356 zSig <<= 1; 2357 --zExp; 2358 } 2359 return roundAndPackFloat32(zSign, zExp, zSig, status); 2360 2361 } 2362 2363 /*---------------------------------------------------------------------------- 2364 | Returns the result of dividing the single-precision floating-point value `a' 2365 | by the corresponding value `b'. The operation is performed according to the 2366 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2367 *----------------------------------------------------------------------------*/ 2368 2369 float32 float32_div(float32 a, float32 b, float_status *status) 2370 { 2371 flag aSign, bSign, zSign; 2372 int aExp, bExp, zExp; 2373 uint32_t aSig, bSig, zSig; 2374 a = float32_squash_input_denormal(a, status); 2375 b = float32_squash_input_denormal(b, status); 2376 2377 aSig = extractFloat32Frac( a ); 2378 aExp = extractFloat32Exp( a ); 2379 aSign = extractFloat32Sign( a ); 2380 bSig = extractFloat32Frac( b ); 2381 bExp = extractFloat32Exp( b ); 2382 bSign = extractFloat32Sign( b ); 2383 zSign = aSign ^ bSign; 2384 if ( aExp == 0xFF ) { 2385 if (aSig) { 2386 return propagateFloat32NaN(a, b, status); 2387 } 2388 if ( bExp == 0xFF ) { 2389 if (bSig) { 2390 return propagateFloat32NaN(a, b, status); 2391 } 2392 float_raise(float_flag_invalid, status); 2393 return float32_default_nan(status); 2394 } 2395 return packFloat32( zSign, 0xFF, 0 ); 2396 } 2397 if ( bExp == 0xFF ) { 2398 if (bSig) { 2399 return propagateFloat32NaN(a, b, status); 2400 } 2401 return packFloat32( zSign, 0, 0 ); 2402 } 2403 if ( bExp == 0 ) { 2404 if ( bSig == 0 ) { 2405 if ( ( aExp | aSig ) == 0 ) { 2406 float_raise(float_flag_invalid, status); 2407 return float32_default_nan(status); 2408 } 2409 float_raise(float_flag_divbyzero, status); 2410 return packFloat32( zSign, 0xFF, 0 ); 2411 } 2412 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2413 } 2414 if ( aExp == 0 ) { 2415 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2416 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2417 } 2418 zExp = aExp - bExp + 0x7D; 2419 aSig = ( aSig | 0x00800000 )<<7; 2420 bSig = ( bSig | 0x00800000 )<<8; 2421 if ( bSig <= ( aSig + aSig ) ) { 2422 aSig >>= 1; 2423 ++zExp; 2424 } 2425 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2426 if ( ( zSig & 0x3F ) == 0 ) { 2427 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2428 } 2429 return roundAndPackFloat32(zSign, zExp, zSig, status); 2430 2431 } 2432 2433 
/*---------------------------------------------------------------------------- 2434 | Returns the remainder of the single-precision floating-point value `a' 2435 | with respect to the corresponding value `b'. The operation is performed 2436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2437 *----------------------------------------------------------------------------*/ 2438 2439 float32 float32_rem(float32 a, float32 b, float_status *status) 2440 { 2441 flag aSign, zSign; 2442 int aExp, bExp, expDiff; 2443 uint32_t aSig, bSig; 2444 uint32_t q; 2445 uint64_t aSig64, bSig64, q64; 2446 uint32_t alternateASig; 2447 int32_t sigMean; 2448 a = float32_squash_input_denormal(a, status); 2449 b = float32_squash_input_denormal(b, status); 2450 2451 aSig = extractFloat32Frac( a ); 2452 aExp = extractFloat32Exp( a ); 2453 aSign = extractFloat32Sign( a ); 2454 bSig = extractFloat32Frac( b ); 2455 bExp = extractFloat32Exp( b ); 2456 if ( aExp == 0xFF ) { 2457 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2458 return propagateFloat32NaN(a, b, status); 2459 } 2460 float_raise(float_flag_invalid, status); 2461 return float32_default_nan(status); 2462 } 2463 if ( bExp == 0xFF ) { 2464 if (bSig) { 2465 return propagateFloat32NaN(a, b, status); 2466 } 2467 return a; 2468 } 2469 if ( bExp == 0 ) { 2470 if ( bSig == 0 ) { 2471 float_raise(float_flag_invalid, status); 2472 return float32_default_nan(status); 2473 } 2474 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2475 } 2476 if ( aExp == 0 ) { 2477 if ( aSig == 0 ) return a; 2478 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2479 } 2480 expDiff = aExp - bExp; 2481 aSig |= 0x00800000; 2482 bSig |= 0x00800000; 2483 if ( expDiff < 32 ) { 2484 aSig <<= 8; 2485 bSig <<= 8; 2486 if ( expDiff < 0 ) { 2487 if ( expDiff < -1 ) return a; 2488 aSig >>= 1; 2489 } 2490 q = ( bSig <= aSig ); 2491 if ( q ) aSig -= bSig; 2492 if ( 0 < expDiff ) { 2493 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2494 q >>= 32 - expDiff; 
2495 bSig >>= 2; 2496 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2497 } 2498 else { 2499 aSig >>= 2; 2500 bSig >>= 2; 2501 } 2502 } 2503 else { 2504 if ( bSig <= aSig ) aSig -= bSig; 2505 aSig64 = ( (uint64_t) aSig )<<40; 2506 bSig64 = ( (uint64_t) bSig )<<40; 2507 expDiff -= 64; 2508 while ( 0 < expDiff ) { 2509 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2510 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2511 aSig64 = - ( ( bSig * q64 )<<38 ); 2512 expDiff -= 62; 2513 } 2514 expDiff += 64; 2515 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2516 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2517 q = q64>>( 64 - expDiff ); 2518 bSig <<= 6; 2519 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2520 } 2521 do { 2522 alternateASig = aSig; 2523 ++q; 2524 aSig -= bSig; 2525 } while ( 0 <= (int32_t) aSig ); 2526 sigMean = aSig + alternateASig; 2527 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2528 aSig = alternateASig; 2529 } 2530 zSign = ( (int32_t) aSig < 0 ); 2531 if ( zSign ) aSig = - aSig; 2532 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 2533 } 2534 2535 /*---------------------------------------------------------------------------- 2536 | Returns the result of multiplying the single-precision floating-point values 2537 | `a' and `b' then adding 'c', with no intermediate rounding step after the 2538 | multiplication. The operation is performed according to the IEC/IEEE 2539 | Standard for Binary Floating-Point Arithmetic 754-2008. 2540 | The flags argument allows the caller to select negation of the 2541 | addend, the intermediate product, or the final result. (The difference 2542 | between this and having the caller do a separate negation is that negating 2543 | externally will flip the sign bit on NaNs.) 
2544 *----------------------------------------------------------------------------*/ 2545 2546 float32 float32_muladd(float32 a, float32 b, float32 c, int flags, 2547 float_status *status) 2548 { 2549 flag aSign, bSign, cSign, zSign; 2550 int aExp, bExp, cExp, pExp, zExp, expDiff; 2551 uint32_t aSig, bSig, cSig; 2552 flag pInf, pZero, pSign; 2553 uint64_t pSig64, cSig64, zSig64; 2554 uint32_t pSig; 2555 int shiftcount; 2556 flag signflip, infzero; 2557 2558 a = float32_squash_input_denormal(a, status); 2559 b = float32_squash_input_denormal(b, status); 2560 c = float32_squash_input_denormal(c, status); 2561 aSig = extractFloat32Frac(a); 2562 aExp = extractFloat32Exp(a); 2563 aSign = extractFloat32Sign(a); 2564 bSig = extractFloat32Frac(b); 2565 bExp = extractFloat32Exp(b); 2566 bSign = extractFloat32Sign(b); 2567 cSig = extractFloat32Frac(c); 2568 cExp = extractFloat32Exp(c); 2569 cSign = extractFloat32Sign(c); 2570 2571 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || 2572 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); 2573 2574 /* It is implementation-defined whether the cases of (0,inf,qnan) 2575 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 2576 * they return if they do), so we have to hand this information 2577 * off to the target-specific pick-a-NaN routine. 2578 */ 2579 if (((aExp == 0xff) && aSig) || 2580 ((bExp == 0xff) && bSig) || 2581 ((cExp == 0xff) && cSig)) { 2582 return propagateFloat32MulAddNaN(a, b, c, infzero, status); 2583 } 2584 2585 if (infzero) { 2586 float_raise(float_flag_invalid, status); 2587 return float32_default_nan(status); 2588 } 2589 2590 if (flags & float_muladd_negate_c) { 2591 cSign ^= 1; 2592 } 2593 2594 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 2595 2596 /* Work out the sign and type of the product */ 2597 pSign = aSign ^ bSign; 2598 if (flags & float_muladd_negate_product) { 2599 pSign ^= 1; 2600 } 2601 pInf = (aExp == 0xff) || (bExp == 0xff); 2602 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 2603 2604 if (cExp == 0xff) { 2605 if (pInf && (pSign ^ cSign)) { 2606 /* addition of opposite-signed infinities => InvalidOperation */ 2607 float_raise(float_flag_invalid, status); 2608 return float32_default_nan(status); 2609 } 2610 /* Otherwise generate an infinity of the same sign */ 2611 return packFloat32(cSign ^ signflip, 0xff, 0); 2612 } 2613 2614 if (pInf) { 2615 return packFloat32(pSign ^ signflip, 0xff, 0); 2616 } 2617 2618 if (pZero) { 2619 if (cExp == 0) { 2620 if (cSig == 0) { 2621 /* Adding two exact zeroes */ 2622 if (pSign == cSign) { 2623 zSign = pSign; 2624 } else if (status->float_rounding_mode == float_round_down) { 2625 zSign = 1; 2626 } else { 2627 zSign = 0; 2628 } 2629 return packFloat32(zSign ^ signflip, 0, 0); 2630 } 2631 /* Exact zero plus a denorm */ 2632 if (status->flush_to_zero) { 2633 float_raise(float_flag_output_denormal, status); 2634 return packFloat32(cSign ^ signflip, 0, 0); 2635 } 2636 } 2637 /* Zero plus something non-zero : just return the something */ 2638 if (flags & float_muladd_halve_result) { 2639 if (cExp == 0) { 2640 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2641 } 2642 /* Subtract one to halve, and one again because roundAndPackFloat32 2643 * wants one less than the true exponent. 2644 */ 2645 cExp -= 2; 2646 cSig = (cSig | 0x00800000) << 7; 2647 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status); 2648 } 2649 return packFloat32(cSign ^ signflip, cExp, cSig); 2650 } 2651 2652 if (aExp == 0) { 2653 normalizeFloat32Subnormal(aSig, &aExp, &aSig); 2654 } 2655 if (bExp == 0) { 2656 normalizeFloat32Subnormal(bSig, &bExp, &bSig); 2657 } 2658 2659 /* Calculate the actual result a * b + c */ 2660 2661 /* Multiply first; this is easy. 
*/ 2662 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f 2663 * because we want the true exponent, not the "one-less-than" 2664 * flavour that roundAndPackFloat32() takes. 2665 */ 2666 pExp = aExp + bExp - 0x7e; 2667 aSig = (aSig | 0x00800000) << 7; 2668 bSig = (bSig | 0x00800000) << 8; 2669 pSig64 = (uint64_t)aSig * bSig; 2670 if ((int64_t)(pSig64 << 1) >= 0) { 2671 pSig64 <<= 1; 2672 pExp--; 2673 } 2674 2675 zSign = pSign ^ signflip; 2676 2677 /* Now pSig64 is the significand of the multiply, with the explicit bit in 2678 * position 62. 2679 */ 2680 if (cExp == 0) { 2681 if (!cSig) { 2682 /* Throw out the special case of c being an exact zero now */ 2683 shift64RightJamming(pSig64, 32, &pSig64); 2684 pSig = pSig64; 2685 if (flags & float_muladd_halve_result) { 2686 pExp--; 2687 } 2688 return roundAndPackFloat32(zSign, pExp - 1, 2689 pSig, status); 2690 } 2691 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2692 } 2693 2694 cSig64 = (uint64_t)cSig << (62 - 23); 2695 cSig64 |= LIT64(0x4000000000000000); 2696 expDiff = pExp - cExp; 2697 2698 if (pSign == cSign) { 2699 /* Addition */ 2700 if (expDiff > 0) { 2701 /* scale c to match p */ 2702 shift64RightJamming(cSig64, expDiff, &cSig64); 2703 zExp = pExp; 2704 } else if (expDiff < 0) { 2705 /* scale p to match c */ 2706 shift64RightJamming(pSig64, -expDiff, &pSig64); 2707 zExp = cExp; 2708 } else { 2709 /* no scaling needed */ 2710 zExp = cExp; 2711 } 2712 /* Add significands and make sure explicit bit ends up in posn 62 */ 2713 zSig64 = pSig64 + cSig64; 2714 if ((int64_t)zSig64 < 0) { 2715 shift64RightJamming(zSig64, 1, &zSig64); 2716 } else { 2717 zExp--; 2718 } 2719 } else { 2720 /* Subtraction */ 2721 if (expDiff > 0) { 2722 shift64RightJamming(cSig64, expDiff, &cSig64); 2723 zSig64 = pSig64 - cSig64; 2724 zExp = pExp; 2725 } else if (expDiff < 0) { 2726 shift64RightJamming(pSig64, -expDiff, &pSig64); 2727 zSig64 = cSig64 - pSig64; 2728 zExp = cExp; 2729 zSign ^= 1; 2730 } else { 2731 zExp = pExp; 2732 
if (cSig64 < pSig64) { 2733 zSig64 = pSig64 - cSig64; 2734 } else if (pSig64 < cSig64) { 2735 zSig64 = cSig64 - pSig64; 2736 zSign ^= 1; 2737 } else { 2738 /* Exact zero */ 2739 zSign = signflip; 2740 if (status->float_rounding_mode == float_round_down) { 2741 zSign ^= 1; 2742 } 2743 return packFloat32(zSign, 0, 0); 2744 } 2745 } 2746 --zExp; 2747 /* Normalize to put the explicit bit back into bit 62. */ 2748 shiftcount = countLeadingZeros64(zSig64) - 1; 2749 zSig64 <<= shiftcount; 2750 zExp -= shiftcount; 2751 } 2752 if (flags & float_muladd_halve_result) { 2753 zExp--; 2754 } 2755 2756 shift64RightJamming(zSig64, 32, &zSig64); 2757 return roundAndPackFloat32(zSign, zExp, zSig64, status); 2758 } 2759 2760 2761 /*---------------------------------------------------------------------------- 2762 | Returns the square root of the single-precision floating-point value `a'. 2763 | The operation is performed according to the IEC/IEEE Standard for Binary 2764 | Floating-Point Arithmetic. 2765 *----------------------------------------------------------------------------*/ 2766 2767 float32 float32_sqrt(float32 a, float_status *status) 2768 { 2769 flag aSign; 2770 int aExp, zExp; 2771 uint32_t aSig, zSig; 2772 uint64_t rem, term; 2773 a = float32_squash_input_denormal(a, status); 2774 2775 aSig = extractFloat32Frac( a ); 2776 aExp = extractFloat32Exp( a ); 2777 aSign = extractFloat32Sign( a ); 2778 if ( aExp == 0xFF ) { 2779 if (aSig) { 2780 return propagateFloat32NaN(a, float32_zero, status); 2781 } 2782 if ( ! 
aSign ) return a; 2783 float_raise(float_flag_invalid, status); 2784 return float32_default_nan(status); 2785 } 2786 if ( aSign ) { 2787 if ( ( aExp | aSig ) == 0 ) return a; 2788 float_raise(float_flag_invalid, status); 2789 return float32_default_nan(status); 2790 } 2791 if ( aExp == 0 ) { 2792 if ( aSig == 0 ) return float32_zero; 2793 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2794 } 2795 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2796 aSig = ( aSig | 0x00800000 )<<8; 2797 zSig = estimateSqrt32( aExp, aSig ) + 2; 2798 if ( ( zSig & 0x7F ) <= 5 ) { 2799 if ( zSig < 2 ) { 2800 zSig = 0x7FFFFFFF; 2801 goto roundAndPack; 2802 } 2803 aSig >>= aExp & 1; 2804 term = ( (uint64_t) zSig ) * zSig; 2805 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2806 while ( (int64_t) rem < 0 ) { 2807 --zSig; 2808 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2809 } 2810 zSig |= ( rem != 0 ); 2811 } 2812 shift32RightJamming( zSig, 1, &zSig ); 2813 roundAndPack: 2814 return roundAndPackFloat32(0, zExp, zSig, status); 2815 2816 } 2817 2818 /*---------------------------------------------------------------------------- 2819 | Returns the binary exponential of the single-precision floating-point value 2820 | `a'. The operation is performed according to the IEC/IEEE Standard for 2821 | Binary Floating-Point Arithmetic. 2822 | 2823 | Uses the following identities: 2824 | 2825 | 1. ------------------------------------------------------------------------- 2826 | x x*ln(2) 2827 | 2 = e 2828 | 2829 | 2. ------------------------------------------------------------------------- 2830 | 2 3 4 5 n 2831 | x x x x x x x 2832 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2833 | 1! 2! 3! 4! 5! n! 
2834 *----------------------------------------------------------------------------*/ 2835 2836 static const float64 float32_exp2_coefficients[15] = 2837 { 2838 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2839 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2840 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2841 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2842 const_float64( 0x3f81111111111111ll ), /* 5 */ 2843 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2844 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2845 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2846 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2847 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2848 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2849 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2850 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2851 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2852 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2853 }; 2854 2855 float32 float32_exp2(float32 a, float_status *status) 2856 { 2857 flag aSign; 2858 int aExp; 2859 uint32_t aSig; 2860 float64 r, x, xn; 2861 int i; 2862 a = float32_squash_input_denormal(a, status); 2863 2864 aSig = extractFloat32Frac( a ); 2865 aExp = extractFloat32Exp( a ); 2866 aSign = extractFloat32Sign( a ); 2867 2868 if ( aExp == 0xFF) { 2869 if (aSig) { 2870 return propagateFloat32NaN(a, float32_zero, status); 2871 } 2872 return (aSign) ? 
float32_zero : a; 2873 } 2874 if (aExp == 0) { 2875 if (aSig == 0) return float32_one; 2876 } 2877 2878 float_raise(float_flag_inexact, status); 2879 2880 /* ******************************* */ 2881 /* using float64 for approximation */ 2882 /* ******************************* */ 2883 x = float32_to_float64(a, status); 2884 x = float64_mul(x, float64_ln2, status); 2885 2886 xn = x; 2887 r = float64_one; 2888 for (i = 0 ; i < 15 ; i++) { 2889 float64 f; 2890 2891 f = float64_mul(xn, float32_exp2_coefficients[i], status); 2892 r = float64_add(r, f, status); 2893 2894 xn = float64_mul(xn, x, status); 2895 } 2896 2897 return float64_to_float32(r, status); 2898 } 2899 2900 /*---------------------------------------------------------------------------- 2901 | Returns the binary log of the single-precision floating-point value `a'. 2902 | The operation is performed according to the IEC/IEEE Standard for Binary 2903 | Floating-Point Arithmetic. 2904 *----------------------------------------------------------------------------*/ 2905 float32 float32_log2(float32 a, float_status *status) 2906 { 2907 flag aSign, zSign; 2908 int aExp; 2909 uint32_t aSig, zSig, i; 2910 2911 a = float32_squash_input_denormal(a, status); 2912 aSig = extractFloat32Frac( a ); 2913 aExp = extractFloat32Exp( a ); 2914 aSign = extractFloat32Sign( a ); 2915 2916 if ( aExp == 0 ) { 2917 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2918 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2919 } 2920 if ( aSign ) { 2921 float_raise(float_flag_invalid, status); 2922 return float32_default_nan(status); 2923 } 2924 if ( aExp == 0xFF ) { 2925 if (aSig) { 2926 return propagateFloat32NaN(a, float32_zero, status); 2927 } 2928 return a; 2929 } 2930 2931 aExp -= 0x7F; 2932 aSig |= 0x00800000; 2933 zSign = aExp < 0; 2934 zSig = aExp << 23; 2935 2936 for (i = 1 << 22; i > 0; i >>= 1) { 2937 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2938 if ( aSig & 0x01000000 ) { 2939 aSig >>= 1; 2940 zSig |= i; 2941 } 2942 } 2943 
2944 if ( zSign ) 2945 zSig = -zSig; 2946 2947 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 2948 } 2949 2950 /*---------------------------------------------------------------------------- 2951 | Returns 1 if the single-precision floating-point value `a' is equal to 2952 | the corresponding value `b', and 0 otherwise. The invalid exception is 2953 | raised if either operand is a NaN. Otherwise, the comparison is performed 2954 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2955 *----------------------------------------------------------------------------*/ 2956 2957 int float32_eq(float32 a, float32 b, float_status *status) 2958 { 2959 uint32_t av, bv; 2960 a = float32_squash_input_denormal(a, status); 2961 b = float32_squash_input_denormal(b, status); 2962 2963 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2964 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2965 ) { 2966 float_raise(float_flag_invalid, status); 2967 return 0; 2968 } 2969 av = float32_val(a); 2970 bv = float32_val(b); 2971 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2972 } 2973 2974 /*---------------------------------------------------------------------------- 2975 | Returns 1 if the single-precision floating-point value `a' is less than 2976 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2977 | exception is raised if either operand is a NaN. The comparison is performed 2978 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2979 *----------------------------------------------------------------------------*/ 2980 2981 int float32_le(float32 a, float32 b, float_status *status) 2982 { 2983 flag aSign, bSign; 2984 uint32_t av, bv; 2985 a = float32_squash_input_denormal(a, status); 2986 b = float32_squash_input_denormal(b, status); 2987 2988 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2989 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2990 ) { 2991 float_raise(float_flag_invalid, status); 2992 return 0; 2993 } 2994 aSign = extractFloat32Sign( a ); 2995 bSign = extractFloat32Sign( b ); 2996 av = float32_val(a); 2997 bv = float32_val(b); 2998 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2999 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3000 3001 } 3002 3003 /*---------------------------------------------------------------------------- 3004 | Returns 1 if the single-precision floating-point value `a' is less than 3005 | the corresponding value `b', and 0 otherwise. The invalid exception is 3006 | raised if either operand is a NaN. The comparison is performed according 3007 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3008 *----------------------------------------------------------------------------*/ 3009 3010 int float32_lt(float32 a, float32 b, float_status *status) 3011 { 3012 flag aSign, bSign; 3013 uint32_t av, bv; 3014 a = float32_squash_input_denormal(a, status); 3015 b = float32_squash_input_denormal(b, status); 3016 3017 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3018 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3019 ) { 3020 float_raise(float_flag_invalid, status); 3021 return 0; 3022 } 3023 aSign = extractFloat32Sign( a ); 3024 bSign = extractFloat32Sign( b ); 3025 av = float32_val(a); 3026 bv = float32_val(b); 3027 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3028 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3029 3030 } 3031 3032 /*---------------------------------------------------------------------------- 3033 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3034 | be compared, and 0 otherwise. The invalid exception is raised if either 3035 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3036 | Standard for Binary Floating-Point Arithmetic. 3037 *----------------------------------------------------------------------------*/ 3038 3039 int float32_unordered(float32 a, float32 b, float_status *status) 3040 { 3041 a = float32_squash_input_denormal(a, status); 3042 b = float32_squash_input_denormal(b, status); 3043 3044 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3045 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3046 ) { 3047 float_raise(float_flag_invalid, status); 3048 return 1; 3049 } 3050 return 0; 3051 } 3052 3053 /*---------------------------------------------------------------------------- 3054 | Returns 1 if the single-precision floating-point value `a' is equal to 3055 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3056 | exception. 
The comparison is performed according to the IEC/IEEE Standard 3057 | for Binary Floating-Point Arithmetic. 3058 *----------------------------------------------------------------------------*/ 3059 3060 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3061 { 3062 a = float32_squash_input_denormal(a, status); 3063 b = float32_squash_input_denormal(b, status); 3064 3065 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3066 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3067 ) { 3068 if (float32_is_signaling_nan(a, status) 3069 || float32_is_signaling_nan(b, status)) { 3070 float_raise(float_flag_invalid, status); 3071 } 3072 return 0; 3073 } 3074 return ( float32_val(a) == float32_val(b) ) || 3075 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3076 } 3077 3078 /*---------------------------------------------------------------------------- 3079 | Returns 1 if the single-precision floating-point value `a' is less than or 3080 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3081 | cause an exception. Otherwise, the comparison is performed according to the 3082 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3083 *----------------------------------------------------------------------------*/ 3084 3085 int float32_le_quiet(float32 a, float32 b, float_status *status) 3086 { 3087 flag aSign, bSign; 3088 uint32_t av, bv; 3089 a = float32_squash_input_denormal(a, status); 3090 b = float32_squash_input_denormal(b, status); 3091 3092 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3093 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3094 ) { 3095 if (float32_is_signaling_nan(a, status) 3096 || float32_is_signaling_nan(b, status)) { 3097 float_raise(float_flag_invalid, status); 3098 } 3099 return 0; 3100 } 3101 aSign = extractFloat32Sign( a ); 3102 bSign = extractFloat32Sign( b ); 3103 av = float32_val(a); 3104 bv = float32_val(b); 3105 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3106 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3107 3108 } 3109 3110 /*---------------------------------------------------------------------------- 3111 | Returns 1 if the single-precision floating-point value `a' is less than 3112 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3113 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3114 | Standard for Binary Floating-Point Arithmetic. 
3115 *----------------------------------------------------------------------------*/ 3116 3117 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3118 { 3119 flag aSign, bSign; 3120 uint32_t av, bv; 3121 a = float32_squash_input_denormal(a, status); 3122 b = float32_squash_input_denormal(b, status); 3123 3124 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3125 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3126 ) { 3127 if (float32_is_signaling_nan(a, status) 3128 || float32_is_signaling_nan(b, status)) { 3129 float_raise(float_flag_invalid, status); 3130 } 3131 return 0; 3132 } 3133 aSign = extractFloat32Sign( a ); 3134 bSign = extractFloat32Sign( b ); 3135 av = float32_val(a); 3136 bv = float32_val(b); 3137 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3138 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3139 3140 } 3141 3142 /*---------------------------------------------------------------------------- 3143 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3144 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3145 | comparison is performed according to the IEC/IEEE Standard for Binary 3146 | Floating-Point Arithmetic. 
3147 *----------------------------------------------------------------------------*/ 3148 3149 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3150 { 3151 a = float32_squash_input_denormal(a, status); 3152 b = float32_squash_input_denormal(b, status); 3153 3154 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3155 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3156 ) { 3157 if (float32_is_signaling_nan(a, status) 3158 || float32_is_signaling_nan(b, status)) { 3159 float_raise(float_flag_invalid, status); 3160 } 3161 return 1; 3162 } 3163 return 0; 3164 } 3165 3166 /*---------------------------------------------------------------------------- 3167 | Returns the result of converting the double-precision floating-point value 3168 | `a' to the 32-bit two's complement integer format. The conversion is 3169 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3170 | Arithmetic---which means in particular that the conversion is rounded 3171 | according to the current rounding mode. If `a' is a NaN, the largest 3172 | positive integer is returned. Otherwise, if the conversion overflows, the 3173 | largest integer with the same sign as `a' is returned. 
3174 *----------------------------------------------------------------------------*/ 3175 3176 int32_t float64_to_int32(float64 a, float_status *status) 3177 { 3178 flag aSign; 3179 int aExp; 3180 int shiftCount; 3181 uint64_t aSig; 3182 a = float64_squash_input_denormal(a, status); 3183 3184 aSig = extractFloat64Frac( a ); 3185 aExp = extractFloat64Exp( a ); 3186 aSign = extractFloat64Sign( a ); 3187 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3188 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3189 shiftCount = 0x42C - aExp; 3190 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3191 return roundAndPackInt32(aSign, aSig, status); 3192 3193 } 3194 3195 /*---------------------------------------------------------------------------- 3196 | Returns the result of converting the double-precision floating-point value 3197 | `a' to the 32-bit two's complement integer format. The conversion is 3198 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3199 | Arithmetic, except that the conversion is always rounded toward zero. 3200 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3201 | the conversion overflows, the largest integer with the same sign as `a' is 3202 | returned. 
3203 *----------------------------------------------------------------------------*/ 3204 3205 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3206 { 3207 flag aSign; 3208 int aExp; 3209 int shiftCount; 3210 uint64_t aSig, savedASig; 3211 int32_t z; 3212 a = float64_squash_input_denormal(a, status); 3213 3214 aSig = extractFloat64Frac( a ); 3215 aExp = extractFloat64Exp( a ); 3216 aSign = extractFloat64Sign( a ); 3217 if ( 0x41E < aExp ) { 3218 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3219 goto invalid; 3220 } 3221 else if ( aExp < 0x3FF ) { 3222 if (aExp || aSig) { 3223 status->float_exception_flags |= float_flag_inexact; 3224 } 3225 return 0; 3226 } 3227 aSig |= LIT64( 0x0010000000000000 ); 3228 shiftCount = 0x433 - aExp; 3229 savedASig = aSig; 3230 aSig >>= shiftCount; 3231 z = aSig; 3232 if ( aSign ) z = - z; 3233 if ( ( z < 0 ) ^ aSign ) { 3234 invalid: 3235 float_raise(float_flag_invalid, status); 3236 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3237 } 3238 if ( ( aSig<<shiftCount ) != savedASig ) { 3239 status->float_exception_flags |= float_flag_inexact; 3240 } 3241 return z; 3242 3243 } 3244 3245 /*---------------------------------------------------------------------------- 3246 | Returns the result of converting the double-precision floating-point value 3247 | `a' to the 16-bit two's complement integer format. The conversion is 3248 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3249 | Arithmetic, except that the conversion is always rounded toward zero. 3250 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3251 | the conversion overflows, the largest integer with the same sign as `a' is 3252 | returned. 
3253 *----------------------------------------------------------------------------*/ 3254 3255 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3256 { 3257 flag aSign; 3258 int aExp; 3259 int shiftCount; 3260 uint64_t aSig, savedASig; 3261 int32_t z; 3262 3263 aSig = extractFloat64Frac( a ); 3264 aExp = extractFloat64Exp( a ); 3265 aSign = extractFloat64Sign( a ); 3266 if ( 0x40E < aExp ) { 3267 if ( ( aExp == 0x7FF ) && aSig ) { 3268 aSign = 0; 3269 } 3270 goto invalid; 3271 } 3272 else if ( aExp < 0x3FF ) { 3273 if ( aExp || aSig ) { 3274 status->float_exception_flags |= float_flag_inexact; 3275 } 3276 return 0; 3277 } 3278 aSig |= LIT64( 0x0010000000000000 ); 3279 shiftCount = 0x433 - aExp; 3280 savedASig = aSig; 3281 aSig >>= shiftCount; 3282 z = aSig; 3283 if ( aSign ) { 3284 z = - z; 3285 } 3286 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3287 invalid: 3288 float_raise(float_flag_invalid, status); 3289 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3290 } 3291 if ( ( aSig<<shiftCount ) != savedASig ) { 3292 status->float_exception_flags |= float_flag_inexact; 3293 } 3294 return z; 3295 } 3296 3297 /*---------------------------------------------------------------------------- 3298 | Returns the result of converting the double-precision floating-point value 3299 | `a' to the 64-bit two's complement integer format. The conversion is 3300 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3301 | Arithmetic---which means in particular that the conversion is rounded 3302 | according to the current rounding mode. If `a' is a NaN, the largest 3303 | positive integer is returned. Otherwise, if the conversion overflows, the 3304 | largest integer with the same sign as `a' is returned. 
3305 *----------------------------------------------------------------------------*/ 3306 3307 int64_t float64_to_int64(float64 a, float_status *status) 3308 { 3309 flag aSign; 3310 int aExp; 3311 int shiftCount; 3312 uint64_t aSig, aSigExtra; 3313 a = float64_squash_input_denormal(a, status); 3314 3315 aSig = extractFloat64Frac( a ); 3316 aExp = extractFloat64Exp( a ); 3317 aSign = extractFloat64Sign( a ); 3318 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3319 shiftCount = 0x433 - aExp; 3320 if ( shiftCount <= 0 ) { 3321 if ( 0x43E < aExp ) { 3322 float_raise(float_flag_invalid, status); 3323 if ( ! aSign 3324 || ( ( aExp == 0x7FF ) 3325 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3326 ) { 3327 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3328 } 3329 return (int64_t) LIT64( 0x8000000000000000 ); 3330 } 3331 aSigExtra = 0; 3332 aSig <<= - shiftCount; 3333 } 3334 else { 3335 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3336 } 3337 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3338 3339 } 3340 3341 /*---------------------------------------------------------------------------- 3342 | Returns the result of converting the double-precision floating-point value 3343 | `a' to the 64-bit two's complement integer format. The conversion is 3344 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3345 | Arithmetic, except that the conversion is always rounded toward zero. 3346 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3347 | the conversion overflows, the largest integer with the same sign as `a' is 3348 | returned. 
3349 *----------------------------------------------------------------------------*/ 3350 3351 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3352 { 3353 flag aSign; 3354 int aExp; 3355 int shiftCount; 3356 uint64_t aSig; 3357 int64_t z; 3358 a = float64_squash_input_denormal(a, status); 3359 3360 aSig = extractFloat64Frac( a ); 3361 aExp = extractFloat64Exp( a ); 3362 aSign = extractFloat64Sign( a ); 3363 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3364 shiftCount = aExp - 0x433; 3365 if ( 0 <= shiftCount ) { 3366 if ( 0x43E <= aExp ) { 3367 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3368 float_raise(float_flag_invalid, status); 3369 if ( ! aSign 3370 || ( ( aExp == 0x7FF ) 3371 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3372 ) { 3373 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3374 } 3375 } 3376 return (int64_t) LIT64( 0x8000000000000000 ); 3377 } 3378 z = aSig<<shiftCount; 3379 } 3380 else { 3381 if ( aExp < 0x3FE ) { 3382 if (aExp | aSig) { 3383 status->float_exception_flags |= float_flag_inexact; 3384 } 3385 return 0; 3386 } 3387 z = aSig>>( - shiftCount ); 3388 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3389 status->float_exception_flags |= float_flag_inexact; 3390 } 3391 } 3392 if ( aSign ) z = - z; 3393 return z; 3394 3395 } 3396 3397 /*---------------------------------------------------------------------------- 3398 | Returns the result of converting the double-precision floating-point value 3399 | `a' to the single-precision floating-point format. The conversion is 3400 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3401 | Arithmetic. 
3402 *----------------------------------------------------------------------------*/ 3403 3404 float32 float64_to_float32(float64 a, float_status *status) 3405 { 3406 flag aSign; 3407 int aExp; 3408 uint64_t aSig; 3409 uint32_t zSig; 3410 a = float64_squash_input_denormal(a, status); 3411 3412 aSig = extractFloat64Frac( a ); 3413 aExp = extractFloat64Exp( a ); 3414 aSign = extractFloat64Sign( a ); 3415 if ( aExp == 0x7FF ) { 3416 if (aSig) { 3417 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3418 } 3419 return packFloat32( aSign, 0xFF, 0 ); 3420 } 3421 shift64RightJamming( aSig, 22, &aSig ); 3422 zSig = aSig; 3423 if ( aExp || zSig ) { 3424 zSig |= 0x40000000; 3425 aExp -= 0x381; 3426 } 3427 return roundAndPackFloat32(aSign, aExp, zSig, status); 3428 3429 } 3430 3431 3432 /*---------------------------------------------------------------------------- 3433 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3434 | half-precision floating-point value, returning the result. After being 3435 | shifted into the proper positions, the three fields are simply added 3436 | together to form the result. This means that any integer portion of `zSig' 3437 | will be added into the exponent. Since a properly normalized significand 3438 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3439 | than the desired result exponent whenever `zSig' is a complete, normalized 3440 | significand. 
3441 *----------------------------------------------------------------------------*/ 3442 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3443 { 3444 return make_float16( 3445 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3446 } 3447 3448 /*---------------------------------------------------------------------------- 3449 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3450 | and significand `zSig', and returns the proper half-precision floating- 3451 | point value corresponding to the abstract input. Ordinarily, the abstract 3452 | value is simply rounded and packed into the half-precision format, with 3453 | the inexact exception raised if the abstract input cannot be represented 3454 | exactly. However, if the abstract value is too large, the overflow and 3455 | inexact exceptions are raised and an infinity or maximal finite value is 3456 | returned. If the abstract value is too small, the input value is rounded to 3457 | a subnormal number, and the underflow and inexact exceptions are raised if 3458 | the abstract input cannot be represented exactly as a subnormal half- 3459 | precision floating-point number. 3460 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3461 | ARM-style "alternative representation", which omits the NaN and Inf 3462 | encodings in order to raise the maximum representable exponent by one. 3463 | The input significand `zSig' has its binary point between bits 22 3464 | and 23, which is 13 bits to the left of the usual location. This shifted 3465 | significand must be normalized or smaller. If `zSig' is not normalized, 3466 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3467 | and it must not require rounding. In the usual case that `zSig' is 3468 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3469 | Note the slightly odd position of the binary point in zSig compared with the 3470 | other roundAndPackFloat functions. This should probably be fixed if we 3471 | need to implement more float16 routines than just conversion. 3472 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3473 | Binary Floating-Point Arithmetic. 3474 *----------------------------------------------------------------------------*/ 3475 3476 static float16 roundAndPackFloat16(flag zSign, int zExp, 3477 uint32_t zSig, flag ieee, 3478 float_status *status) 3479 { 3480 int maxexp = ieee ? 29 : 30; 3481 uint32_t mask; 3482 uint32_t increment; 3483 bool rounding_bumps_exp; 3484 bool is_tiny = false; 3485 3486 /* Calculate the mask of bits of the mantissa which are not 3487 * representable in half-precision and will be lost. 3488 */ 3489 if (zExp < 1) { 3490 /* Will be denormal in halfprec */ 3491 mask = 0x00ffffff; 3492 if (zExp >= -11) { 3493 mask >>= 11 + zExp; 3494 } 3495 } else { 3496 /* Normal number in halfprec */ 3497 mask = 0x00001fff; 3498 } 3499 3500 switch (status->float_rounding_mode) { 3501 case float_round_nearest_even: 3502 increment = (mask + 1) >> 1; 3503 if ((zSig & mask) == increment) { 3504 increment = zSig & (increment << 1); 3505 } 3506 break; 3507 case float_round_ties_away: 3508 increment = (mask + 1) >> 1; 3509 break; 3510 case float_round_up: 3511 increment = zSign ? 0 : mask; 3512 break; 3513 case float_round_down: 3514 increment = zSign ? 
mask : 0; 3515 break; 3516 default: /* round_to_zero */ 3517 increment = 0; 3518 break; 3519 } 3520 3521 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3522 3523 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3524 if (ieee) { 3525 float_raise(float_flag_overflow | float_flag_inexact, status); 3526 return packFloat16(zSign, 0x1f, 0); 3527 } else { 3528 float_raise(float_flag_invalid, status); 3529 return packFloat16(zSign, 0x1f, 0x3ff); 3530 } 3531 } 3532 3533 if (zExp < 0) { 3534 /* Note that flush-to-zero does not affect half-precision results */ 3535 is_tiny = 3536 (status->float_detect_tininess == float_tininess_before_rounding) 3537 || (zExp < -1) 3538 || (!rounding_bumps_exp); 3539 } 3540 if (zSig & mask) { 3541 float_raise(float_flag_inexact, status); 3542 if (is_tiny) { 3543 float_raise(float_flag_underflow, status); 3544 } 3545 } 3546 3547 zSig += increment; 3548 if (rounding_bumps_exp) { 3549 zSig >>= 1; 3550 zExp++; 3551 } 3552 3553 if (zExp < -10) { 3554 return packFloat16(zSign, 0, 0); 3555 } 3556 if (zExp < 0) { 3557 zSig >>= -zExp; 3558 zExp = 0; 3559 } 3560 return packFloat16(zSign, zExp, zSig >> 13); 3561 } 3562 3563 /*---------------------------------------------------------------------------- 3564 | If `a' is denormal and we are in flush-to-zero mode then set the 3565 | input-denormal exception and return zero. Otherwise just return the value. 
3566 *----------------------------------------------------------------------------*/ 3567 float16 float16_squash_input_denormal(float16 a, float_status *status) 3568 { 3569 if (status->flush_inputs_to_zero) { 3570 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3571 float_raise(float_flag_input_denormal, status); 3572 return make_float16(float16_val(a) & 0x8000); 3573 } 3574 } 3575 return a; 3576 } 3577 3578 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3579 uint32_t *zSigPtr) 3580 { 3581 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3582 *zSigPtr = aSig << shiftCount; 3583 *zExpPtr = 1 - shiftCount; 3584 } 3585 3586 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3587 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3588 3589 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3590 { 3591 flag aSign; 3592 int aExp; 3593 uint32_t aSig; 3594 3595 aSign = extractFloat16Sign(a); 3596 aExp = extractFloat16Exp(a); 3597 aSig = extractFloat16Frac(a); 3598 3599 if (aExp == 0x1f && ieee) { 3600 if (aSig) { 3601 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3602 } 3603 return packFloat32(aSign, 0xff, 0); 3604 } 3605 if (aExp == 0) { 3606 if (aSig == 0) { 3607 return packFloat32(aSign, 0, 0); 3608 } 3609 3610 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3611 aExp--; 3612 } 3613 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3614 } 3615 3616 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3617 { 3618 flag aSign; 3619 int aExp; 3620 uint32_t aSig; 3621 3622 a = float32_squash_input_denormal(a, status); 3623 3624 aSig = extractFloat32Frac( a ); 3625 aExp = extractFloat32Exp( a ); 3626 aSign = extractFloat32Sign( a ); 3627 if ( aExp == 0xFF ) { 3628 if (aSig) { 3629 /* Input is a NaN */ 3630 if (!ieee) { 3631 float_raise(float_flag_invalid, status); 3632 return packFloat16(aSign, 0, 0); 3633 } 3634 return 
commonNaNToFloat16( 3635 float32ToCommonNaN(a, status), status); 3636 } 3637 /* Infinity */ 3638 if (!ieee) { 3639 float_raise(float_flag_invalid, status); 3640 return packFloat16(aSign, 0x1f, 0x3ff); 3641 } 3642 return packFloat16(aSign, 0x1f, 0); 3643 } 3644 if (aExp == 0 && aSig == 0) { 3645 return packFloat16(aSign, 0, 0); 3646 } 3647 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3648 * even if the input is denormal; however this is harmless because 3649 * the largest possible single-precision denormal is still smaller 3650 * than the smallest representable half-precision denormal, and so we 3651 * will end up ignoring aSig and returning via the "always return zero" 3652 * codepath. 3653 */ 3654 aSig |= 0x00800000; 3655 aExp -= 0x71; 3656 3657 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3658 } 3659 3660 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3661 { 3662 flag aSign; 3663 int aExp; 3664 uint32_t aSig; 3665 3666 aSign = extractFloat16Sign(a); 3667 aExp = extractFloat16Exp(a); 3668 aSig = extractFloat16Frac(a); 3669 3670 if (aExp == 0x1f && ieee) { 3671 if (aSig) { 3672 return commonNaNToFloat64( 3673 float16ToCommonNaN(a, status), status); 3674 } 3675 return packFloat64(aSign, 0x7ff, 0); 3676 } 3677 if (aExp == 0) { 3678 if (aSig == 0) { 3679 return packFloat64(aSign, 0, 0); 3680 } 3681 3682 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3683 aExp--; 3684 } 3685 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3686 } 3687 3688 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3689 { 3690 flag aSign; 3691 int aExp; 3692 uint64_t aSig; 3693 uint32_t zSig; 3694 3695 a = float64_squash_input_denormal(a, status); 3696 3697 aSig = extractFloat64Frac(a); 3698 aExp = extractFloat64Exp(a); 3699 aSign = extractFloat64Sign(a); 3700 if (aExp == 0x7FF) { 3701 if (aSig) { 3702 /* Input is a NaN */ 3703 if (!ieee) { 3704 float_raise(float_flag_invalid, status); 3705 return 
packFloat16(aSign, 0, 0); 3706 } 3707 return commonNaNToFloat16( 3708 float64ToCommonNaN(a, status), status); 3709 } 3710 /* Infinity */ 3711 if (!ieee) { 3712 float_raise(float_flag_invalid, status); 3713 return packFloat16(aSign, 0x1f, 0x3ff); 3714 } 3715 return packFloat16(aSign, 0x1f, 0); 3716 } 3717 shift64RightJamming(aSig, 29, &aSig); 3718 zSig = aSig; 3719 if (aExp == 0 && zSig == 0) { 3720 return packFloat16(aSign, 0, 0); 3721 } 3722 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3723 * even if the input is denormal; however this is harmless because 3724 * the largest possible single-precision denormal is still smaller 3725 * than the smallest representable half-precision denormal, and so we 3726 * will end up ignoring aSig and returning via the "always return zero" 3727 * codepath. 3728 */ 3729 zSig |= 0x00800000; 3730 aExp -= 0x3F1; 3731 3732 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3733 } 3734 3735 /*---------------------------------------------------------------------------- 3736 | Returns the result of converting the double-precision floating-point value 3737 | `a' to the extended double-precision floating-point format. The conversion 3738 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3739 | Arithmetic. 
3740 *----------------------------------------------------------------------------*/ 3741 3742 floatx80 float64_to_floatx80(float64 a, float_status *status) 3743 { 3744 flag aSign; 3745 int aExp; 3746 uint64_t aSig; 3747 3748 a = float64_squash_input_denormal(a, status); 3749 aSig = extractFloat64Frac( a ); 3750 aExp = extractFloat64Exp( a ); 3751 aSign = extractFloat64Sign( a ); 3752 if ( aExp == 0x7FF ) { 3753 if (aSig) { 3754 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3755 } 3756 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3757 } 3758 if ( aExp == 0 ) { 3759 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3760 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3761 } 3762 return 3763 packFloatx80( 3764 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3765 3766 } 3767 3768 /*---------------------------------------------------------------------------- 3769 | Returns the result of converting the double-precision floating-point value 3770 | `a' to the quadruple-precision floating-point format. The conversion is 3771 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3772 | Arithmetic. 
3773 *----------------------------------------------------------------------------*/ 3774 3775 float128 float64_to_float128(float64 a, float_status *status) 3776 { 3777 flag aSign; 3778 int aExp; 3779 uint64_t aSig, zSig0, zSig1; 3780 3781 a = float64_squash_input_denormal(a, status); 3782 aSig = extractFloat64Frac( a ); 3783 aExp = extractFloat64Exp( a ); 3784 aSign = extractFloat64Sign( a ); 3785 if ( aExp == 0x7FF ) { 3786 if (aSig) { 3787 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3788 } 3789 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3790 } 3791 if ( aExp == 0 ) { 3792 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3793 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3794 --aExp; 3795 } 3796 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3797 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3798 3799 } 3800 3801 /*---------------------------------------------------------------------------- 3802 | Rounds the double-precision floating-point value `a' to an integer, and 3803 | returns the result as a double-precision floating-point value. The 3804 | operation is performed according to the IEC/IEEE Standard for Binary 3805 | Floating-Point Arithmetic. 
3806 *----------------------------------------------------------------------------*/ 3807 3808 float64 float64_round_to_int(float64 a, float_status *status) 3809 { 3810 flag aSign; 3811 int aExp; 3812 uint64_t lastBitMask, roundBitsMask; 3813 uint64_t z; 3814 a = float64_squash_input_denormal(a, status); 3815 3816 aExp = extractFloat64Exp( a ); 3817 if ( 0x433 <= aExp ) { 3818 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3819 return propagateFloat64NaN(a, a, status); 3820 } 3821 return a; 3822 } 3823 if ( aExp < 0x3FF ) { 3824 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3825 status->float_exception_flags |= float_flag_inexact; 3826 aSign = extractFloat64Sign( a ); 3827 switch (status->float_rounding_mode) { 3828 case float_round_nearest_even: 3829 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3830 return packFloat64( aSign, 0x3FF, 0 ); 3831 } 3832 break; 3833 case float_round_ties_away: 3834 if (aExp == 0x3FE) { 3835 return packFloat64(aSign, 0x3ff, 0); 3836 } 3837 break; 3838 case float_round_down: 3839 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3840 case float_round_up: 3841 return make_float64( 3842 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3843 } 3844 return packFloat64( aSign, 0, 0 ); 3845 } 3846 lastBitMask = 1; 3847 lastBitMask <<= 0x433 - aExp; 3848 roundBitsMask = lastBitMask - 1; 3849 z = float64_val(a); 3850 switch (status->float_rounding_mode) { 3851 case float_round_nearest_even: 3852 z += lastBitMask >> 1; 3853 if ((z & roundBitsMask) == 0) { 3854 z &= ~lastBitMask; 3855 } 3856 break; 3857 case float_round_ties_away: 3858 z += lastBitMask >> 1; 3859 break; 3860 case float_round_to_zero: 3861 break; 3862 case float_round_up: 3863 if (!extractFloat64Sign(make_float64(z))) { 3864 z += roundBitsMask; 3865 } 3866 break; 3867 case float_round_down: 3868 if (extractFloat64Sign(make_float64(z))) { 3869 z += roundBitsMask; 3870 } 3871 break; 3872 default: 3873 abort(); 3874 } 3875 z &= ~ roundBitsMask; 3876 if (z != float64_val(a)) { 3877 status->float_exception_flags |= float_flag_inexact; 3878 } 3879 return make_float64(z); 3880 3881 } 3882 3883 float64 float64_trunc_to_int(float64 a, float_status *status) 3884 { 3885 int oldmode; 3886 float64 res; 3887 oldmode = status->float_rounding_mode; 3888 status->float_rounding_mode = float_round_to_zero; 3889 res = float64_round_to_int(a, status); 3890 status->float_rounding_mode = oldmode; 3891 return res; 3892 } 3893 3894 /*---------------------------------------------------------------------------- 3895 | Returns the result of adding the absolute values of the double-precision 3896 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3897 | before being returned. `zSign' is ignored if the result is a NaN. 3898 | The addition is performed according to the IEC/IEEE Standard for Binary 3899 | Floating-Point Arithmetic. 
3900 *----------------------------------------------------------------------------*/ 3901 3902 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3903 float_status *status) 3904 { 3905 int aExp, bExp, zExp; 3906 uint64_t aSig, bSig, zSig; 3907 int expDiff; 3908 3909 aSig = extractFloat64Frac( a ); 3910 aExp = extractFloat64Exp( a ); 3911 bSig = extractFloat64Frac( b ); 3912 bExp = extractFloat64Exp( b ); 3913 expDiff = aExp - bExp; 3914 aSig <<= 9; 3915 bSig <<= 9; 3916 if ( 0 < expDiff ) { 3917 if ( aExp == 0x7FF ) { 3918 if (aSig) { 3919 return propagateFloat64NaN(a, b, status); 3920 } 3921 return a; 3922 } 3923 if ( bExp == 0 ) { 3924 --expDiff; 3925 } 3926 else { 3927 bSig |= LIT64( 0x2000000000000000 ); 3928 } 3929 shift64RightJamming( bSig, expDiff, &bSig ); 3930 zExp = aExp; 3931 } 3932 else if ( expDiff < 0 ) { 3933 if ( bExp == 0x7FF ) { 3934 if (bSig) { 3935 return propagateFloat64NaN(a, b, status); 3936 } 3937 return packFloat64( zSign, 0x7FF, 0 ); 3938 } 3939 if ( aExp == 0 ) { 3940 ++expDiff; 3941 } 3942 else { 3943 aSig |= LIT64( 0x2000000000000000 ); 3944 } 3945 shift64RightJamming( aSig, - expDiff, &aSig ); 3946 zExp = bExp; 3947 } 3948 else { 3949 if ( aExp == 0x7FF ) { 3950 if (aSig | bSig) { 3951 return propagateFloat64NaN(a, b, status); 3952 } 3953 return a; 3954 } 3955 if ( aExp == 0 ) { 3956 if (status->flush_to_zero) { 3957 if (aSig | bSig) { 3958 float_raise(float_flag_output_denormal, status); 3959 } 3960 return packFloat64(zSign, 0, 0); 3961 } 3962 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3963 } 3964 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3965 zExp = aExp; 3966 goto roundAndPack; 3967 } 3968 aSig |= LIT64( 0x2000000000000000 ); 3969 zSig = ( aSig + bSig )<<1; 3970 --zExp; 3971 if ( (int64_t) zSig < 0 ) { 3972 zSig = aSig + bSig; 3973 ++zExp; 3974 } 3975 roundAndPack: 3976 return roundAndPackFloat64(zSign, zExp, zSig, status); 3977 3978 } 3979 3980 
/*---------------------------------------------------------------------------- 3981 | Returns the result of subtracting the absolute values of the double- 3982 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3983 | difference is negated before being returned. `zSign' is ignored if the 3984 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3985 | Standard for Binary Floating-Point Arithmetic. 3986 *----------------------------------------------------------------------------*/ 3987 3988 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, 3989 float_status *status) 3990 { 3991 int aExp, bExp, zExp; 3992 uint64_t aSig, bSig, zSig; 3993 int expDiff; 3994 3995 aSig = extractFloat64Frac( a ); 3996 aExp = extractFloat64Exp( a ); 3997 bSig = extractFloat64Frac( b ); 3998 bExp = extractFloat64Exp( b ); 3999 expDiff = aExp - bExp; 4000 aSig <<= 10; 4001 bSig <<= 10; 4002 if ( 0 < expDiff ) goto aExpBigger; 4003 if ( expDiff < 0 ) goto bExpBigger; 4004 if ( aExp == 0x7FF ) { 4005 if (aSig | bSig) { 4006 return propagateFloat64NaN(a, b, status); 4007 } 4008 float_raise(float_flag_invalid, status); 4009 return float64_default_nan(status); 4010 } 4011 if ( aExp == 0 ) { 4012 aExp = 1; 4013 bExp = 1; 4014 } 4015 if ( bSig < aSig ) goto aBigger; 4016 if ( aSig < bSig ) goto bBigger; 4017 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); 4018 bExpBigger: 4019 if ( bExp == 0x7FF ) { 4020 if (bSig) { 4021 return propagateFloat64NaN(a, b, status); 4022 } 4023 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 4024 } 4025 if ( aExp == 0 ) { 4026 ++expDiff; 4027 } 4028 else { 4029 aSig |= LIT64( 0x4000000000000000 ); 4030 } 4031 shift64RightJamming( aSig, - expDiff, &aSig ); 4032 bSig |= LIT64( 0x4000000000000000 ); 4033 bBigger: 4034 zSig = bSig - aSig; 4035 zExp = bExp; 4036 zSign ^= 1; 4037 goto normalizeRoundAndPack; 4038 aExpBigger: 4039 if ( aExp == 0x7FF ) { 4040 if (aSig) { 4041 return propagateFloat64NaN(a, 
b, status); 4042 } 4043 return a; 4044 } 4045 if ( bExp == 0 ) { 4046 --expDiff; 4047 } 4048 else { 4049 bSig |= LIT64( 0x4000000000000000 ); 4050 } 4051 shift64RightJamming( bSig, expDiff, &bSig ); 4052 aSig |= LIT64( 0x4000000000000000 ); 4053 aBigger: 4054 zSig = aSig - bSig; 4055 zExp = aExp; 4056 normalizeRoundAndPack: 4057 --zExp; 4058 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); 4059 4060 } 4061 4062 /*---------------------------------------------------------------------------- 4063 | Returns the result of adding the double-precision floating-point values `a' 4064 | and `b'. The operation is performed according to the IEC/IEEE Standard for 4065 | Binary Floating-Point Arithmetic. 4066 *----------------------------------------------------------------------------*/ 4067 4068 float64 float64_add(float64 a, float64 b, float_status *status) 4069 { 4070 flag aSign, bSign; 4071 a = float64_squash_input_denormal(a, status); 4072 b = float64_squash_input_denormal(b, status); 4073 4074 aSign = extractFloat64Sign( a ); 4075 bSign = extractFloat64Sign( b ); 4076 if ( aSign == bSign ) { 4077 return addFloat64Sigs(a, b, aSign, status); 4078 } 4079 else { 4080 return subFloat64Sigs(a, b, aSign, status); 4081 } 4082 4083 } 4084 4085 /*---------------------------------------------------------------------------- 4086 | Returns the result of subtracting the double-precision floating-point values 4087 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4088 | for Binary Floating-Point Arithmetic. 
4089 *----------------------------------------------------------------------------*/ 4090 4091 float64 float64_sub(float64 a, float64 b, float_status *status) 4092 { 4093 flag aSign, bSign; 4094 a = float64_squash_input_denormal(a, status); 4095 b = float64_squash_input_denormal(b, status); 4096 4097 aSign = extractFloat64Sign( a ); 4098 bSign = extractFloat64Sign( b ); 4099 if ( aSign == bSign ) { 4100 return subFloat64Sigs(a, b, aSign, status); 4101 } 4102 else { 4103 return addFloat64Sigs(a, b, aSign, status); 4104 } 4105 4106 } 4107 4108 /*---------------------------------------------------------------------------- 4109 | Returns the result of multiplying the double-precision floating-point values 4110 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4111 | for Binary Floating-Point Arithmetic. 4112 *----------------------------------------------------------------------------*/ 4113 4114 float64 float64_mul(float64 a, float64 b, float_status *status) 4115 { 4116 flag aSign, bSign, zSign; 4117 int aExp, bExp, zExp; 4118 uint64_t aSig, bSig, zSig0, zSig1; 4119 4120 a = float64_squash_input_denormal(a, status); 4121 b = float64_squash_input_denormal(b, status); 4122 4123 aSig = extractFloat64Frac( a ); 4124 aExp = extractFloat64Exp( a ); 4125 aSign = extractFloat64Sign( a ); 4126 bSig = extractFloat64Frac( b ); 4127 bExp = extractFloat64Exp( b ); 4128 bSign = extractFloat64Sign( b ); 4129 zSign = aSign ^ bSign; 4130 if ( aExp == 0x7FF ) { 4131 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4132 return propagateFloat64NaN(a, b, status); 4133 } 4134 if ( ( bExp | bSig ) == 0 ) { 4135 float_raise(float_flag_invalid, status); 4136 return float64_default_nan(status); 4137 } 4138 return packFloat64( zSign, 0x7FF, 0 ); 4139 } 4140 if ( bExp == 0x7FF ) { 4141 if (bSig) { 4142 return propagateFloat64NaN(a, b, status); 4143 } 4144 if ( ( aExp | aSig ) == 0 ) { 4145 float_raise(float_flag_invalid, status); 4146 return float64_default_nan(status); 
4147 } 4148 return packFloat64( zSign, 0x7FF, 0 ); 4149 } 4150 if ( aExp == 0 ) { 4151 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4152 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4153 } 4154 if ( bExp == 0 ) { 4155 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4156 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4157 } 4158 zExp = aExp + bExp - 0x3FF; 4159 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4160 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4161 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4162 zSig0 |= ( zSig1 != 0 ); 4163 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4164 zSig0 <<= 1; 4165 --zExp; 4166 } 4167 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4168 4169 } 4170 4171 /*---------------------------------------------------------------------------- 4172 | Returns the result of dividing the double-precision floating-point value `a' 4173 | by the corresponding value `b'. The operation is performed according to 4174 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4175 *----------------------------------------------------------------------------*/ 4176 4177 float64 float64_div(float64 a, float64 b, float_status *status) 4178 { 4179 flag aSign, bSign, zSign; 4180 int aExp, bExp, zExp; 4181 uint64_t aSig, bSig, zSig; 4182 uint64_t rem0, rem1; 4183 uint64_t term0, term1; 4184 a = float64_squash_input_denormal(a, status); 4185 b = float64_squash_input_denormal(b, status); 4186 4187 aSig = extractFloat64Frac( a ); 4188 aExp = extractFloat64Exp( a ); 4189 aSign = extractFloat64Sign( a ); 4190 bSig = extractFloat64Frac( b ); 4191 bExp = extractFloat64Exp( b ); 4192 bSign = extractFloat64Sign( b ); 4193 zSign = aSign ^ bSign; 4194 if ( aExp == 0x7FF ) { 4195 if (aSig) { 4196 return propagateFloat64NaN(a, b, status); 4197 } 4198 if ( bExp == 0x7FF ) { 4199 if (bSig) { 4200 return propagateFloat64NaN(a, b, status); 4201 } 4202 float_raise(float_flag_invalid, status); 4203 return float64_default_nan(status); 4204 } 4205 return packFloat64( zSign, 0x7FF, 0 ); 4206 } 4207 if ( bExp == 0x7FF ) { 4208 if (bSig) { 4209 return propagateFloat64NaN(a, b, status); 4210 } 4211 return packFloat64( zSign, 0, 0 ); 4212 } 4213 if ( bExp == 0 ) { 4214 if ( bSig == 0 ) { 4215 if ( ( aExp | aSig ) == 0 ) { 4216 float_raise(float_flag_invalid, status); 4217 return float64_default_nan(status); 4218 } 4219 float_raise(float_flag_divbyzero, status); 4220 return packFloat64( zSign, 0x7FF, 0 ); 4221 } 4222 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4223 } 4224 if ( aExp == 0 ) { 4225 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4226 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4227 } 4228 zExp = aExp - bExp + 0x3FD; 4229 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4230 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4231 if ( bSig <= ( aSig + aSig ) ) { 4232 aSig >>= 1; 4233 ++zExp; 4234 } 4235 zSig = estimateDiv128To64( aSig, 0, bSig ); 4236 if ( ( zSig & 0x1FF ) <= 2 ) { 4237 mul64To128( bSig, zSig, &term0, &term1 ); 4238 sub128( 
aSig, 0, term0, term1, &rem0, &rem1 ); 4239 while ( (int64_t) rem0 < 0 ) { 4240 --zSig; 4241 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4242 } 4243 zSig |= ( rem1 != 0 ); 4244 } 4245 return roundAndPackFloat64(zSign, zExp, zSig, status); 4246 4247 } 4248 4249 /*---------------------------------------------------------------------------- 4250 | Returns the remainder of the double-precision floating-point value `a' 4251 | with respect to the corresponding value `b'. The operation is performed 4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4253 *----------------------------------------------------------------------------*/ 4254 4255 float64 float64_rem(float64 a, float64 b, float_status *status) 4256 { 4257 flag aSign, zSign; 4258 int aExp, bExp, expDiff; 4259 uint64_t aSig, bSig; 4260 uint64_t q, alternateASig; 4261 int64_t sigMean; 4262 4263 a = float64_squash_input_denormal(a, status); 4264 b = float64_squash_input_denormal(b, status); 4265 aSig = extractFloat64Frac( a ); 4266 aExp = extractFloat64Exp( a ); 4267 aSign = extractFloat64Sign( a ); 4268 bSig = extractFloat64Frac( b ); 4269 bExp = extractFloat64Exp( b ); 4270 if ( aExp == 0x7FF ) { 4271 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4272 return propagateFloat64NaN(a, b, status); 4273 } 4274 float_raise(float_flag_invalid, status); 4275 return float64_default_nan(status); 4276 } 4277 if ( bExp == 0x7FF ) { 4278 if (bSig) { 4279 return propagateFloat64NaN(a, b, status); 4280 } 4281 return a; 4282 } 4283 if ( bExp == 0 ) { 4284 if ( bSig == 0 ) { 4285 float_raise(float_flag_invalid, status); 4286 return float64_default_nan(status); 4287 } 4288 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4289 } 4290 if ( aExp == 0 ) { 4291 if ( aSig == 0 ) return a; 4292 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4293 } 4294 expDiff = aExp - bExp; 4295 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4296 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4297 if ( expDiff < 0 
) { 4298 if ( expDiff < -1 ) return a; 4299 aSig >>= 1; 4300 } 4301 q = ( bSig <= aSig ); 4302 if ( q ) aSig -= bSig; 4303 expDiff -= 64; 4304 while ( 0 < expDiff ) { 4305 q = estimateDiv128To64( aSig, 0, bSig ); 4306 q = ( 2 < q ) ? q - 2 : 0; 4307 aSig = - ( ( bSig>>2 ) * q ); 4308 expDiff -= 62; 4309 } 4310 expDiff += 64; 4311 if ( 0 < expDiff ) { 4312 q = estimateDiv128To64( aSig, 0, bSig ); 4313 q = ( 2 < q ) ? q - 2 : 0; 4314 q >>= 64 - expDiff; 4315 bSig >>= 2; 4316 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4317 } 4318 else { 4319 aSig >>= 2; 4320 bSig >>= 2; 4321 } 4322 do { 4323 alternateASig = aSig; 4324 ++q; 4325 aSig -= bSig; 4326 } while ( 0 <= (int64_t) aSig ); 4327 sigMean = aSig + alternateASig; 4328 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4329 aSig = alternateASig; 4330 } 4331 zSign = ( (int64_t) aSig < 0 ); 4332 if ( zSign ) aSig = - aSig; 4333 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4334 4335 } 4336 4337 /*---------------------------------------------------------------------------- 4338 | Returns the result of multiplying the double-precision floating-point values 4339 | `a' and `b' then adding 'c', with no intermediate rounding step after the 4340 | multiplication. The operation is performed according to the IEC/IEEE 4341 | Standard for Binary Floating-Point Arithmetic 754-2008. 4342 | The flags argument allows the caller to select negation of the 4343 | addend, the intermediate product, or the final result. (The difference 4344 | between this and having the caller do a separate negation is that negating 4345 | externally will flip the sign bit on NaNs.) 
*----------------------------------------------------------------------------*/

float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    /* p* describe the intermediate product a * b */
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int shiftcount;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    c = float64_squash_input_denormal(c, status);
    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the product is (0 * inf) or (inf * 0) */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
    }

    /* No NaN operand: 0 * inf is an invalid operation */
    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    /* signflip is XORed into the sign of every result returned below */
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    /* NaNs were handled above, so cExp == 0x7ff means c is an infinity */
    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    /* Infinite product plus finite addend */
    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat64(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat64
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x0010000000000000ULL) << 10;
            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    /* Both factors are finite and non-zero from here on */
    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    /* If bit 62 of the high word is clear, shift the product up by one */
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            /* Reduce the product to 64 bits (in pSig1), jamming lost bits */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1, status);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
     * significand of the addend, with the explicit bit in position 126.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 126 */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            /* The sum carried into bit 127: shift it back down */
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            zExp--;
        }
        /* Reduce to 64 bits (in zSig1), jamming lost bits */
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig1, status);
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            /* p has the larger exponent: compute p - c */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* c has the larger exponent: compute c - p and flip the sign */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zExp = cExp;
            zSign ^= 1;
        } else {
            /* Equal exponents: subtract the smaller significand */
            zExp = pExp;
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat64(zSign, 0, 0);
            }
        }
        --zExp;
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
         * starting with the significand in a pair of uint64_t.
         */
        if (zSig0) {
            /* Move the leading 1 of the high word to bit 62 */
            shiftcount = countLeadingZeros64(zSig0) - 1;
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
            if (zSig1) {
                /* Remaining low-word bits become the sticky bit */
                zSig0 |= 1;
            }
            zExp -= shiftcount;
        } else {
            /* High word is empty: normalize from the low word */
            shiftcount = countLeadingZeros64(zSig1);
            if (shiftcount == 0) {
                /* Bit 63 set: shift right by one, keeping the lost bit sticky */
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
                zExp -= 63;
            } else {
                shiftcount--;
                zSig0 = zSig1 << shiftcount;
                zExp -= (shiftcount + 64);
            }
        }
        if (flags & float_muladd_halve_result) {
            zExp--;
        }
        return roundAndPackFloat64(zSign, zExp, zSig0, status);
    }
}

/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float64 float64_sqrt(float64 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if (aSig) {
            return propagateFloat64NaN(a, a, status);
        }
        /* +inf is returned unchanged; sqrt(-inf) is invalid */
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aSign ) {
        /* -0 is returned unchanged; any other negative input is invalid */
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float64_zero;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent, then rebias */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* Start from a 32-bit estimate, then refine it with one division */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    aSig <<= 9 - ( aExp & 1 );
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
    if ( ( zSig & 0x1FF ) <= 5 ) {
        /* Low bits are near a rounding boundary: compute the exact
         * remainder aSig - zSig^2 and step zSig down while it is negative.
         */
        doubleZSig = zSig<<1;
        mul64To128( zSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (int64_t) rem0 < 0 ) {
            --zSig;
            doubleZSig -= 2;
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
        }
        /* A non-zero remainder becomes the sticky bit */
        zSig |= ( ( rem0 | rem1 ) != 0 );
    }
    return roundAndPackFloat64(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary log of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
4642 *----------------------------------------------------------------------------*/ 4643 float64 float64_log2(float64 a, float_status *status) 4644 { 4645 flag aSign, zSign; 4646 int aExp; 4647 uint64_t aSig, aSig0, aSig1, zSig, i; 4648 a = float64_squash_input_denormal(a, status); 4649 4650 aSig = extractFloat64Frac( a ); 4651 aExp = extractFloat64Exp( a ); 4652 aSign = extractFloat64Sign( a ); 4653 4654 if ( aExp == 0 ) { 4655 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4656 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4657 } 4658 if ( aSign ) { 4659 float_raise(float_flag_invalid, status); 4660 return float64_default_nan(status); 4661 } 4662 if ( aExp == 0x7FF ) { 4663 if (aSig) { 4664 return propagateFloat64NaN(a, float64_zero, status); 4665 } 4666 return a; 4667 } 4668 4669 aExp -= 0x3FF; 4670 aSig |= LIT64( 0x0010000000000000 ); 4671 zSign = aExp < 0; 4672 zSig = (uint64_t)aExp << 52; 4673 for (i = 1LL << 51; i > 0; i >>= 1) { 4674 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4675 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4676 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4677 aSig >>= 1; 4678 zSig |= i; 4679 } 4680 } 4681 4682 if ( zSign ) 4683 zSig = -zSig; 4684 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4685 } 4686 4687 /*---------------------------------------------------------------------------- 4688 | Returns 1 if the double-precision floating-point value `a' is equal to the 4689 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4690 | if either operand is a NaN. Otherwise, the comparison is performed 4691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4692 *----------------------------------------------------------------------------*/ 4693 4694 int float64_eq(float64 a, float64 b, float_status *status) 4695 { 4696 uint64_t av, bv; 4697 a = float64_squash_input_denormal(a, status); 4698 b = float64_squash_input_denormal(b, status); 4699 4700 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4701 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4702 ) { 4703 float_raise(float_flag_invalid, status); 4704 return 0; 4705 } 4706 av = float64_val(a); 4707 bv = float64_val(b); 4708 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4709 4710 } 4711 4712 /*---------------------------------------------------------------------------- 4713 | Returns 1 if the double-precision floating-point value `a' is less than or 4714 | equal to the corresponding value `b', and 0 otherwise. The invalid 4715 | exception is raised if either operand is a NaN. The comparison is performed 4716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4717 *----------------------------------------------------------------------------*/ 4718 4719 int float64_le(float64 a, float64 b, float_status *status) 4720 { 4721 flag aSign, bSign; 4722 uint64_t av, bv; 4723 a = float64_squash_input_denormal(a, status); 4724 b = float64_squash_input_denormal(b, status); 4725 4726 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4727 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4728 ) { 4729 float_raise(float_flag_invalid, status); 4730 return 0; 4731 } 4732 aSign = extractFloat64Sign( a ); 4733 bSign = extractFloat64Sign( b ); 4734 av = float64_val(a); 4735 bv = float64_val(b); 4736 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4737 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4738 4739 } 4740 4741 /*---------------------------------------------------------------------------- 4742 | Returns 1 if the double-precision floating-point value `a' is less than 4743 | the corresponding value `b', and 0 otherwise. The invalid exception is 4744 | raised if either operand is a NaN. The comparison is performed according 4745 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4746 *----------------------------------------------------------------------------*/ 4747 4748 int float64_lt(float64 a, float64 b, float_status *status) 4749 { 4750 flag aSign, bSign; 4751 uint64_t av, bv; 4752 4753 a = float64_squash_input_denormal(a, status); 4754 b = float64_squash_input_denormal(b, status); 4755 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4756 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4757 ) { 4758 float_raise(float_flag_invalid, status); 4759 return 0; 4760 } 4761 aSign = extractFloat64Sign( a ); 4762 bSign = extractFloat64Sign( b ); 4763 av = float64_val(a); 4764 bv = float64_val(b); 4765 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4766 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4767 4768 } 4769 4770 /*---------------------------------------------------------------------------- 4771 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4772 | be compared, and 0 otherwise. The invalid exception is raised if either 4773 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4774 | Standard for Binary Floating-Point Arithmetic. 4775 *----------------------------------------------------------------------------*/ 4776 4777 int float64_unordered(float64 a, float64 b, float_status *status) 4778 { 4779 a = float64_squash_input_denormal(a, status); 4780 b = float64_squash_input_denormal(b, status); 4781 4782 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4783 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4784 ) { 4785 float_raise(float_flag_invalid, status); 4786 return 1; 4787 } 4788 return 0; 4789 } 4790 4791 /*---------------------------------------------------------------------------- 4792 | Returns 1 if the double-precision floating-point value `a' is equal to the 4793 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4794 | exception.The comparison is performed according to the IEC/IEEE Standard 4795 | for Binary Floating-Point Arithmetic. 4796 *----------------------------------------------------------------------------*/ 4797 4798 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4799 { 4800 uint64_t av, bv; 4801 a = float64_squash_input_denormal(a, status); 4802 b = float64_squash_input_denormal(b, status); 4803 4804 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4805 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4806 ) { 4807 if (float64_is_signaling_nan(a, status) 4808 || float64_is_signaling_nan(b, status)) { 4809 float_raise(float_flag_invalid, status); 4810 } 4811 return 0; 4812 } 4813 av = float64_val(a); 4814 bv = float64_val(b); 4815 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4816 4817 } 4818 4819 /*---------------------------------------------------------------------------- 4820 | Returns 1 if the double-precision floating-point value `a' is less than or 4821 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4822 | cause an exception. Otherwise, the comparison is performed according to the 4823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4824 *----------------------------------------------------------------------------*/ 4825 4826 int float64_le_quiet(float64 a, float64 b, float_status *status) 4827 { 4828 flag aSign, bSign; 4829 uint64_t av, bv; 4830 a = float64_squash_input_denormal(a, status); 4831 b = float64_squash_input_denormal(b, status); 4832 4833 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4834 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4835 ) { 4836 if (float64_is_signaling_nan(a, status) 4837 || float64_is_signaling_nan(b, status)) { 4838 float_raise(float_flag_invalid, status); 4839 } 4840 return 0; 4841 } 4842 aSign = extractFloat64Sign( a ); 4843 bSign = extractFloat64Sign( b ); 4844 av = float64_val(a); 4845 bv = float64_val(b); 4846 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4847 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4848 4849 } 4850 4851 /*---------------------------------------------------------------------------- 4852 | Returns 1 if the double-precision floating-point value `a' is less than 4853 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4854 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4855 | Standard for Binary Floating-Point Arithmetic. 
4856 *----------------------------------------------------------------------------*/ 4857 4858 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4859 { 4860 flag aSign, bSign; 4861 uint64_t av, bv; 4862 a = float64_squash_input_denormal(a, status); 4863 b = float64_squash_input_denormal(b, status); 4864 4865 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4866 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4867 ) { 4868 if (float64_is_signaling_nan(a, status) 4869 || float64_is_signaling_nan(b, status)) { 4870 float_raise(float_flag_invalid, status); 4871 } 4872 return 0; 4873 } 4874 aSign = extractFloat64Sign( a ); 4875 bSign = extractFloat64Sign( b ); 4876 av = float64_val(a); 4877 bv = float64_val(b); 4878 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4879 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4880 4881 } 4882 4883 /*---------------------------------------------------------------------------- 4884 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4885 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4886 | comparison is performed according to the IEC/IEEE Standard for Binary 4887 | Floating-Point Arithmetic. 
4888 *----------------------------------------------------------------------------*/ 4889 4890 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4891 { 4892 a = float64_squash_input_denormal(a, status); 4893 b = float64_squash_input_denormal(b, status); 4894 4895 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4896 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4897 ) { 4898 if (float64_is_signaling_nan(a, status) 4899 || float64_is_signaling_nan(b, status)) { 4900 float_raise(float_flag_invalid, status); 4901 } 4902 return 1; 4903 } 4904 return 0; 4905 } 4906 4907 /*---------------------------------------------------------------------------- 4908 | Returns the result of converting the extended double-precision floating- 4909 | point value `a' to the 32-bit two's complement integer format. The 4910 | conversion is performed according to the IEC/IEEE Standard for Binary 4911 | Floating-Point Arithmetic---which means in particular that the conversion 4912 | is rounded according to the current rounding mode. If `a' is a NaN, the 4913 | largest positive integer is returned. Otherwise, if the conversion 4914 | overflows, the largest integer with the same sign as `a' is returned. 
4915 *----------------------------------------------------------------------------*/ 4916 4917 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4918 { 4919 flag aSign; 4920 int32_t aExp, shiftCount; 4921 uint64_t aSig; 4922 4923 if (floatx80_invalid_encoding(a)) { 4924 float_raise(float_flag_invalid, status); 4925 return 1 << 31; 4926 } 4927 aSig = extractFloatx80Frac( a ); 4928 aExp = extractFloatx80Exp( a ); 4929 aSign = extractFloatx80Sign( a ); 4930 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4931 shiftCount = 0x4037 - aExp; 4932 if ( shiftCount <= 0 ) shiftCount = 1; 4933 shift64RightJamming( aSig, shiftCount, &aSig ); 4934 return roundAndPackInt32(aSign, aSig, status); 4935 4936 } 4937 4938 /*---------------------------------------------------------------------------- 4939 | Returns the result of converting the extended double-precision floating- 4940 | point value `a' to the 32-bit two's complement integer format. The 4941 | conversion is performed according to the IEC/IEEE Standard for Binary 4942 | Floating-Point Arithmetic, except that the conversion is always rounded 4943 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4944 | Otherwise, if the conversion overflows, the largest integer with the same 4945 | sign as `a' is returned. 
4946 *----------------------------------------------------------------------------*/ 4947 4948 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4949 { 4950 flag aSign; 4951 int32_t aExp, shiftCount; 4952 uint64_t aSig, savedASig; 4953 int32_t z; 4954 4955 if (floatx80_invalid_encoding(a)) { 4956 float_raise(float_flag_invalid, status); 4957 return 1 << 31; 4958 } 4959 aSig = extractFloatx80Frac( a ); 4960 aExp = extractFloatx80Exp( a ); 4961 aSign = extractFloatx80Sign( a ); 4962 if ( 0x401E < aExp ) { 4963 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4964 goto invalid; 4965 } 4966 else if ( aExp < 0x3FFF ) { 4967 if (aExp || aSig) { 4968 status->float_exception_flags |= float_flag_inexact; 4969 } 4970 return 0; 4971 } 4972 shiftCount = 0x403E - aExp; 4973 savedASig = aSig; 4974 aSig >>= shiftCount; 4975 z = aSig; 4976 if ( aSign ) z = - z; 4977 if ( ( z < 0 ) ^ aSign ) { 4978 invalid: 4979 float_raise(float_flag_invalid, status); 4980 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4981 } 4982 if ( ( aSig<<shiftCount ) != savedASig ) { 4983 status->float_exception_flags |= float_flag_inexact; 4984 } 4985 return z; 4986 4987 } 4988 4989 /*---------------------------------------------------------------------------- 4990 | Returns the result of converting the extended double-precision floating- 4991 | point value `a' to the 64-bit two's complement integer format. The 4992 | conversion is performed according to the IEC/IEEE Standard for Binary 4993 | Floating-Point Arithmetic---which means in particular that the conversion 4994 | is rounded according to the current rounding mode. If `a' is a NaN, 4995 | the largest positive integer is returned. Otherwise, if the conversion 4996 | overflows, the largest integer with the same sign as `a' is returned. 
4997 *----------------------------------------------------------------------------*/ 4998 4999 int64_t floatx80_to_int64(floatx80 a, float_status *status) 5000 { 5001 flag aSign; 5002 int32_t aExp, shiftCount; 5003 uint64_t aSig, aSigExtra; 5004 5005 if (floatx80_invalid_encoding(a)) { 5006 float_raise(float_flag_invalid, status); 5007 return 1ULL << 63; 5008 } 5009 aSig = extractFloatx80Frac( a ); 5010 aExp = extractFloatx80Exp( a ); 5011 aSign = extractFloatx80Sign( a ); 5012 shiftCount = 0x403E - aExp; 5013 if ( shiftCount <= 0 ) { 5014 if ( shiftCount ) { 5015 float_raise(float_flag_invalid, status); 5016 if ( ! aSign 5017 || ( ( aExp == 0x7FFF ) 5018 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 5019 ) { 5020 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5021 } 5022 return (int64_t) LIT64( 0x8000000000000000 ); 5023 } 5024 aSigExtra = 0; 5025 } 5026 else { 5027 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 5028 } 5029 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 5030 5031 } 5032 5033 /*---------------------------------------------------------------------------- 5034 | Returns the result of converting the extended double-precision floating- 5035 | point value `a' to the 64-bit two's complement integer format. The 5036 | conversion is performed according to the IEC/IEEE Standard for Binary 5037 | Floating-Point Arithmetic, except that the conversion is always rounded 5038 | toward zero. If `a' is a NaN, the largest positive integer is returned. 5039 | Otherwise, if the conversion overflows, the largest integer with the same 5040 | sign as `a' is returned. 
5041 *----------------------------------------------------------------------------*/ 5042 5043 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 5044 { 5045 flag aSign; 5046 int32_t aExp, shiftCount; 5047 uint64_t aSig; 5048 int64_t z; 5049 5050 if (floatx80_invalid_encoding(a)) { 5051 float_raise(float_flag_invalid, status); 5052 return 1ULL << 63; 5053 } 5054 aSig = extractFloatx80Frac( a ); 5055 aExp = extractFloatx80Exp( a ); 5056 aSign = extractFloatx80Sign( a ); 5057 shiftCount = aExp - 0x403E; 5058 if ( 0 <= shiftCount ) { 5059 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 5060 if ( ( a.high != 0xC03E ) || aSig ) { 5061 float_raise(float_flag_invalid, status); 5062 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 5063 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5064 } 5065 } 5066 return (int64_t) LIT64( 0x8000000000000000 ); 5067 } 5068 else if ( aExp < 0x3FFF ) { 5069 if (aExp | aSig) { 5070 status->float_exception_flags |= float_flag_inexact; 5071 } 5072 return 0; 5073 } 5074 z = aSig>>( - shiftCount ); 5075 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5076 status->float_exception_flags |= float_flag_inexact; 5077 } 5078 if ( aSign ) z = - z; 5079 return z; 5080 5081 } 5082 5083 /*---------------------------------------------------------------------------- 5084 | Returns the result of converting the extended double-precision floating- 5085 | point value `a' to the single-precision floating-point format. The 5086 | conversion is performed according to the IEC/IEEE Standard for Binary 5087 | Floating-Point Arithmetic. 
5088 *----------------------------------------------------------------------------*/ 5089 5090 float32 floatx80_to_float32(floatx80 a, float_status *status) 5091 { 5092 flag aSign; 5093 int32_t aExp; 5094 uint64_t aSig; 5095 5096 if (floatx80_invalid_encoding(a)) { 5097 float_raise(float_flag_invalid, status); 5098 return float32_default_nan(status); 5099 } 5100 aSig = extractFloatx80Frac( a ); 5101 aExp = extractFloatx80Exp( a ); 5102 aSign = extractFloatx80Sign( a ); 5103 if ( aExp == 0x7FFF ) { 5104 if ( (uint64_t) ( aSig<<1 ) ) { 5105 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5106 } 5107 return packFloat32( aSign, 0xFF, 0 ); 5108 } 5109 shift64RightJamming( aSig, 33, &aSig ); 5110 if ( aExp || aSig ) aExp -= 0x3F81; 5111 return roundAndPackFloat32(aSign, aExp, aSig, status); 5112 5113 } 5114 5115 /*---------------------------------------------------------------------------- 5116 | Returns the result of converting the extended double-precision floating- 5117 | point value `a' to the double-precision floating-point format. The 5118 | conversion is performed according to the IEC/IEEE Standard for Binary 5119 | Floating-Point Arithmetic. 
5120 *----------------------------------------------------------------------------*/ 5121 5122 float64 floatx80_to_float64(floatx80 a, float_status *status) 5123 { 5124 flag aSign; 5125 int32_t aExp; 5126 uint64_t aSig, zSig; 5127 5128 if (floatx80_invalid_encoding(a)) { 5129 float_raise(float_flag_invalid, status); 5130 return float64_default_nan(status); 5131 } 5132 aSig = extractFloatx80Frac( a ); 5133 aExp = extractFloatx80Exp( a ); 5134 aSign = extractFloatx80Sign( a ); 5135 if ( aExp == 0x7FFF ) { 5136 if ( (uint64_t) ( aSig<<1 ) ) { 5137 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5138 } 5139 return packFloat64( aSign, 0x7FF, 0 ); 5140 } 5141 shift64RightJamming( aSig, 1, &zSig ); 5142 if ( aExp || aSig ) aExp -= 0x3C01; 5143 return roundAndPackFloat64(aSign, aExp, zSig, status); 5144 5145 } 5146 5147 /*---------------------------------------------------------------------------- 5148 | Returns the result of converting the extended double-precision floating- 5149 | point value `a' to the quadruple-precision floating-point format. The 5150 | conversion is performed according to the IEC/IEEE Standard for Binary 5151 | Floating-Point Arithmetic. 
5152 *----------------------------------------------------------------------------*/ 5153 5154 float128 floatx80_to_float128(floatx80 a, float_status *status) 5155 { 5156 flag aSign; 5157 int aExp; 5158 uint64_t aSig, zSig0, zSig1; 5159 5160 if (floatx80_invalid_encoding(a)) { 5161 float_raise(float_flag_invalid, status); 5162 return float128_default_nan(status); 5163 } 5164 aSig = extractFloatx80Frac( a ); 5165 aExp = extractFloatx80Exp( a ); 5166 aSign = extractFloatx80Sign( a ); 5167 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5168 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5169 } 5170 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5171 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5172 5173 } 5174 5175 /*---------------------------------------------------------------------------- 5176 | Rounds the extended double-precision floating-point value `a' 5177 | to the precision provided by floatx80_rounding_precision and returns the 5178 | result as an extended double-precision floating-point value. 5179 | The operation is performed according to the IEC/IEEE Standard for Binary 5180 | Floating-Point Arithmetic. 5181 *----------------------------------------------------------------------------*/ 5182 5183 floatx80 floatx80_round(floatx80 a, float_status *status) 5184 { 5185 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5186 extractFloatx80Sign(a), 5187 extractFloatx80Exp(a), 5188 extractFloatx80Frac(a), 0, status); 5189 } 5190 5191 /*---------------------------------------------------------------------------- 5192 | Rounds the extended double-precision floating-point value `a' to an integer, 5193 | and returns the result as an extended quadruple-precision floating-point 5194 | value. The operation is performed according to the IEC/IEEE Standard for 5195 | Binary Floating-Point Arithmetic. 
5196 *----------------------------------------------------------------------------*/ 5197 5198 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5199 { 5200 flag aSign; 5201 int32_t aExp; 5202 uint64_t lastBitMask, roundBitsMask; 5203 floatx80 z; 5204 5205 if (floatx80_invalid_encoding(a)) { 5206 float_raise(float_flag_invalid, status); 5207 return floatx80_default_nan(status); 5208 } 5209 aExp = extractFloatx80Exp( a ); 5210 if ( 0x403E <= aExp ) { 5211 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5212 return propagateFloatx80NaN(a, a, status); 5213 } 5214 return a; 5215 } 5216 if ( aExp < 0x3FFF ) { 5217 if ( ( aExp == 0 ) 5218 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5219 return a; 5220 } 5221 status->float_exception_flags |= float_flag_inexact; 5222 aSign = extractFloatx80Sign( a ); 5223 switch (status->float_rounding_mode) { 5224 case float_round_nearest_even: 5225 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5226 ) { 5227 return 5228 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5229 } 5230 break; 5231 case float_round_ties_away: 5232 if (aExp == 0x3FFE) { 5233 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5234 } 5235 break; 5236 case float_round_down: 5237 return 5238 aSign ? 5239 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5240 : packFloatx80( 0, 0, 0 ); 5241 case float_round_up: 5242 return 5243 aSign ? 
packFloatx80( 1, 0, 0 ) 5244 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5245 } 5246 return packFloatx80( aSign, 0, 0 ); 5247 } 5248 lastBitMask = 1; 5249 lastBitMask <<= 0x403E - aExp; 5250 roundBitsMask = lastBitMask - 1; 5251 z = a; 5252 switch (status->float_rounding_mode) { 5253 case float_round_nearest_even: 5254 z.low += lastBitMask>>1; 5255 if ((z.low & roundBitsMask) == 0) { 5256 z.low &= ~lastBitMask; 5257 } 5258 break; 5259 case float_round_ties_away: 5260 z.low += lastBitMask >> 1; 5261 break; 5262 case float_round_to_zero: 5263 break; 5264 case float_round_up: 5265 if (!extractFloatx80Sign(z)) { 5266 z.low += roundBitsMask; 5267 } 5268 break; 5269 case float_round_down: 5270 if (extractFloatx80Sign(z)) { 5271 z.low += roundBitsMask; 5272 } 5273 break; 5274 default: 5275 abort(); 5276 } 5277 z.low &= ~ roundBitsMask; 5278 if ( z.low == 0 ) { 5279 ++z.high; 5280 z.low = LIT64( 0x8000000000000000 ); 5281 } 5282 if (z.low != a.low) { 5283 status->float_exception_flags |= float_flag_inexact; 5284 } 5285 return z; 5286 5287 } 5288 5289 /*---------------------------------------------------------------------------- 5290 | Returns the result of adding the absolute values of the extended double- 5291 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5292 | negated before being returned. `zSign' is ignored if the result is a NaN. 5293 | The addition is performed according to the IEC/IEEE Standard for Binary 5294 | Floating-Point Arithmetic. 
5295 *----------------------------------------------------------------------------*/ 5296 5297 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5298 float_status *status) 5299 { 5300 int32_t aExp, bExp, zExp; 5301 uint64_t aSig, bSig, zSig0, zSig1; 5302 int32_t expDiff; 5303 5304 aSig = extractFloatx80Frac( a ); 5305 aExp = extractFloatx80Exp( a ); 5306 bSig = extractFloatx80Frac( b ); 5307 bExp = extractFloatx80Exp( b ); 5308 expDiff = aExp - bExp; 5309 if ( 0 < expDiff ) { 5310 if ( aExp == 0x7FFF ) { 5311 if ((uint64_t)(aSig << 1)) { 5312 return propagateFloatx80NaN(a, b, status); 5313 } 5314 return a; 5315 } 5316 if ( bExp == 0 ) --expDiff; 5317 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5318 zExp = aExp; 5319 } 5320 else if ( expDiff < 0 ) { 5321 if ( bExp == 0x7FFF ) { 5322 if ((uint64_t)(bSig << 1)) { 5323 return propagateFloatx80NaN(a, b, status); 5324 } 5325 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5326 } 5327 if ( aExp == 0 ) ++expDiff; 5328 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5329 zExp = bExp; 5330 } 5331 else { 5332 if ( aExp == 0x7FFF ) { 5333 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5334 return propagateFloatx80NaN(a, b, status); 5335 } 5336 return a; 5337 } 5338 zSig1 = 0; 5339 zSig0 = aSig + bSig; 5340 if ( aExp == 0 ) { 5341 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5342 goto roundAndPack; 5343 } 5344 zExp = aExp; 5345 goto shiftRight1; 5346 } 5347 zSig0 = aSig + bSig; 5348 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5349 shiftRight1: 5350 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5351 zSig0 |= LIT64( 0x8000000000000000 ); 5352 ++zExp; 5353 roundAndPack: 5354 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5355 zSign, zExp, zSig0, zSig1, status); 5356 } 5357 5358 /*---------------------------------------------------------------------------- 5359 | Returns the result of subtracting the absolute values of the 
extended 5360 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5361 | difference is negated before being returned. `zSign' is ignored if the 5362 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5363 | Standard for Binary Floating-Point Arithmetic. 5364 *----------------------------------------------------------------------------*/ 5365 5366 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5367 float_status *status) 5368 { 5369 int32_t aExp, bExp, zExp; 5370 uint64_t aSig, bSig, zSig0, zSig1; 5371 int32_t expDiff; 5372 5373 aSig = extractFloatx80Frac( a ); 5374 aExp = extractFloatx80Exp( a ); 5375 bSig = extractFloatx80Frac( b ); 5376 bExp = extractFloatx80Exp( b ); 5377 expDiff = aExp - bExp; 5378 if ( 0 < expDiff ) goto aExpBigger; 5379 if ( expDiff < 0 ) goto bExpBigger; 5380 if ( aExp == 0x7FFF ) { 5381 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5382 return propagateFloatx80NaN(a, b, status); 5383 } 5384 float_raise(float_flag_invalid, status); 5385 return floatx80_default_nan(status); 5386 } 5387 if ( aExp == 0 ) { 5388 aExp = 1; 5389 bExp = 1; 5390 } 5391 zSig1 = 0; 5392 if ( bSig < aSig ) goto aBigger; 5393 if ( aSig < bSig ) goto bBigger; 5394 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5395 bExpBigger: 5396 if ( bExp == 0x7FFF ) { 5397 if ((uint64_t)(bSig << 1)) { 5398 return propagateFloatx80NaN(a, b, status); 5399 } 5400 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5401 } 5402 if ( aExp == 0 ) ++expDiff; 5403 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5404 bBigger: 5405 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5406 zExp = bExp; 5407 zSign ^= 1; 5408 goto normalizeRoundAndPack; 5409 aExpBigger: 5410 if ( aExp == 0x7FFF ) { 5411 if ((uint64_t)(aSig << 1)) { 5412 return propagateFloatx80NaN(a, b, status); 5413 } 5414 return a; 5415 } 5416 if ( bExp == 0 ) --expDiff; 5417 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5418 aBigger: 5419 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5420 zExp = aExp; 5421 normalizeRoundAndPack: 5422 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5423 zSign, zExp, zSig0, zSig1, status); 5424 } 5425 5426 /*---------------------------------------------------------------------------- 5427 | Returns the result of adding the extended double-precision floating-point 5428 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5429 | Standard for Binary Floating-Point Arithmetic. 5430 *----------------------------------------------------------------------------*/ 5431 5432 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5433 { 5434 flag aSign, bSign; 5435 5436 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5437 float_raise(float_flag_invalid, status); 5438 return floatx80_default_nan(status); 5439 } 5440 aSign = extractFloatx80Sign( a ); 5441 bSign = extractFloatx80Sign( b ); 5442 if ( aSign == bSign ) { 5443 return addFloatx80Sigs(a, b, aSign, status); 5444 } 5445 else { 5446 return subFloatx80Sigs(a, b, aSign, status); 5447 } 5448 5449 } 5450 5451 /*---------------------------------------------------------------------------- 5452 | Returns the result of subtracting the extended double-precision floating- 5453 | point values `a' and `b'. The operation is performed according to the 5454 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5455 *----------------------------------------------------------------------------*/ 5456 5457 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5458 { 5459 flag aSign, bSign; 5460 5461 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5462 float_raise(float_flag_invalid, status); 5463 return floatx80_default_nan(status); 5464 } 5465 aSign = extractFloatx80Sign( a ); 5466 bSign = extractFloatx80Sign( b ); 5467 if ( aSign == bSign ) { 5468 return subFloatx80Sigs(a, b, aSign, status); 5469 } 5470 else { 5471 return addFloatx80Sigs(a, b, aSign, status); 5472 } 5473 5474 } 5475 5476 /*---------------------------------------------------------------------------- 5477 | Returns the result of multiplying the extended double-precision floating- 5478 | point values `a' and `b'. The operation is performed according to the 5479 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5480 *----------------------------------------------------------------------------*/ 5481 5482 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5483 { 5484 flag aSign, bSign, zSign; 5485 int32_t aExp, bExp, zExp; 5486 uint64_t aSig, bSig, zSig0, zSig1; 5487 5488 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5489 float_raise(float_flag_invalid, status); 5490 return floatx80_default_nan(status); 5491 } 5492 aSig = extractFloatx80Frac( a ); 5493 aExp = extractFloatx80Exp( a ); 5494 aSign = extractFloatx80Sign( a ); 5495 bSig = extractFloatx80Frac( b ); 5496 bExp = extractFloatx80Exp( b ); 5497 bSign = extractFloatx80Sign( b ); 5498 zSign = aSign ^ bSign; 5499 if ( aExp == 0x7FFF ) { 5500 if ( (uint64_t) ( aSig<<1 ) 5501 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5502 return propagateFloatx80NaN(a, b, status); 5503 } 5504 if ( ( bExp | bSig ) == 0 ) goto invalid; 5505 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5506 } 5507 if ( bExp == 0x7FFF ) { 5508 if ((uint64_t)(bSig << 1)) { 5509 
return propagateFloatx80NaN(a, b, status); 5510 } 5511 if ( ( aExp | aSig ) == 0 ) { 5512 invalid: 5513 float_raise(float_flag_invalid, status); 5514 return floatx80_default_nan(status); 5515 } 5516 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5517 } 5518 if ( aExp == 0 ) { 5519 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5520 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5521 } 5522 if ( bExp == 0 ) { 5523 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5524 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5525 } 5526 zExp = aExp + bExp - 0x3FFE; 5527 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5528 if ( 0 < (int64_t) zSig0 ) { 5529 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5530 --zExp; 5531 } 5532 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5533 zSign, zExp, zSig0, zSig1, status); 5534 } 5535 5536 /*---------------------------------------------------------------------------- 5537 | Returns the result of dividing the extended double-precision floating-point 5538 | value `a' by the corresponding value `b'. The operation is performed 5539 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5540 *----------------------------------------------------------------------------*/ 5541 5542 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5543 { 5544 flag aSign, bSign, zSign; 5545 int32_t aExp, bExp, zExp; 5546 uint64_t aSig, bSig, zSig0, zSig1; 5547 uint64_t rem0, rem1, rem2, term0, term1, term2; 5548 5549 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5550 float_raise(float_flag_invalid, status); 5551 return floatx80_default_nan(status); 5552 } 5553 aSig = extractFloatx80Frac( a ); 5554 aExp = extractFloatx80Exp( a ); 5555 aSign = extractFloatx80Sign( a ); 5556 bSig = extractFloatx80Frac( b ); 5557 bExp = extractFloatx80Exp( b ); 5558 bSign = extractFloatx80Sign( b ); 5559 zSign = aSign ^ bSign; 5560 if ( aExp == 0x7FFF ) { 5561 if ((uint64_t)(aSig << 1)) { 5562 return propagateFloatx80NaN(a, b, status); 5563 } 5564 if ( bExp == 0x7FFF ) { 5565 if ((uint64_t)(bSig << 1)) { 5566 return propagateFloatx80NaN(a, b, status); 5567 } 5568 goto invalid; 5569 } 5570 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5571 } 5572 if ( bExp == 0x7FFF ) { 5573 if ((uint64_t)(bSig << 1)) { 5574 return propagateFloatx80NaN(a, b, status); 5575 } 5576 return packFloatx80( zSign, 0, 0 ); 5577 } 5578 if ( bExp == 0 ) { 5579 if ( bSig == 0 ) { 5580 if ( ( aExp | aSig ) == 0 ) { 5581 invalid: 5582 float_raise(float_flag_invalid, status); 5583 return floatx80_default_nan(status); 5584 } 5585 float_raise(float_flag_divbyzero, status); 5586 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5587 } 5588 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5589 } 5590 if ( aExp == 0 ) { 5591 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5592 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5593 } 5594 zExp = aExp - bExp + 0x3FFE; 5595 rem1 = 0; 5596 if ( bSig <= aSig ) { 5597 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5598 ++zExp; 5599 } 5600 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5601 mul64To128( 
bSig, zSig0, &term0, &term1 ); 5602 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5603 while ( (int64_t) rem0 < 0 ) { 5604 --zSig0; 5605 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5606 } 5607 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5608 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5609 mul64To128( bSig, zSig1, &term1, &term2 ); 5610 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5611 while ( (int64_t) rem1 < 0 ) { 5612 --zSig1; 5613 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5614 } 5615 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5616 } 5617 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5618 zSign, zExp, zSig0, zSig1, status); 5619 } 5620 5621 /*---------------------------------------------------------------------------- 5622 | Returns the remainder of the extended double-precision floating-point value 5623 | `a' with respect to the corresponding value `b'. The operation is performed 5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5625 *----------------------------------------------------------------------------*/ 5626 5627 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5628 { 5629 flag aSign, zSign; 5630 int32_t aExp, bExp, expDiff; 5631 uint64_t aSig0, aSig1, bSig; 5632 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5633 5634 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5635 float_raise(float_flag_invalid, status); 5636 return floatx80_default_nan(status); 5637 } 5638 aSig0 = extractFloatx80Frac( a ); 5639 aExp = extractFloatx80Exp( a ); 5640 aSign = extractFloatx80Sign( a ); 5641 bSig = extractFloatx80Frac( b ); 5642 bExp = extractFloatx80Exp( b ); 5643 if ( aExp == 0x7FFF ) { 5644 if ( (uint64_t) ( aSig0<<1 ) 5645 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5646 return propagateFloatx80NaN(a, b, status); 5647 } 5648 goto invalid; 5649 } 5650 if ( bExp == 0x7FFF ) { 5651 if ((uint64_t)(bSig << 1)) { 5652 return propagateFloatx80NaN(a, b, status); 
5653 } 5654 return a; 5655 } 5656 if ( bExp == 0 ) { 5657 if ( bSig == 0 ) { 5658 invalid: 5659 float_raise(float_flag_invalid, status); 5660 return floatx80_default_nan(status); 5661 } 5662 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5663 } 5664 if ( aExp == 0 ) { 5665 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5666 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5667 } 5668 bSig |= LIT64( 0x8000000000000000 ); 5669 zSign = aSign; 5670 expDiff = aExp - bExp; 5671 aSig1 = 0; 5672 if ( expDiff < 0 ) { 5673 if ( expDiff < -1 ) return a; 5674 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5675 expDiff = 0; 5676 } 5677 q = ( bSig <= aSig0 ); 5678 if ( q ) aSig0 -= bSig; 5679 expDiff -= 64; 5680 while ( 0 < expDiff ) { 5681 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5682 q = ( 2 < q ) ? q - 2 : 0; 5683 mul64To128( bSig, q, &term0, &term1 ); 5684 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5685 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5686 expDiff -= 62; 5687 } 5688 expDiff += 64; 5689 if ( 0 < expDiff ) { 5690 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5691 q = ( 2 < q ) ? q - 2 : 0; 5692 q >>= 64 - expDiff; 5693 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5694 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5695 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5696 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5697 ++q; 5698 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5699 } 5700 } 5701 else { 5702 term1 = 0; 5703 term0 = bSig; 5704 } 5705 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5706 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5707 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5708 && ( q & 1 ) ) 5709 ) { 5710 aSig0 = alternateASig0; 5711 aSig1 = alternateASig1; 5712 zSign = ! 
zSign; 5713 } 5714 return 5715 normalizeRoundAndPackFloatx80( 5716 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5717 5718 } 5719 5720 /*---------------------------------------------------------------------------- 5721 | Returns the square root of the extended double-precision floating-point 5722 | value `a'. The operation is performed according to the IEC/IEEE Standard 5723 | for Binary Floating-Point Arithmetic. 5724 *----------------------------------------------------------------------------*/ 5725 5726 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5727 { 5728 flag aSign; 5729 int32_t aExp, zExp; 5730 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5731 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5732 5733 if (floatx80_invalid_encoding(a)) { 5734 float_raise(float_flag_invalid, status); 5735 return floatx80_default_nan(status); 5736 } 5737 aSig0 = extractFloatx80Frac( a ); 5738 aExp = extractFloatx80Exp( a ); 5739 aSign = extractFloatx80Sign( a ); 5740 if ( aExp == 0x7FFF ) { 5741 if ((uint64_t)(aSig0 << 1)) { 5742 return propagateFloatx80NaN(a, a, status); 5743 } 5744 if ( ! 
aSign ) return a; 5745 goto invalid; 5746 } 5747 if ( aSign ) { 5748 if ( ( aExp | aSig0 ) == 0 ) return a; 5749 invalid: 5750 float_raise(float_flag_invalid, status); 5751 return floatx80_default_nan(status); 5752 } 5753 if ( aExp == 0 ) { 5754 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5755 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5756 } 5757 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5758 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5759 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5760 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5761 doubleZSig0 = zSig0<<1; 5762 mul64To128( zSig0, zSig0, &term0, &term1 ); 5763 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5764 while ( (int64_t) rem0 < 0 ) { 5765 --zSig0; 5766 doubleZSig0 -= 2; 5767 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5768 } 5769 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5770 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5771 if ( zSig1 == 0 ) zSig1 = 1; 5772 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5773 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5774 mul64To128( zSig1, zSig1, &term2, &term3 ); 5775 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5776 while ( (int64_t) rem1 < 0 ) { 5777 --zSig1; 5778 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5779 term3 |= 1; 5780 term2 |= doubleZSig0; 5781 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5782 } 5783 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5784 } 5785 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5786 zSig0 |= doubleZSig0; 5787 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5788 0, zExp, zSig0, zSig1, status); 5789 } 5790 5791 /*---------------------------------------------------------------------------- 5792 | Returns 1 if the extended double-precision floating-point value `a' is equal 5793 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{
    /* Invalid encodings and NaNs both raise invalid and compare unequal. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
        || (extractFloatx80Exp(a) == 0x7FFF
            && (uint64_t)(extractFloatx80Frac(a) << 1))
        || (extractFloatx80Exp(b) == 0x7FFF
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (a.low != b.low) {
        return 0;
    }
    if (a.high == b.high) {
        return 1;
    }
    /* High words differ: still equal when the significand is zero and both
     * exponent fields are zero, i.e. the operands differ only in sign. */
    return (a.low == 0) && ((uint16_t)((a.high | b.high) << 1) == 0);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5825 *----------------------------------------------------------------------------*/ 5826 5827 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5828 { 5829 flag aSign, bSign; 5830 5831 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5832 || (extractFloatx80Exp(a) == 0x7FFF 5833 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5834 || (extractFloatx80Exp(b) == 0x7FFF 5835 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5836 ) { 5837 float_raise(float_flag_invalid, status); 5838 return 0; 5839 } 5840 aSign = extractFloatx80Sign( a ); 5841 bSign = extractFloatx80Sign( b ); 5842 if ( aSign != bSign ) { 5843 return 5844 aSign 5845 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5846 == 0 ); 5847 } 5848 return 5849 aSign ? le128( b.high, b.low, a.high, a.low ) 5850 : le128( a.high, a.low, b.high, b.low ); 5851 5852 } 5853 5854 /*---------------------------------------------------------------------------- 5855 | Returns 1 if the extended double-precision floating-point value `a' is 5856 | less than the corresponding value `b', and 0 otherwise. The invalid 5857 | exception is raised if either operand is a NaN. The comparison is performed 5858 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5859 *----------------------------------------------------------------------------*/ 5860 5861 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5862 { 5863 flag aSign, bSign; 5864 5865 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5866 || (extractFloatx80Exp(a) == 0x7FFF 5867 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5868 || (extractFloatx80Exp(b) == 0x7FFF 5869 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5870 ) { 5871 float_raise(float_flag_invalid, status); 5872 return 0; 5873 } 5874 aSign = extractFloatx80Sign( a ); 5875 bSign = extractFloatx80Sign( b ); 5876 if ( aSign != bSign ) { 5877 return 5878 aSign 5879 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5880 != 0 ); 5881 } 5882 return 5883 aSign ? lt128( b.high, b.low, a.high, a.low ) 5884 : lt128( a.high, a.low, b.high, b.low ); 5885 5886 } 5887 5888 /*---------------------------------------------------------------------------- 5889 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5890 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5891 | either operand is a NaN. The comparison is performed according to the 5892 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5893 *----------------------------------------------------------------------------*/ 5894 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5895 { 5896 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5897 || (extractFloatx80Exp(a) == 0x7FFF 5898 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5899 || (extractFloatx80Exp(b) == 0x7FFF 5900 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5901 ) { 5902 float_raise(float_flag_invalid, status); 5903 return 1; 5904 } 5905 return 0; 5906 } 5907 5908 /*---------------------------------------------------------------------------- 5909 | Returns 1 if the extended double-precision floating-point value `a' is 5910 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5911 | cause an exception. The comparison is performed according to the IEC/IEEE 5912 | Standard for Binary Floating-Point Arithmetic. 5913 *----------------------------------------------------------------------------*/ 5914 5915 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5916 { 5917 5918 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5919 float_raise(float_flag_invalid, status); 5920 return 0; 5921 } 5922 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5923 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5924 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5925 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5926 ) { 5927 if (floatx80_is_signaling_nan(a, status) 5928 || floatx80_is_signaling_nan(b, status)) { 5929 float_raise(float_flag_invalid, status); 5930 } 5931 return 0; 5932 } 5933 return 5934 ( a.low == b.low ) 5935 && ( ( a.high == b.high ) 5936 || ( ( a.low == 0 ) 5937 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5938 ); 5939 5940 } 5941 5942 /*---------------------------------------------------------------------------- 5943 | Returns 1 if the extended double-precision floating-point value `a' is less 5944 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag signA, signB;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    if (((extractFloatx80Exp(a) == 0x7FFF)
         && (uint64_t)(extractFloatx80Frac(a) << 1))
        || ((extractFloatx80Exp(b) == 0x7FFF)
            && (uint64_t)(extractFloatx80Frac(b) << 1))) {
        /* Only a signaling NaN raises the invalid exception here. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }
    signA = extractFloatx80Sign(a);
    signB = extractFloatx80Sign(b);
    if (signA != signB) {
        /* Signs differ: true when `a' is the negative operand, or when all
         * bits other than the sign bits are clear in both operands. */
        return signA
            || ((((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0);
    }
    /* Same sign: compare the (high, low) pairs, swapped for negatives. */
    if (signA) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5987 *----------------------------------------------------------------------------*/ 5988 5989 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5990 { 5991 flag aSign, bSign; 5992 5993 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5994 float_raise(float_flag_invalid, status); 5995 return 0; 5996 } 5997 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5998 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5999 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6000 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6001 ) { 6002 if (floatx80_is_signaling_nan(a, status) 6003 || floatx80_is_signaling_nan(b, status)) { 6004 float_raise(float_flag_invalid, status); 6005 } 6006 return 0; 6007 } 6008 aSign = extractFloatx80Sign( a ); 6009 bSign = extractFloatx80Sign( b ); 6010 if ( aSign != bSign ) { 6011 return 6012 aSign 6013 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6014 != 0 ); 6015 } 6016 return 6017 aSign ? lt128( b.high, b.low, a.high, a.low ) 6018 : lt128( a.high, a.low, b.high, b.low ); 6019 6020 } 6021 6022 /*---------------------------------------------------------------------------- 6023 | Returns 1 if the extended double-precision floating-point values `a' and `b' 6024 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 6025 | The comparison is performed according to the IEC/IEEE Standard for Binary 6026 | Floating-Point Arithmetic. 
6027 *----------------------------------------------------------------------------*/ 6028 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 6029 { 6030 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6031 float_raise(float_flag_invalid, status); 6032 return 1; 6033 } 6034 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 6035 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 6036 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 6037 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 6038 ) { 6039 if (floatx80_is_signaling_nan(a, status) 6040 || floatx80_is_signaling_nan(b, status)) { 6041 float_raise(float_flag_invalid, status); 6042 } 6043 return 1; 6044 } 6045 return 0; 6046 } 6047 6048 /*---------------------------------------------------------------------------- 6049 | Returns the result of converting the quadruple-precision floating-point 6050 | value `a' to the 32-bit two's complement integer format. The conversion 6051 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6052 | Arithmetic---which means in particular that the conversion is rounded 6053 | according to the current rounding mode. If `a' is a NaN, the largest 6054 | positive integer is returned. Otherwise, if the conversion overflows, the 6055 | largest integer with the same sign as `a' is returned. 
6056 *----------------------------------------------------------------------------*/ 6057 6058 int32_t float128_to_int32(float128 a, float_status *status) 6059 { 6060 flag aSign; 6061 int32_t aExp, shiftCount; 6062 uint64_t aSig0, aSig1; 6063 6064 aSig1 = extractFloat128Frac1( a ); 6065 aSig0 = extractFloat128Frac0( a ); 6066 aExp = extractFloat128Exp( a ); 6067 aSign = extractFloat128Sign( a ); 6068 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 6069 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6070 aSig0 |= ( aSig1 != 0 ); 6071 shiftCount = 0x4028 - aExp; 6072 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6073 return roundAndPackInt32(aSign, aSig0, status); 6074 6075 } 6076 6077 /*---------------------------------------------------------------------------- 6078 | Returns the result of converting the quadruple-precision floating-point 6079 | value `a' to the 32-bit two's complement integer format. The conversion 6080 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6081 | Arithmetic, except that the conversion is always rounded toward zero. If 6082 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6083 | conversion overflows, the largest integer with the same sign as `a' is 6084 | returned. 
6085 *----------------------------------------------------------------------------*/ 6086 6087 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6088 { 6089 flag aSign; 6090 int32_t aExp, shiftCount; 6091 uint64_t aSig0, aSig1, savedASig; 6092 int32_t z; 6093 6094 aSig1 = extractFloat128Frac1( a ); 6095 aSig0 = extractFloat128Frac0( a ); 6096 aExp = extractFloat128Exp( a ); 6097 aSign = extractFloat128Sign( a ); 6098 aSig0 |= ( aSig1 != 0 ); 6099 if ( 0x401E < aExp ) { 6100 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6101 goto invalid; 6102 } 6103 else if ( aExp < 0x3FFF ) { 6104 if (aExp || aSig0) { 6105 status->float_exception_flags |= float_flag_inexact; 6106 } 6107 return 0; 6108 } 6109 aSig0 |= LIT64( 0x0001000000000000 ); 6110 shiftCount = 0x402F - aExp; 6111 savedASig = aSig0; 6112 aSig0 >>= shiftCount; 6113 z = aSig0; 6114 if ( aSign ) z = - z; 6115 if ( ( z < 0 ) ^ aSign ) { 6116 invalid: 6117 float_raise(float_flag_invalid, status); 6118 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6119 } 6120 if ( ( aSig0<<shiftCount ) != savedASig ) { 6121 status->float_exception_flags |= float_flag_inexact; 6122 } 6123 return z; 6124 6125 } 6126 6127 /*---------------------------------------------------------------------------- 6128 | Returns the result of converting the quadruple-precision floating-point 6129 | value `a' to the 64-bit two's complement integer format. The conversion 6130 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6131 | Arithmetic---which means in particular that the conversion is rounded 6132 | according to the current rounding mode. If `a' is a NaN, the largest 6133 | positive integer is returned. Otherwise, if the conversion overflows, the 6134 | largest integer with the same sign as `a' is returned. 
6135 *----------------------------------------------------------------------------*/ 6136 6137 int64_t float128_to_int64(float128 a, float_status *status) 6138 { 6139 flag aSign; 6140 int32_t aExp, shiftCount; 6141 uint64_t aSig0, aSig1; 6142 6143 aSig1 = extractFloat128Frac1( a ); 6144 aSig0 = extractFloat128Frac0( a ); 6145 aExp = extractFloat128Exp( a ); 6146 aSign = extractFloat128Sign( a ); 6147 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6148 shiftCount = 0x402F - aExp; 6149 if ( shiftCount <= 0 ) { 6150 if ( 0x403E < aExp ) { 6151 float_raise(float_flag_invalid, status); 6152 if ( ! aSign 6153 || ( ( aExp == 0x7FFF ) 6154 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6155 ) 6156 ) { 6157 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6158 } 6159 return (int64_t) LIT64( 0x8000000000000000 ); 6160 } 6161 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6162 } 6163 else { 6164 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6165 } 6166 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6167 6168 } 6169 6170 /*---------------------------------------------------------------------------- 6171 | Returns the result of converting the quadruple-precision floating-point 6172 | value `a' to the 64-bit two's complement integer format. The conversion 6173 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6174 | Arithmetic, except that the conversion is always rounded toward zero. 6175 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6176 | the conversion overflows, the largest integer with the same sign as `a' is 6177 | returned. 
6178 *----------------------------------------------------------------------------*/ 6179 6180 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6181 { 6182 flag aSign; 6183 int32_t aExp, shiftCount; 6184 uint64_t aSig0, aSig1; 6185 int64_t z; 6186 6187 aSig1 = extractFloat128Frac1( a ); 6188 aSig0 = extractFloat128Frac0( a ); 6189 aExp = extractFloat128Exp( a ); 6190 aSign = extractFloat128Sign( a ); 6191 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6192 shiftCount = aExp - 0x402F; 6193 if ( 0 < shiftCount ) { 6194 if ( 0x403E <= aExp ) { 6195 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6196 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6197 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6198 if (aSig1) { 6199 status->float_exception_flags |= float_flag_inexact; 6200 } 6201 } 6202 else { 6203 float_raise(float_flag_invalid, status); 6204 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6205 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6206 } 6207 } 6208 return (int64_t) LIT64( 0x8000000000000000 ); 6209 } 6210 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6211 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6212 status->float_exception_flags |= float_flag_inexact; 6213 } 6214 } 6215 else { 6216 if ( aExp < 0x3FFF ) { 6217 if ( aExp | aSig0 | aSig1 ) { 6218 status->float_exception_flags |= float_flag_inexact; 6219 } 6220 return 0; 6221 } 6222 z = aSig0>>( - shiftCount ); 6223 if ( aSig1 6224 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6225 status->float_exception_flags |= float_flag_inexact; 6226 } 6227 } 6228 if ( aSign ) z = - z; 6229 return z; 6230 6231 } 6232 6233 /*---------------------------------------------------------------------------- 6234 | Returns the result of converting the quadruple-precision floating-point value 6235 | `a' to the 64-bit unsigned integer format. 
The conversion is 6236 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6237 | Arithmetic---which means in particular that the conversion is rounded 6238 | according to the current rounding mode. If `a' is a NaN, the largest 6239 | positive integer is returned. If the conversion overflows, the 6240 | largest unsigned integer is returned. If 'a' is negative, the value is 6241 | rounded and zero is returned; negative values that do not round to zero 6242 | will raise the inexact exception. 6243 *----------------------------------------------------------------------------*/ 6244 6245 uint64_t float128_to_uint64(float128 a, float_status *status) 6246 { 6247 flag aSign; 6248 int aExp; 6249 int shiftCount; 6250 uint64_t aSig0, aSig1; 6251 6252 aSig0 = extractFloat128Frac0(a); 6253 aSig1 = extractFloat128Frac1(a); 6254 aExp = extractFloat128Exp(a); 6255 aSign = extractFloat128Sign(a); 6256 if (aSign && (aExp > 0x3FFE)) { 6257 float_raise(float_flag_invalid, status); 6258 if (float128_is_any_nan(a)) { 6259 return LIT64(0xFFFFFFFFFFFFFFFF); 6260 } else { 6261 return 0; 6262 } 6263 } 6264 if (aExp) { 6265 aSig0 |= LIT64(0x0001000000000000); 6266 } 6267 shiftCount = 0x402F - aExp; 6268 if (shiftCount <= 0) { 6269 if (0x403E < aExp) { 6270 float_raise(float_flag_invalid, status); 6271 return LIT64(0xFFFFFFFFFFFFFFFF); 6272 } 6273 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6274 } else { 6275 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6276 } 6277 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6278 } 6279 6280 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6281 { 6282 uint64_t v; 6283 signed char current_rounding_mode = status->float_rounding_mode; 6284 6285 set_float_rounding_mode(float_round_to_zero, status); 6286 v = float128_to_uint64(a, status); 6287 set_float_rounding_mode(current_rounding_mode, status); 6288 6289 return v; 6290 } 6291 6292 
/*---------------------------------------------------------------------------- 6293 | Returns the result of converting the quadruple-precision floating-point 6294 | value `a' to the 32-bit unsigned integer format. The conversion 6295 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6296 | Arithmetic except that the conversion is always rounded toward zero. 6297 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6298 | if the conversion overflows, the largest unsigned integer is returned. 6299 | If 'a' is negative, the value is rounded and zero is returned; negative 6300 | values that do not round to zero will raise the inexact exception. 6301 *----------------------------------------------------------------------------*/ 6302 6303 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6304 { 6305 uint64_t v; 6306 uint32_t res; 6307 int old_exc_flags = get_float_exception_flags(status); 6308 6309 v = float128_to_uint64_round_to_zero(a, status); 6310 if (v > 0xffffffff) { 6311 res = 0xffffffff; 6312 } else { 6313 return v; 6314 } 6315 set_float_exception_flags(old_exc_flags, status); 6316 float_raise(float_flag_invalid, status); 6317 return res; 6318 } 6319 6320 /*---------------------------------------------------------------------------- 6321 | Returns the result of converting the quadruple-precision floating-point 6322 | value `a' to the single-precision floating-point format. The conversion 6323 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6324 | Arithmetic. 
6325 *----------------------------------------------------------------------------*/ 6326 6327 float32 float128_to_float32(float128 a, float_status *status) 6328 { 6329 flag aSign; 6330 int32_t aExp; 6331 uint64_t aSig0, aSig1; 6332 uint32_t zSig; 6333 6334 aSig1 = extractFloat128Frac1( a ); 6335 aSig0 = extractFloat128Frac0( a ); 6336 aExp = extractFloat128Exp( a ); 6337 aSign = extractFloat128Sign( a ); 6338 if ( aExp == 0x7FFF ) { 6339 if ( aSig0 | aSig1 ) { 6340 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6341 } 6342 return packFloat32( aSign, 0xFF, 0 ); 6343 } 6344 aSig0 |= ( aSig1 != 0 ); 6345 shift64RightJamming( aSig0, 18, &aSig0 ); 6346 zSig = aSig0; 6347 if ( aExp || zSig ) { 6348 zSig |= 0x40000000; 6349 aExp -= 0x3F81; 6350 } 6351 return roundAndPackFloat32(aSign, aExp, zSig, status); 6352 6353 } 6354 6355 /*---------------------------------------------------------------------------- 6356 | Returns the result of converting the quadruple-precision floating-point 6357 | value `a' to the double-precision floating-point format. The conversion 6358 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6359 | Arithmetic. 
6360 *----------------------------------------------------------------------------*/ 6361 6362 float64 float128_to_float64(float128 a, float_status *status) 6363 { 6364 flag aSign; 6365 int32_t aExp; 6366 uint64_t aSig0, aSig1; 6367 6368 aSig1 = extractFloat128Frac1( a ); 6369 aSig0 = extractFloat128Frac0( a ); 6370 aExp = extractFloat128Exp( a ); 6371 aSign = extractFloat128Sign( a ); 6372 if ( aExp == 0x7FFF ) { 6373 if ( aSig0 | aSig1 ) { 6374 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6375 } 6376 return packFloat64( aSign, 0x7FF, 0 ); 6377 } 6378 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6379 aSig0 |= ( aSig1 != 0 ); 6380 if ( aExp || aSig0 ) { 6381 aSig0 |= LIT64( 0x4000000000000000 ); 6382 aExp -= 0x3C01; 6383 } 6384 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6385 6386 } 6387 6388 /*---------------------------------------------------------------------------- 6389 | Returns the result of converting the quadruple-precision floating-point 6390 | value `a' to the extended double-precision floating-point format. The 6391 | conversion is performed according to the IEC/IEEE Standard for Binary 6392 | Floating-Point Arithmetic. 
6393 *----------------------------------------------------------------------------*/ 6394 6395 floatx80 float128_to_floatx80(float128 a, float_status *status) 6396 { 6397 flag aSign; 6398 int32_t aExp; 6399 uint64_t aSig0, aSig1; 6400 6401 aSig1 = extractFloat128Frac1( a ); 6402 aSig0 = extractFloat128Frac0( a ); 6403 aExp = extractFloat128Exp( a ); 6404 aSign = extractFloat128Sign( a ); 6405 if ( aExp == 0x7FFF ) { 6406 if ( aSig0 | aSig1 ) { 6407 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6408 } 6409 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6410 } 6411 if ( aExp == 0 ) { 6412 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6413 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6414 } 6415 else { 6416 aSig0 |= LIT64( 0x0001000000000000 ); 6417 } 6418 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6419 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6420 6421 } 6422 6423 /*---------------------------------------------------------------------------- 6424 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6425 | returns the result as a quadruple-precision floating-point value. The 6426 | operation is performed according to the IEC/IEEE Standard for Binary 6427 | Floating-Point Arithmetic. 
6428 *----------------------------------------------------------------------------*/ 6429 6430 float128 float128_round_to_int(float128 a, float_status *status) 6431 { 6432 flag aSign; 6433 int32_t aExp; 6434 uint64_t lastBitMask, roundBitsMask; 6435 float128 z; 6436 6437 aExp = extractFloat128Exp( a ); 6438 if ( 0x402F <= aExp ) { 6439 if ( 0x406F <= aExp ) { 6440 if ( ( aExp == 0x7FFF ) 6441 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6442 ) { 6443 return propagateFloat128NaN(a, a, status); 6444 } 6445 return a; 6446 } 6447 lastBitMask = 1; 6448 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6449 roundBitsMask = lastBitMask - 1; 6450 z = a; 6451 switch (status->float_rounding_mode) { 6452 case float_round_nearest_even: 6453 if ( lastBitMask ) { 6454 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6455 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6456 } 6457 else { 6458 if ( (int64_t) z.low < 0 ) { 6459 ++z.high; 6460 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6461 } 6462 } 6463 break; 6464 case float_round_ties_away: 6465 if (lastBitMask) { 6466 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6467 } else { 6468 if ((int64_t) z.low < 0) { 6469 ++z.high; 6470 } 6471 } 6472 break; 6473 case float_round_to_zero: 6474 break; 6475 case float_round_up: 6476 if (!extractFloat128Sign(z)) { 6477 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6478 } 6479 break; 6480 case float_round_down: 6481 if (extractFloat128Sign(z)) { 6482 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6483 } 6484 break; 6485 default: 6486 abort(); 6487 } 6488 z.low &= ~ roundBitsMask; 6489 } 6490 else { 6491 if ( aExp < 0x3FFF ) { 6492 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6493 status->float_exception_flags |= float_flag_inexact; 6494 aSign = extractFloat128Sign( a ); 6495 switch (status->float_rounding_mode) { 6496 case float_round_nearest_even: 6497 if ( ( aExp == 0x3FFE ) 6498 && ( 
extractFloat128Frac0( a ) 6499 | extractFloat128Frac1( a ) ) 6500 ) { 6501 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6502 } 6503 break; 6504 case float_round_ties_away: 6505 if (aExp == 0x3FFE) { 6506 return packFloat128(aSign, 0x3FFF, 0, 0); 6507 } 6508 break; 6509 case float_round_down: 6510 return 6511 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6512 : packFloat128( 0, 0, 0, 0 ); 6513 case float_round_up: 6514 return 6515 aSign ? packFloat128( 1, 0, 0, 0 ) 6516 : packFloat128( 0, 0x3FFF, 0, 0 ); 6517 } 6518 return packFloat128( aSign, 0, 0, 0 ); 6519 } 6520 lastBitMask = 1; 6521 lastBitMask <<= 0x402F - aExp; 6522 roundBitsMask = lastBitMask - 1; 6523 z.low = 0; 6524 z.high = a.high; 6525 switch (status->float_rounding_mode) { 6526 case float_round_nearest_even: 6527 z.high += lastBitMask>>1; 6528 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6529 z.high &= ~ lastBitMask; 6530 } 6531 break; 6532 case float_round_ties_away: 6533 z.high += lastBitMask>>1; 6534 break; 6535 case float_round_to_zero: 6536 break; 6537 case float_round_up: 6538 if (!extractFloat128Sign(z)) { 6539 z.high |= ( a.low != 0 ); 6540 z.high += roundBitsMask; 6541 } 6542 break; 6543 case float_round_down: 6544 if (extractFloat128Sign(z)) { 6545 z.high |= (a.low != 0); 6546 z.high += roundBitsMask; 6547 } 6548 break; 6549 default: 6550 abort(); 6551 } 6552 z.high &= ~ roundBitsMask; 6553 } 6554 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6555 status->float_exception_flags |= float_flag_inexact; 6556 } 6557 return z; 6558 6559 } 6560 6561 /*---------------------------------------------------------------------------- 6562 | Returns the result of adding the absolute values of the quadruple-precision 6563 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6564 | before being returned. `zSign' is ignored if the result is a NaN. 6565 | The addition is performed according to the IEC/IEEE Standard for Binary 6566 | Floating-Point Arithmetic. 
6567 *----------------------------------------------------------------------------*/ 6568 6569 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6570 float_status *status) 6571 { 6572 int32_t aExp, bExp, zExp; 6573 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6574 int32_t expDiff; 6575 6576 aSig1 = extractFloat128Frac1( a ); 6577 aSig0 = extractFloat128Frac0( a ); 6578 aExp = extractFloat128Exp( a ); 6579 bSig1 = extractFloat128Frac1( b ); 6580 bSig0 = extractFloat128Frac0( b ); 6581 bExp = extractFloat128Exp( b ); 6582 expDiff = aExp - bExp; 6583 if ( 0 < expDiff ) { 6584 if ( aExp == 0x7FFF ) { 6585 if (aSig0 | aSig1) { 6586 return propagateFloat128NaN(a, b, status); 6587 } 6588 return a; 6589 } 6590 if ( bExp == 0 ) { 6591 --expDiff; 6592 } 6593 else { 6594 bSig0 |= LIT64( 0x0001000000000000 ); 6595 } 6596 shift128ExtraRightJamming( 6597 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6598 zExp = aExp; 6599 } 6600 else if ( expDiff < 0 ) { 6601 if ( bExp == 0x7FFF ) { 6602 if (bSig0 | bSig1) { 6603 return propagateFloat128NaN(a, b, status); 6604 } 6605 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6606 } 6607 if ( aExp == 0 ) { 6608 ++expDiff; 6609 } 6610 else { 6611 aSig0 |= LIT64( 0x0001000000000000 ); 6612 } 6613 shift128ExtraRightJamming( 6614 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6615 zExp = bExp; 6616 } 6617 else { 6618 if ( aExp == 0x7FFF ) { 6619 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6620 return propagateFloat128NaN(a, b, status); 6621 } 6622 return a; 6623 } 6624 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6625 if ( aExp == 0 ) { 6626 if (status->flush_to_zero) { 6627 if (zSig0 | zSig1) { 6628 float_raise(float_flag_output_denormal, status); 6629 } 6630 return packFloat128(zSign, 0, 0, 0); 6631 } 6632 return packFloat128( zSign, 0, zSig0, zSig1 ); 6633 } 6634 zSig2 = 0; 6635 zSig0 |= LIT64( 0x0002000000000000 ); 6636 zExp = aExp; 6637 goto shiftRight1; 6638 } 6639 aSig0 |= LIT64( 
0x0001000000000000 ); 6640 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6641 --zExp; 6642 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6643 ++zExp; 6644 shiftRight1: 6645 shift128ExtraRightJamming( 6646 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6647 roundAndPack: 6648 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6649 6650 } 6651 6652 /*---------------------------------------------------------------------------- 6653 | Returns the result of subtracting the absolute values of the quadruple- 6654 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6655 | difference is negated before being returned. `zSign' is ignored if the 6656 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6657 | Standard for Binary Floating-Point Arithmetic. 6658 *----------------------------------------------------------------------------*/ 6659 6660 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6661 float_status *status) 6662 { 6663 int32_t aExp, bExp, zExp; 6664 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6665 int32_t expDiff; 6666 6667 aSig1 = extractFloat128Frac1( a ); 6668 aSig0 = extractFloat128Frac0( a ); 6669 aExp = extractFloat128Exp( a ); 6670 bSig1 = extractFloat128Frac1( b ); 6671 bSig0 = extractFloat128Frac0( b ); 6672 bExp = extractFloat128Exp( b ); 6673 expDiff = aExp - bExp; 6674 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6675 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6676 if ( 0 < expDiff ) goto aExpBigger; 6677 if ( expDiff < 0 ) goto bExpBigger; 6678 if ( aExp == 0x7FFF ) { 6679 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6680 return propagateFloat128NaN(a, b, status); 6681 } 6682 float_raise(float_flag_invalid, status); 6683 return float128_default_nan(status); 6684 } 6685 if ( aExp == 0 ) { 6686 aExp = 1; 6687 bExp = 1; 6688 } 6689 if ( bSig0 < aSig0 ) goto aBigger; 6690 if ( aSig0 < bSig0 ) goto bBigger; 6691 if ( bSig1 < 
aSig1 ) goto aBigger; 6692 if ( aSig1 < bSig1 ) goto bBigger; 6693 return packFloat128(status->float_rounding_mode == float_round_down, 6694 0, 0, 0); 6695 bExpBigger: 6696 if ( bExp == 0x7FFF ) { 6697 if (bSig0 | bSig1) { 6698 return propagateFloat128NaN(a, b, status); 6699 } 6700 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6701 } 6702 if ( aExp == 0 ) { 6703 ++expDiff; 6704 } 6705 else { 6706 aSig0 |= LIT64( 0x4000000000000000 ); 6707 } 6708 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6709 bSig0 |= LIT64( 0x4000000000000000 ); 6710 bBigger: 6711 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6712 zExp = bExp; 6713 zSign ^= 1; 6714 goto normalizeRoundAndPack; 6715 aExpBigger: 6716 if ( aExp == 0x7FFF ) { 6717 if (aSig0 | aSig1) { 6718 return propagateFloat128NaN(a, b, status); 6719 } 6720 return a; 6721 } 6722 if ( bExp == 0 ) { 6723 --expDiff; 6724 } 6725 else { 6726 bSig0 |= LIT64( 0x4000000000000000 ); 6727 } 6728 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6729 aSig0 |= LIT64( 0x4000000000000000 ); 6730 aBigger: 6731 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6732 zExp = aExp; 6733 normalizeRoundAndPack: 6734 --zExp; 6735 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6736 status); 6737 6738 } 6739 6740 /*---------------------------------------------------------------------------- 6741 | Returns the result of adding the quadruple-precision floating-point values 6742 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6743 | for Binary Floating-Point Arithmetic. 
6744 *----------------------------------------------------------------------------*/ 6745 6746 float128 float128_add(float128 a, float128 b, float_status *status) 6747 { 6748 flag aSign, bSign; 6749 6750 aSign = extractFloat128Sign( a ); 6751 bSign = extractFloat128Sign( b ); 6752 if ( aSign == bSign ) { 6753 return addFloat128Sigs(a, b, aSign, status); 6754 } 6755 else { 6756 return subFloat128Sigs(a, b, aSign, status); 6757 } 6758 6759 } 6760 6761 /*---------------------------------------------------------------------------- 6762 | Returns the result of subtracting the quadruple-precision floating-point 6763 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6764 | Standard for Binary Floating-Point Arithmetic. 6765 *----------------------------------------------------------------------------*/ 6766 6767 float128 float128_sub(float128 a, float128 b, float_status *status) 6768 { 6769 flag aSign, bSign; 6770 6771 aSign = extractFloat128Sign( a ); 6772 bSign = extractFloat128Sign( b ); 6773 if ( aSign == bSign ) { 6774 return subFloat128Sigs(a, b, aSign, status); 6775 } 6776 else { 6777 return addFloat128Sigs(a, b, aSign, status); 6778 } 6779 6780 } 6781 6782 /*---------------------------------------------------------------------------- 6783 | Returns the result of multiplying the quadruple-precision floating-point 6784 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6785 | Standard for Binary Floating-Point Arithmetic. 
6786 *----------------------------------------------------------------------------*/ 6787 6788 float128 float128_mul(float128 a, float128 b, float_status *status) 6789 { 6790 flag aSign, bSign, zSign; 6791 int32_t aExp, bExp, zExp; 6792 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6793 6794 aSig1 = extractFloat128Frac1( a ); 6795 aSig0 = extractFloat128Frac0( a ); 6796 aExp = extractFloat128Exp( a ); 6797 aSign = extractFloat128Sign( a ); 6798 bSig1 = extractFloat128Frac1( b ); 6799 bSig0 = extractFloat128Frac0( b ); 6800 bExp = extractFloat128Exp( b ); 6801 bSign = extractFloat128Sign( b ); 6802 zSign = aSign ^ bSign; 6803 if ( aExp == 0x7FFF ) { 6804 if ( ( aSig0 | aSig1 ) 6805 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6806 return propagateFloat128NaN(a, b, status); 6807 } 6808 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6809 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6810 } 6811 if ( bExp == 0x7FFF ) { 6812 if (bSig0 | bSig1) { 6813 return propagateFloat128NaN(a, b, status); 6814 } 6815 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6816 invalid: 6817 float_raise(float_flag_invalid, status); 6818 return float128_default_nan(status); 6819 } 6820 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6821 } 6822 if ( aExp == 0 ) { 6823 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6824 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6825 } 6826 if ( bExp == 0 ) { 6827 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6828 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6829 } 6830 zExp = aExp + bExp - 0x4000; 6831 aSig0 |= LIT64( 0x0001000000000000 ); 6832 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6833 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6834 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6835 zSig2 |= ( zSig3 != 0 ); 6836 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6837 shift128ExtraRightJamming( 6838 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6839 ++zExp; 6840 } 6841 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6842 6843 } 6844 6845 /*---------------------------------------------------------------------------- 6846 | Returns the result of dividing the quadruple-precision floating-point value 6847 | `a' by the corresponding value `b'. The operation is performed according to 6848 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6849 *----------------------------------------------------------------------------*/ 6850 6851 float128 float128_div(float128 a, float128 b, float_status *status) 6852 { 6853 flag aSign, bSign, zSign; 6854 int32_t aExp, bExp, zExp; 6855 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6856 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6857 6858 aSig1 = extractFloat128Frac1( a ); 6859 aSig0 = extractFloat128Frac0( a ); 6860 aExp = extractFloat128Exp( a ); 6861 aSign = extractFloat128Sign( a ); 6862 bSig1 = extractFloat128Frac1( b ); 6863 bSig0 = extractFloat128Frac0( b ); 6864 bExp = extractFloat128Exp( b ); 6865 bSign = extractFloat128Sign( b ); 6866 zSign = aSign ^ bSign; 6867 if ( aExp == 0x7FFF ) { 6868 if (aSig0 | aSig1) { 6869 return propagateFloat128NaN(a, b, status); 6870 } 6871 if ( bExp == 0x7FFF ) { 6872 if (bSig0 | bSig1) { 6873 return propagateFloat128NaN(a, b, status); 6874 } 6875 goto invalid; 6876 } 6877 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6878 } 6879 if ( bExp == 0x7FFF ) { 6880 if (bSig0 | bSig1) { 6881 return propagateFloat128NaN(a, b, status); 6882 } 6883 return packFloat128( zSign, 0, 0, 0 ); 6884 } 6885 if ( bExp == 0 ) { 6886 if ( ( bSig0 | bSig1 ) == 0 ) { 6887 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6888 invalid: 6889 float_raise(float_flag_invalid, status); 6890 return float128_default_nan(status); 6891 } 6892 float_raise(float_flag_divbyzero, status); 6893 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6894 } 6895 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6896 } 6897 if ( aExp == 0 ) { 6898 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6899 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6900 } 6901 zExp = aExp - bExp + 0x3FFD; 6902 shortShift128Left( 6903 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6904 shortShift128Left( 6905 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6906 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6907 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6908 ++zExp; 6909 } 6910 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6911 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6912 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6913 while ( (int64_t) rem0 < 0 ) { 6914 --zSig0; 6915 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6916 } 6917 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6918 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6919 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6920 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6921 while ( (int64_t) rem1 < 0 ) { 6922 --zSig1; 6923 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6924 } 6925 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6926 } 6927 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6928 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6929 6930 } 6931 6932 /*---------------------------------------------------------------------------- 6933 | Returns the remainder of the quadruple-precision floating-point value `a' 6934 | with respect to the corresponding value `b'. The operation is performed 6935 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6936 *----------------------------------------------------------------------------*/ 6937 6938 float128 float128_rem(float128 a, float128 b, float_status *status) 6939 { 6940 flag aSign, zSign; 6941 int32_t aExp, bExp, expDiff; 6942 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6943 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6944 int64_t sigMean0; 6945 6946 aSig1 = extractFloat128Frac1( a ); 6947 aSig0 = extractFloat128Frac0( a ); 6948 aExp = extractFloat128Exp( a ); 6949 aSign = extractFloat128Sign( a ); 6950 bSig1 = extractFloat128Frac1( b ); 6951 bSig0 = extractFloat128Frac0( b ); 6952 bExp = extractFloat128Exp( b ); 6953 if ( aExp == 0x7FFF ) { 6954 if ( ( aSig0 | aSig1 ) 6955 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6956 return propagateFloat128NaN(a, b, status); 6957 } 6958 goto invalid; 6959 } 6960 if ( bExp == 0x7FFF ) { 6961 if (bSig0 | bSig1) { 6962 return propagateFloat128NaN(a, b, status); 6963 } 6964 return a; 6965 } 6966 if ( bExp == 0 ) { 6967 if ( ( bSig0 | bSig1 ) == 0 ) { 6968 invalid: 6969 float_raise(float_flag_invalid, status); 6970 return float128_default_nan(status); 6971 } 6972 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6973 } 6974 if ( aExp == 0 ) { 6975 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6976 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6977 } 6978 expDiff = aExp - bExp; 6979 if ( expDiff < -1 ) return a; 6980 shortShift128Left( 6981 aSig0 | LIT64( 0x0001000000000000 ), 6982 aSig1, 6983 15 - ( expDiff < 0 ), 6984 &aSig0, 6985 &aSig1 6986 ); 6987 shortShift128Left( 6988 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6989 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6990 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6991 expDiff -= 64; 6992 while ( 0 < expDiff ) { 6993 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6994 q = ( 4 < q ) ? 
q - 4 : 0; 6995 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6996 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6997 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6998 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6999 expDiff -= 61; 7000 } 7001 if ( -64 < expDiff ) { 7002 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 7003 q = ( 4 < q ) ? q - 4 : 0; 7004 q >>= - expDiff; 7005 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7006 expDiff += 52; 7007 if ( expDiff < 0 ) { 7008 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 7009 } 7010 else { 7011 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 7012 } 7013 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 7014 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 7015 } 7016 else { 7017 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 7018 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 7019 } 7020 do { 7021 alternateASig0 = aSig0; 7022 alternateASig1 = aSig1; 7023 ++q; 7024 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 7025 } while ( 0 <= (int64_t) aSig0 ); 7026 add128( 7027 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 7028 if ( ( sigMean0 < 0 ) 7029 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 7030 aSig0 = alternateASig0; 7031 aSig1 = alternateASig1; 7032 } 7033 zSign = ( (int64_t) aSig0 < 0 ); 7034 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 7035 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 7036 status); 7037 } 7038 7039 /*---------------------------------------------------------------------------- 7040 | Returns the square root of the quadruple-precision floating-point value `a'. 7041 | The operation is performed according to the IEC/IEEE Standard for Binary 7042 | Floating-Point Arithmetic. 
7043 *----------------------------------------------------------------------------*/ 7044 7045 float128 float128_sqrt(float128 a, float_status *status) 7046 { 7047 flag aSign; 7048 int32_t aExp, zExp; 7049 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 7050 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 7051 7052 aSig1 = extractFloat128Frac1( a ); 7053 aSig0 = extractFloat128Frac0( a ); 7054 aExp = extractFloat128Exp( a ); 7055 aSign = extractFloat128Sign( a ); 7056 if ( aExp == 0x7FFF ) { 7057 if (aSig0 | aSig1) { 7058 return propagateFloat128NaN(a, a, status); 7059 } 7060 if ( ! aSign ) return a; 7061 goto invalid; 7062 } 7063 if ( aSign ) { 7064 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 7065 invalid: 7066 float_raise(float_flag_invalid, status); 7067 return float128_default_nan(status); 7068 } 7069 if ( aExp == 0 ) { 7070 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 7071 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 7072 } 7073 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 7074 aSig0 |= LIT64( 0x0001000000000000 ); 7075 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 7076 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 7077 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 7078 doubleZSig0 = zSig0<<1; 7079 mul64To128( zSig0, zSig0, &term0, &term1 ); 7080 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 7081 while ( (int64_t) rem0 < 0 ) { 7082 --zSig0; 7083 doubleZSig0 -= 2; 7084 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7085 } 7086 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7087 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7088 if ( zSig1 == 0 ) zSig1 = 1; 7089 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7090 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7091 mul64To128( zSig1, zSig1, &term2, &term3 ); 7092 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7093 while ( (int64_t) rem1 < 0 ) { 7094 --zSig1; 7095 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7096 term3 |= 1; 7097 term2 |= doubleZSig0; 7098 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7099 } 7100 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7101 } 7102 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7103 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7104 7105 } 7106 7107 /*---------------------------------------------------------------------------- 7108 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7109 | the corresponding value `b', and 0 otherwise. The invalid exception is 7110 | raised if either operand is a NaN. Otherwise, the comparison is performed 7111 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7112 *----------------------------------------------------------------------------*/ 7113 7114 int float128_eq(float128 a, float128 b, float_status *status) 7115 { 7116 7117 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7118 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7119 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7120 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7121 ) { 7122 float_raise(float_flag_invalid, status); 7123 return 0; 7124 } 7125 return 7126 ( a.low == b.low ) 7127 && ( ( a.high == b.high ) 7128 || ( ( a.low == 0 ) 7129 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7130 ); 7131 7132 } 7133 7134 /*---------------------------------------------------------------------------- 7135 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7136 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7137 | exception is raised if either operand is a NaN. The comparison is performed 7138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7139 *----------------------------------------------------------------------------*/ 7140 7141 int float128_le(float128 a, float128 b, float_status *status) 7142 { 7143 flag aSign, bSign; 7144 7145 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7146 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7147 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7148 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7149 ) { 7150 float_raise(float_flag_invalid, status); 7151 return 0; 7152 } 7153 aSign = extractFloat128Sign( a ); 7154 bSign = extractFloat128Sign( b ); 7155 if ( aSign != bSign ) { 7156 return 7157 aSign 7158 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7159 == 0 ); 7160 } 7161 return 7162 aSign ? le128( b.high, b.low, a.high, a.low ) 7163 : le128( a.high, a.low, b.high, b.low ); 7164 7165 } 7166 7167 /*---------------------------------------------------------------------------- 7168 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7169 | the corresponding value `b', and 0 otherwise. The invalid exception is 7170 | raised if either operand is a NaN. The comparison is performed according 7171 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7172 *----------------------------------------------------------------------------*/ 7173 7174 int float128_lt(float128 a, float128 b, float_status *status) 7175 { 7176 flag aSign, bSign; 7177 7178 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7179 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7180 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7181 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7182 ) { 7183 float_raise(float_flag_invalid, status); 7184 return 0; 7185 } 7186 aSign = extractFloat128Sign( a ); 7187 bSign = extractFloat128Sign( b ); 7188 if ( aSign != bSign ) { 7189 return 7190 aSign 7191 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7192 != 0 ); 7193 } 7194 return 7195 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7196 : lt128( a.high, a.low, b.high, b.low ); 7197 7198 } 7199 7200 /*---------------------------------------------------------------------------- 7201 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7202 | be compared, and 0 otherwise. The invalid exception is raised if either 7203 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7204 | Standard for Binary Floating-Point Arithmetic. 7205 *----------------------------------------------------------------------------*/ 7206 7207 int float128_unordered(float128 a, float128 b, float_status *status) 7208 { 7209 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7210 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7211 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7212 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7213 ) { 7214 float_raise(float_flag_invalid, status); 7215 return 1; 7216 } 7217 return 0; 7218 } 7219 7220 /*---------------------------------------------------------------------------- 7221 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7222 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7223 | exception. The comparison is performed according to the IEC/IEEE Standard 7224 | for Binary Floating-Point Arithmetic. 
7225 *----------------------------------------------------------------------------*/ 7226 7227 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7228 { 7229 7230 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7231 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7232 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7233 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7234 ) { 7235 if (float128_is_signaling_nan(a, status) 7236 || float128_is_signaling_nan(b, status)) { 7237 float_raise(float_flag_invalid, status); 7238 } 7239 return 0; 7240 } 7241 return 7242 ( a.low == b.low ) 7243 && ( ( a.high == b.high ) 7244 || ( ( a.low == 0 ) 7245 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7246 ); 7247 7248 } 7249 7250 /*---------------------------------------------------------------------------- 7251 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7252 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7253 | cause an exception. Otherwise, the comparison is performed according to the 7254 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7255 *----------------------------------------------------------------------------*/ 7256 7257 int float128_le_quiet(float128 a, float128 b, float_status *status) 7258 { 7259 flag aSign, bSign; 7260 7261 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7262 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7263 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7264 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7265 ) { 7266 if (float128_is_signaling_nan(a, status) 7267 || float128_is_signaling_nan(b, status)) { 7268 float_raise(float_flag_invalid, status); 7269 } 7270 return 0; 7271 } 7272 aSign = extractFloat128Sign( a ); 7273 bSign = extractFloat128Sign( b ); 7274 if ( aSign != bSign ) { 7275 return 7276 aSign 7277 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7278 == 0 ); 7279 } 7280 return 7281 aSign ? le128( b.high, b.low, a.high, a.low ) 7282 : le128( a.high, a.low, b.high, b.low ); 7283 7284 } 7285 7286 /*---------------------------------------------------------------------------- 7287 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7288 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7289 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7290 | Standard for Binary Floating-Point Arithmetic. 
7291 *----------------------------------------------------------------------------*/ 7292 7293 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7294 { 7295 flag aSign, bSign; 7296 7297 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7298 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7299 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7300 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7301 ) { 7302 if (float128_is_signaling_nan(a, status) 7303 || float128_is_signaling_nan(b, status)) { 7304 float_raise(float_flag_invalid, status); 7305 } 7306 return 0; 7307 } 7308 aSign = extractFloat128Sign( a ); 7309 bSign = extractFloat128Sign( b ); 7310 if ( aSign != bSign ) { 7311 return 7312 aSign 7313 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7314 != 0 ); 7315 } 7316 return 7317 aSign ? lt128( b.high, b.low, a.high, a.low ) 7318 : lt128( a.high, a.low, b.high, b.low ); 7319 7320 } 7321 7322 /*---------------------------------------------------------------------------- 7323 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7324 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7325 | comparison is performed according to the IEC/IEEE Standard for Binary 7326 | Floating-Point Arithmetic. 
7327 *----------------------------------------------------------------------------*/ 7328 7329 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7330 { 7331 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7332 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7333 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7334 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7335 ) { 7336 if (float128_is_signaling_nan(a, status) 7337 || float128_is_signaling_nan(b, status)) { 7338 float_raise(float_flag_invalid, status); 7339 } 7340 return 1; 7341 } 7342 return 0; 7343 } 7344 7345 /* misc functions */ 7346 float32 uint32_to_float32(uint32_t a, float_status *status) 7347 { 7348 return int64_to_float32(a, status); 7349 } 7350 7351 float64 uint32_to_float64(uint32_t a, float_status *status) 7352 { 7353 return int64_to_float64(a, status); 7354 } 7355 7356 uint32_t float32_to_uint32(float32 a, float_status *status) 7357 { 7358 int64_t v; 7359 uint32_t res; 7360 int old_exc_flags = get_float_exception_flags(status); 7361 7362 v = float32_to_int64(a, status); 7363 if (v < 0) { 7364 res = 0; 7365 } else if (v > 0xffffffff) { 7366 res = 0xffffffff; 7367 } else { 7368 return v; 7369 } 7370 set_float_exception_flags(old_exc_flags, status); 7371 float_raise(float_flag_invalid, status); 7372 return res; 7373 } 7374 7375 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7376 { 7377 int64_t v; 7378 uint32_t res; 7379 int old_exc_flags = get_float_exception_flags(status); 7380 7381 v = float32_to_int64_round_to_zero(a, status); 7382 if (v < 0) { 7383 res = 0; 7384 } else if (v > 0xffffffff) { 7385 res = 0xffffffff; 7386 } else { 7387 return v; 7388 } 7389 set_float_exception_flags(old_exc_flags, status); 7390 float_raise(float_flag_invalid, status); 7391 return res; 7392 } 7393 7394 int16_t float32_to_int16(float32 a, float_status *status) 7395 { 7396 int32_t v; 7397 int16_t res; 7398 int old_exc_flags = 
get_float_exception_flags(status); 7399 7400 v = float32_to_int32(a, status); 7401 if (v < -0x8000) { 7402 res = -0x8000; 7403 } else if (v > 0x7fff) { 7404 res = 0x7fff; 7405 } else { 7406 return v; 7407 } 7408 7409 set_float_exception_flags(old_exc_flags, status); 7410 float_raise(float_flag_invalid, status); 7411 return res; 7412 } 7413 7414 uint16_t float32_to_uint16(float32 a, float_status *status) 7415 { 7416 int32_t v; 7417 uint16_t res; 7418 int old_exc_flags = get_float_exception_flags(status); 7419 7420 v = float32_to_int32(a, status); 7421 if (v < 0) { 7422 res = 0; 7423 } else if (v > 0xffff) { 7424 res = 0xffff; 7425 } else { 7426 return v; 7427 } 7428 7429 set_float_exception_flags(old_exc_flags, status); 7430 float_raise(float_flag_invalid, status); 7431 return res; 7432 } 7433 7434 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7435 { 7436 int64_t v; 7437 uint16_t res; 7438 int old_exc_flags = get_float_exception_flags(status); 7439 7440 v = float32_to_int64_round_to_zero(a, status); 7441 if (v < 0) { 7442 res = 0; 7443 } else if (v > 0xffff) { 7444 res = 0xffff; 7445 } else { 7446 return v; 7447 } 7448 set_float_exception_flags(old_exc_flags, status); 7449 float_raise(float_flag_invalid, status); 7450 return res; 7451 } 7452 7453 uint32_t float64_to_uint32(float64 a, float_status *status) 7454 { 7455 uint64_t v; 7456 uint32_t res; 7457 int old_exc_flags = get_float_exception_flags(status); 7458 7459 v = float64_to_uint64(a, status); 7460 if (v > 0xffffffff) { 7461 res = 0xffffffff; 7462 } else { 7463 return v; 7464 } 7465 set_float_exception_flags(old_exc_flags, status); 7466 float_raise(float_flag_invalid, status); 7467 return res; 7468 } 7469 7470 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7471 { 7472 uint64_t v; 7473 uint32_t res; 7474 int old_exc_flags = get_float_exception_flags(status); 7475 7476 v = float64_to_uint64_round_to_zero(a, status); 7477 if (v > 0xffffffff) { 7478 res = 
0xffffffff; 7479 } else { 7480 return v; 7481 } 7482 set_float_exception_flags(old_exc_flags, status); 7483 float_raise(float_flag_invalid, status); 7484 return res; 7485 } 7486 7487 int16_t float64_to_int16(float64 a, float_status *status) 7488 { 7489 int64_t v; 7490 int16_t res; 7491 int old_exc_flags = get_float_exception_flags(status); 7492 7493 v = float64_to_int32(a, status); 7494 if (v < -0x8000) { 7495 res = -0x8000; 7496 } else if (v > 0x7fff) { 7497 res = 0x7fff; 7498 } else { 7499 return v; 7500 } 7501 7502 set_float_exception_flags(old_exc_flags, status); 7503 float_raise(float_flag_invalid, status); 7504 return res; 7505 } 7506 7507 uint16_t float64_to_uint16(float64 a, float_status *status) 7508 { 7509 int64_t v; 7510 uint16_t res; 7511 int old_exc_flags = get_float_exception_flags(status); 7512 7513 v = float64_to_int32(a, status); 7514 if (v < 0) { 7515 res = 0; 7516 } else if (v > 0xffff) { 7517 res = 0xffff; 7518 } else { 7519 return v; 7520 } 7521 7522 set_float_exception_flags(old_exc_flags, status); 7523 float_raise(float_flag_invalid, status); 7524 return res; 7525 } 7526 7527 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7528 { 7529 int64_t v; 7530 uint16_t res; 7531 int old_exc_flags = get_float_exception_flags(status); 7532 7533 v = float64_to_int64_round_to_zero(a, status); 7534 if (v < 0) { 7535 res = 0; 7536 } else if (v > 0xffff) { 7537 res = 0xffff; 7538 } else { 7539 return v; 7540 } 7541 set_float_exception_flags(old_exc_flags, status); 7542 float_raise(float_flag_invalid, status); 7543 return res; 7544 } 7545 7546 /*---------------------------------------------------------------------------- 7547 | Returns the result of converting the double-precision floating-point value 7548 | `a' to the 64-bit unsigned integer format. 
The conversion is 7549 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7550 | Arithmetic---which means in particular that the conversion is rounded 7551 | according to the current rounding mode. If `a' is a NaN, the largest 7552 | positive integer is returned. If the conversion overflows, the 7553 | largest unsigned integer is returned. If 'a' is negative, the value is 7554 | rounded and zero is returned; negative values that do not round to zero 7555 | will raise the inexact exception. 7556 *----------------------------------------------------------------------------*/ 7557 7558 uint64_t float64_to_uint64(float64 a, float_status *status) 7559 { 7560 flag aSign; 7561 int aExp; 7562 int shiftCount; 7563 uint64_t aSig, aSigExtra; 7564 a = float64_squash_input_denormal(a, status); 7565 7566 aSig = extractFloat64Frac(a); 7567 aExp = extractFloat64Exp(a); 7568 aSign = extractFloat64Sign(a); 7569 if (aSign && (aExp > 1022)) { 7570 float_raise(float_flag_invalid, status); 7571 if (float64_is_any_nan(a)) { 7572 return LIT64(0xFFFFFFFFFFFFFFFF); 7573 } else { 7574 return 0; 7575 } 7576 } 7577 if (aExp) { 7578 aSig |= LIT64(0x0010000000000000); 7579 } 7580 shiftCount = 0x433 - aExp; 7581 if (shiftCount <= 0) { 7582 if (0x43E < aExp) { 7583 float_raise(float_flag_invalid, status); 7584 return LIT64(0xFFFFFFFFFFFFFFFF); 7585 } 7586 aSigExtra = 0; 7587 aSig <<= -shiftCount; 7588 } else { 7589 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7590 } 7591 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7592 } 7593 7594 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7595 { 7596 signed char current_rounding_mode = status->float_rounding_mode; 7597 set_float_rounding_mode(float_round_to_zero, status); 7598 uint64_t v = float64_to_uint64(a, status); 7599 set_float_rounding_mode(current_rounding_mode, status); 7600 return v; 7601 } 7602 7603 #define COMPARE(s, nan_exp) \ 7604 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7605 int is_quiet, float_status *status) \ 7606 { \ 7607 flag aSign, bSign; \ 7608 uint ## s ## _t av, bv; \ 7609 a = float ## s ## _squash_input_denormal(a, status); \ 7610 b = float ## s ## _squash_input_denormal(b, status); \ 7611 \ 7612 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7613 extractFloat ## s ## Frac( a ) ) || \ 7614 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7615 extractFloat ## s ## Frac( b ) )) { \ 7616 if (!is_quiet || \ 7617 float ## s ## _is_signaling_nan(a, status) || \ 7618 float ## s ## _is_signaling_nan(b, status)) { \ 7619 float_raise(float_flag_invalid, status); \ 7620 } \ 7621 return float_relation_unordered; \ 7622 } \ 7623 aSign = extractFloat ## s ## Sign( a ); \ 7624 bSign = extractFloat ## s ## Sign( b ); \ 7625 av = float ## s ## _val(a); \ 7626 bv = float ## s ## _val(b); \ 7627 if ( aSign != bSign ) { \ 7628 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7629 /* zero case */ \ 7630 return float_relation_equal; \ 7631 } else { \ 7632 return 1 - (2 * aSign); \ 7633 } \ 7634 } else { \ 7635 if (av == bv) { \ 7636 return float_relation_equal; \ 7637 } else { \ 7638 return 1 - 2 * (aSign ^ ( av < bv )); \ 7639 } \ 7640 } \ 7641 } \ 7642 \ 7643 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7644 { \ 7645 return float ## s ## _compare_internal(a, b, 0, status); \ 7646 } \ 7647 \ 7648 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7649 float_status *status) \ 7650 { \ 7651 return float ## s ## _compare_internal(a, b, 1, status); \ 7652 } 7653 7654 COMPARE(32, 0xff) 7655 COMPARE(64, 0x7ff) 7656 7657 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7658 int is_quiet, float_status *status) 7659 { 7660 flag aSign, bSign; 7661 7662 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7663 float_raise(float_flag_invalid, status); 7664 return float_relation_unordered; 7665 } 7666 if (( ( 
extractFloatx80Exp( a ) == 0x7fff ) && 7667 ( extractFloatx80Frac( a )<<1 ) ) || 7668 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7669 ( extractFloatx80Frac( b )<<1 ) )) { 7670 if (!is_quiet || 7671 floatx80_is_signaling_nan(a, status) || 7672 floatx80_is_signaling_nan(b, status)) { 7673 float_raise(float_flag_invalid, status); 7674 } 7675 return float_relation_unordered; 7676 } 7677 aSign = extractFloatx80Sign( a ); 7678 bSign = extractFloatx80Sign( b ); 7679 if ( aSign != bSign ) { 7680 7681 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7682 ( ( a.low | b.low ) == 0 ) ) { 7683 /* zero case */ 7684 return float_relation_equal; 7685 } else { 7686 return 1 - (2 * aSign); 7687 } 7688 } else { 7689 if (a.low == b.low && a.high == b.high) { 7690 return float_relation_equal; 7691 } else { 7692 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7693 } 7694 } 7695 } 7696 7697 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7698 { 7699 return floatx80_compare_internal(a, b, 0, status); 7700 } 7701 7702 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7703 { 7704 return floatx80_compare_internal(a, b, 1, status); 7705 } 7706 7707 static inline int float128_compare_internal(float128 a, float128 b, 7708 int is_quiet, float_status *status) 7709 { 7710 flag aSign, bSign; 7711 7712 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7713 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7714 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7715 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7716 if (!is_quiet || 7717 float128_is_signaling_nan(a, status) || 7718 float128_is_signaling_nan(b, status)) { 7719 float_raise(float_flag_invalid, status); 7720 } 7721 return float_relation_unordered; 7722 } 7723 aSign = extractFloat128Sign( a ); 7724 bSign = extractFloat128Sign( b ); 7725 if ( aSign != bSign ) { 7726 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7727 /* zero case */ 
7728 return float_relation_equal; 7729 } else { 7730 return 1 - (2 * aSign); 7731 } 7732 } else { 7733 if (a.low == b.low && a.high == b.high) { 7734 return float_relation_equal; 7735 } else { 7736 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7737 } 7738 } 7739 } 7740 7741 int float128_compare(float128 a, float128 b, float_status *status) 7742 { 7743 return float128_compare_internal(a, b, 0, status); 7744 } 7745 7746 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7747 { 7748 return float128_compare_internal(a, b, 1, status); 7749 } 7750 7751 /* min() and max() functions. These can't be implemented as 7752 * 'compare and pick one input' because that would mishandle 7753 * NaNs and +0 vs -0. 7754 * 7755 * minnum() and maxnum() functions. These are similar to the min() 7756 * and max() functions but if one of the arguments is a QNaN and 7757 * the other is numerical then the numerical argument is returned. 7758 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7759 * and maxNum() operations. min() and max() are the typical min/max 7760 * semantics provided by many CPUs which predate that specification. 7761 * 7762 * minnummag() and maxnummag() functions correspond to minNumMag() 7763 * and minNumMag() from the IEEE-754 2008. 
7764 */ 7765 #define MINMAX(s) \ 7766 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7767 int ismin, int isieee, \ 7768 int ismag, \ 7769 float_status *status) \ 7770 { \ 7771 flag aSign, bSign; \ 7772 uint ## s ## _t av, bv, aav, abv; \ 7773 a = float ## s ## _squash_input_denormal(a, status); \ 7774 b = float ## s ## _squash_input_denormal(b, status); \ 7775 if (float ## s ## _is_any_nan(a) || \ 7776 float ## s ## _is_any_nan(b)) { \ 7777 if (isieee) { \ 7778 if (float ## s ## _is_quiet_nan(a, status) && \ 7779 !float ## s ##_is_any_nan(b)) { \ 7780 return b; \ 7781 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7782 !float ## s ## _is_any_nan(a)) { \ 7783 return a; \ 7784 } \ 7785 } \ 7786 return propagateFloat ## s ## NaN(a, b, status); \ 7787 } \ 7788 aSign = extractFloat ## s ## Sign(a); \ 7789 bSign = extractFloat ## s ## Sign(b); \ 7790 av = float ## s ## _val(a); \ 7791 bv = float ## s ## _val(b); \ 7792 if (ismag) { \ 7793 aav = float ## s ## _abs(av); \ 7794 abv = float ## s ## _abs(bv); \ 7795 if (aav != abv) { \ 7796 if (ismin) { \ 7797 return (aav < abv) ? a : b; \ 7798 } else { \ 7799 return (aav < abv) ? b : a; \ 7800 } \ 7801 } \ 7802 } \ 7803 if (aSign != bSign) { \ 7804 if (ismin) { \ 7805 return aSign ? a : b; \ 7806 } else { \ 7807 return aSign ? b : a; \ 7808 } \ 7809 } else { \ 7810 if (ismin) { \ 7811 return (aSign ^ (av < bv)) ? a : b; \ 7812 } else { \ 7813 return (aSign ^ (av < bv)) ? 
b : a; \ 7814 } \ 7815 } \ 7816 } \ 7817 \ 7818 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7819 float_status *status) \ 7820 { \ 7821 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7822 } \ 7823 \ 7824 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7825 float_status *status) \ 7826 { \ 7827 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7828 } \ 7829 \ 7830 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7831 float_status *status) \ 7832 { \ 7833 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7834 } \ 7835 \ 7836 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7837 float_status *status) \ 7838 { \ 7839 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7840 } \ 7841 \ 7842 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7843 float_status *status) \ 7844 { \ 7845 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7846 } \ 7847 \ 7848 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7849 float_status *status) \ 7850 { \ 7851 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7852 } 7853 7854 MINMAX(32) 7855 MINMAX(64) 7856 7857 7858 /* Multiply A by 2 raised to the power N. 
*/ 7859 float32 float32_scalbn(float32 a, int n, float_status *status) 7860 { 7861 flag aSign; 7862 int16_t aExp; 7863 uint32_t aSig; 7864 7865 a = float32_squash_input_denormal(a, status); 7866 aSig = extractFloat32Frac( a ); 7867 aExp = extractFloat32Exp( a ); 7868 aSign = extractFloat32Sign( a ); 7869 7870 if ( aExp == 0xFF ) { 7871 if ( aSig ) { 7872 return propagateFloat32NaN(a, a, status); 7873 } 7874 return a; 7875 } 7876 if (aExp != 0) { 7877 aSig |= 0x00800000; 7878 } else if (aSig == 0) { 7879 return a; 7880 } else { 7881 aExp++; 7882 } 7883 7884 if (n > 0x200) { 7885 n = 0x200; 7886 } else if (n < -0x200) { 7887 n = -0x200; 7888 } 7889 7890 aExp += n - 1; 7891 aSig <<= 7; 7892 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7893 } 7894 7895 float64 float64_scalbn(float64 a, int n, float_status *status) 7896 { 7897 flag aSign; 7898 int16_t aExp; 7899 uint64_t aSig; 7900 7901 a = float64_squash_input_denormal(a, status); 7902 aSig = extractFloat64Frac( a ); 7903 aExp = extractFloat64Exp( a ); 7904 aSign = extractFloat64Sign( a ); 7905 7906 if ( aExp == 0x7FF ) { 7907 if ( aSig ) { 7908 return propagateFloat64NaN(a, a, status); 7909 } 7910 return a; 7911 } 7912 if (aExp != 0) { 7913 aSig |= LIT64( 0x0010000000000000 ); 7914 } else if (aSig == 0) { 7915 return a; 7916 } else { 7917 aExp++; 7918 } 7919 7920 if (n > 0x1000) { 7921 n = 0x1000; 7922 } else if (n < -0x1000) { 7923 n = -0x1000; 7924 } 7925 7926 aExp += n - 1; 7927 aSig <<= 10; 7928 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7929 } 7930 7931 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7932 { 7933 flag aSign; 7934 int32_t aExp; 7935 uint64_t aSig; 7936 7937 if (floatx80_invalid_encoding(a)) { 7938 float_raise(float_flag_invalid, status); 7939 return floatx80_default_nan(status); 7940 } 7941 aSig = extractFloatx80Frac( a ); 7942 aExp = extractFloatx80Exp( a ); 7943 aSign = extractFloatx80Sign( a ); 7944 7945 if ( aExp == 0x7FFF ) { 7946 if ( 
aSig<<1 ) { 7947 return propagateFloatx80NaN(a, a, status); 7948 } 7949 return a; 7950 } 7951 7952 if (aExp == 0) { 7953 if (aSig == 0) { 7954 return a; 7955 } 7956 aExp++; 7957 } 7958 7959 if (n > 0x10000) { 7960 n = 0x10000; 7961 } else if (n < -0x10000) { 7962 n = -0x10000; 7963 } 7964 7965 aExp += n; 7966 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7967 aSign, aExp, aSig, 0, status); 7968 } 7969 7970 float128 float128_scalbn(float128 a, int n, float_status *status) 7971 { 7972 flag aSign; 7973 int32_t aExp; 7974 uint64_t aSig0, aSig1; 7975 7976 aSig1 = extractFloat128Frac1( a ); 7977 aSig0 = extractFloat128Frac0( a ); 7978 aExp = extractFloat128Exp( a ); 7979 aSign = extractFloat128Sign( a ); 7980 if ( aExp == 0x7FFF ) { 7981 if ( aSig0 | aSig1 ) { 7982 return propagateFloat128NaN(a, a, status); 7983 } 7984 return a; 7985 } 7986 if (aExp != 0) { 7987 aSig0 |= LIT64( 0x0001000000000000 ); 7988 } else if (aSig0 == 0 && aSig1 == 0) { 7989 return a; 7990 } else { 7991 aExp++; 7992 } 7993 7994 if (n > 0x10000) { 7995 n = 0x10000; 7996 } else if (n < -0x10000) { 7997 n = -0x10000; 7998 } 7999 8000 aExp += n - 1; 8001 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 8002 , status); 8003 8004 } 8005