1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 137 | and 7, and returns the properly rounded 32-bit integer corresponding to the 138 | input. If `zSign' is 1, the input is negated before being converted to an 139 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 140 | is simply rounded to an integer, with the inexact exception raised if the 141 | input cannot be represented exactly as an integer. However, if the fixed- 142 | point input is too large, the invalid exception is raised and the largest 143 | positive or negative integer is returned. 
144 *----------------------------------------------------------------------------*/ 145 146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 147 { 148 int8_t roundingMode; 149 flag roundNearestEven; 150 int8_t roundIncrement, roundBits; 151 int32_t z; 152 153 roundingMode = status->float_rounding_mode; 154 roundNearestEven = ( roundingMode == float_round_nearest_even ); 155 switch (roundingMode) { 156 case float_round_nearest_even: 157 case float_round_ties_away: 158 roundIncrement = 0x40; 159 break; 160 case float_round_to_zero: 161 roundIncrement = 0; 162 break; 163 case float_round_up: 164 roundIncrement = zSign ? 0 : 0x7f; 165 break; 166 case float_round_down: 167 roundIncrement = zSign ? 0x7f : 0; 168 break; 169 default: 170 abort(); 171 } 172 roundBits = absZ & 0x7F; 173 absZ = ( absZ + roundIncrement )>>7; 174 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 175 z = absZ; 176 if ( zSign ) z = - z; 177 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 178 float_raise(float_flag_invalid, status); 179 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 180 } 181 if (roundBits) { 182 status->float_exception_flags |= float_flag_inexact; 183 } 184 return z; 185 186 } 187 188 /*---------------------------------------------------------------------------- 189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 190 | `absZ1', with binary point between bits 63 and 64 (between the input words), 191 | and returns the properly rounded 64-bit integer corresponding to the input. 192 | If `zSign' is 1, the input is negated before being converted to an integer. 193 | Ordinarily, the fixed-point input is simply rounded to an integer, with 194 | the inexact exception raised if the input cannot be represented exactly as 195 | an integer. However, if the fixed-point input is too large, the invalid 196 | exception is raised and the largest positive or negative integer is 197 | returned. 
198 *----------------------------------------------------------------------------*/ 199 200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 201 float_status *status) 202 { 203 int8_t roundingMode; 204 flag roundNearestEven, increment; 205 int64_t z; 206 207 roundingMode = status->float_rounding_mode; 208 roundNearestEven = ( roundingMode == float_round_nearest_even ); 209 switch (roundingMode) { 210 case float_round_nearest_even: 211 case float_round_ties_away: 212 increment = ((int64_t) absZ1 < 0); 213 break; 214 case float_round_to_zero: 215 increment = 0; 216 break; 217 case float_round_up: 218 increment = !zSign && absZ1; 219 break; 220 case float_round_down: 221 increment = zSign && absZ1; 222 break; 223 default: 224 abort(); 225 } 226 if ( increment ) { 227 ++absZ0; 228 if ( absZ0 == 0 ) goto overflow; 229 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 230 } 231 z = absZ0; 232 if ( zSign ) z = - z; 233 if ( z && ( ( z < 0 ) ^ zSign ) ) { 234 overflow: 235 float_raise(float_flag_invalid, status); 236 return 237 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 238 : LIT64( 0x7FFFFFFFFFFFFFFF ); 239 } 240 if (absZ1) { 241 status->float_exception_flags |= float_flag_inexact; 242 } 243 return z; 244 245 } 246 247 /*---------------------------------------------------------------------------- 248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 249 | `absZ1', with binary point between bits 63 and 64 (between the input words), 250 | and returns the properly rounded 64-bit unsigned integer corresponding to the 251 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 252 | with the inexact exception raised if the input cannot be represented exactly 253 | as an integer. However, if the fixed-point input is too large, the invalid 254 | exception is raised and the largest unsigned integer is returned. 
255 *----------------------------------------------------------------------------*/ 256 257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 258 uint64_t absZ1, float_status *status) 259 { 260 int8_t roundingMode; 261 flag roundNearestEven, increment; 262 263 roundingMode = status->float_rounding_mode; 264 roundNearestEven = (roundingMode == float_round_nearest_even); 265 switch (roundingMode) { 266 case float_round_nearest_even: 267 case float_round_ties_away: 268 increment = ((int64_t)absZ1 < 0); 269 break; 270 case float_round_to_zero: 271 increment = 0; 272 break; 273 case float_round_up: 274 increment = !zSign && absZ1; 275 break; 276 case float_round_down: 277 increment = zSign && absZ1; 278 break; 279 default: 280 abort(); 281 } 282 if (increment) { 283 ++absZ0; 284 if (absZ0 == 0) { 285 float_raise(float_flag_invalid, status); 286 return LIT64(0xFFFFFFFFFFFFFFFF); 287 } 288 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 289 } 290 291 if (zSign && absZ0) { 292 float_raise(float_flag_invalid, status); 293 return 0; 294 } 295 296 if (absZ1) { 297 status->float_exception_flags |= float_flag_inexact; 298 } 299 return absZ0; 300 } 301 302 /*---------------------------------------------------------------------------- 303 | Returns the fraction bits of the single-precision floating-point value `a'. 304 *----------------------------------------------------------------------------*/ 305 306 static inline uint32_t extractFloat32Frac( float32 a ) 307 { 308 309 return float32_val(a) & 0x007FFFFF; 310 311 } 312 313 /*---------------------------------------------------------------------------- 314 | Returns the exponent bits of the single-precision floating-point value `a'. 
315 *----------------------------------------------------------------------------*/ 316 317 static inline int extractFloat32Exp(float32 a) 318 { 319 320 return ( float32_val(a)>>23 ) & 0xFF; 321 322 } 323 324 /*---------------------------------------------------------------------------- 325 | Returns the sign bit of the single-precision floating-point value `a'. 326 *----------------------------------------------------------------------------*/ 327 328 static inline flag extractFloat32Sign( float32 a ) 329 { 330 331 return float32_val(a)>>31; 332 333 } 334 335 /*---------------------------------------------------------------------------- 336 | If `a' is denormal and we are in flush-to-zero mode then set the 337 | input-denormal exception and return zero. Otherwise just return the value. 338 *----------------------------------------------------------------------------*/ 339 float32 float32_squash_input_denormal(float32 a, float_status *status) 340 { 341 if (status->flush_inputs_to_zero) { 342 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 343 float_raise(float_flag_input_denormal, status); 344 return make_float32(float32_val(a) & 0x80000000); 345 } 346 } 347 return a; 348 } 349 350 /*---------------------------------------------------------------------------- 351 | Normalizes the subnormal single-precision floating-point value represented 352 | by the denormalized significand `aSig'. The normalized exponent and 353 | significand are stored at the locations pointed to by `zExpPtr' and 354 | `zSigPtr', respectively. 
355 *----------------------------------------------------------------------------*/ 356 357 static void 358 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 359 { 360 int8_t shiftCount; 361 362 shiftCount = countLeadingZeros32( aSig ) - 8; 363 *zSigPtr = aSig<<shiftCount; 364 *zExpPtr = 1 - shiftCount; 365 366 } 367 368 /*---------------------------------------------------------------------------- 369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 370 | single-precision floating-point value, returning the result. After being 371 | shifted into the proper positions, the three fields are simply added 372 | together to form the result. This means that any integer portion of `zSig' 373 | will be added into the exponent. Since a properly normalized significand 374 | will have an integer portion equal to 1, the `zExp' input should be 1 less 375 | than the desired result exponent whenever `zSig' is a complete, normalized 376 | significand. 377 *----------------------------------------------------------------------------*/ 378 379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 380 { 381 382 return make_float32( 383 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 384 385 } 386 387 /*---------------------------------------------------------------------------- 388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 389 | and significand `zSig', and returns the proper single-precision floating- 390 | point value corresponding to the abstract input. Ordinarily, the abstract 391 | value is simply rounded and packed into the single-precision format, with 392 | the inexact exception raised if the abstract input cannot be represented 393 | exactly. However, if the abstract value is too large, the overflow and 394 | inexact exceptions are raised and an infinity or maximal finite value is 395 | returned. 
If the abstract value is too small, the input value is rounded to 396 | a subnormal number, and the underflow and inexact exceptions are raised if 397 | the abstract input cannot be represented exactly as a subnormal single- 398 | precision floating-point number. 399 | The input significand `zSig' has its binary point between bits 30 400 | and 29, which is 7 bits to the left of the usual location. This shifted 401 | significand must be normalized or smaller. If `zSig' is not normalized, 402 | `zExp' must be 0; in that case, the result returned is a subnormal number, 403 | and it must not require rounding. In the usual case that `zSig' is 404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 405 | The handling of underflow and overflow follows the IEC/IEEE Standard for 406 | Binary Floating-Point Arithmetic. 407 *----------------------------------------------------------------------------*/ 408 409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 410 float_status *status) 411 { 412 int8_t roundingMode; 413 flag roundNearestEven; 414 int8_t roundIncrement, roundBits; 415 flag isTiny; 416 417 roundingMode = status->float_rounding_mode; 418 roundNearestEven = ( roundingMode == float_round_nearest_even ); 419 switch (roundingMode) { 420 case float_round_nearest_even: 421 case float_round_ties_away: 422 roundIncrement = 0x40; 423 break; 424 case float_round_to_zero: 425 roundIncrement = 0; 426 break; 427 case float_round_up: 428 roundIncrement = zSign ? 0 : 0x7f; 429 break; 430 case float_round_down: 431 roundIncrement = zSign ? 
0x7f : 0; 432 break; 433 default: 434 abort(); 435 break; 436 } 437 roundBits = zSig & 0x7F; 438 if ( 0xFD <= (uint16_t) zExp ) { 439 if ( ( 0xFD < zExp ) 440 || ( ( zExp == 0xFD ) 441 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 442 ) { 443 float_raise(float_flag_overflow | float_flag_inexact, status); 444 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 445 } 446 if ( zExp < 0 ) { 447 if (status->flush_to_zero) { 448 float_raise(float_flag_output_denormal, status); 449 return packFloat32(zSign, 0, 0); 450 } 451 isTiny = 452 (status->float_detect_tininess 453 == float_tininess_before_rounding) 454 || ( zExp < -1 ) 455 || ( zSig + roundIncrement < 0x80000000 ); 456 shift32RightJamming( zSig, - zExp, &zSig ); 457 zExp = 0; 458 roundBits = zSig & 0x7F; 459 if (isTiny && roundBits) { 460 float_raise(float_flag_underflow, status); 461 } 462 } 463 } 464 if (roundBits) { 465 status->float_exception_flags |= float_flag_inexact; 466 } 467 zSig = ( zSig + roundIncrement )>>7; 468 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 469 if ( zSig == 0 ) zExp = 0; 470 return packFloat32( zSign, zExp, zSig ); 471 472 } 473 474 /*---------------------------------------------------------------------------- 475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 476 | and significand `zSig', and returns the proper single-precision floating- 477 | point value corresponding to the abstract input. This routine is just like 478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 480 | floating-point exponent. 
481 *----------------------------------------------------------------------------*/ 482 483 static float32 484 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 485 float_status *status) 486 { 487 int8_t shiftCount; 488 489 shiftCount = countLeadingZeros32( zSig ) - 1; 490 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 491 status); 492 493 } 494 495 /*---------------------------------------------------------------------------- 496 | Returns the fraction bits of the double-precision floating-point value `a'. 497 *----------------------------------------------------------------------------*/ 498 499 static inline uint64_t extractFloat64Frac( float64 a ) 500 { 501 502 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); 503 504 } 505 506 /*---------------------------------------------------------------------------- 507 | Returns the exponent bits of the double-precision floating-point value `a'. 508 *----------------------------------------------------------------------------*/ 509 510 static inline int extractFloat64Exp(float64 a) 511 { 512 513 return ( float64_val(a)>>52 ) & 0x7FF; 514 515 } 516 517 /*---------------------------------------------------------------------------- 518 | Returns the sign bit of the double-precision floating-point value `a'. 519 *----------------------------------------------------------------------------*/ 520 521 static inline flag extractFloat64Sign( float64 a ) 522 { 523 524 return float64_val(a)>>63; 525 526 } 527 528 /*---------------------------------------------------------------------------- 529 | If `a' is denormal and we are in flush-to-zero mode then set the 530 | input-denormal exception and return zero. Otherwise just return the value. 
531 *----------------------------------------------------------------------------*/ 532 float64 float64_squash_input_denormal(float64 a, float_status *status) 533 { 534 if (status->flush_inputs_to_zero) { 535 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 536 float_raise(float_flag_input_denormal, status); 537 return make_float64(float64_val(a) & (1ULL << 63)); 538 } 539 } 540 return a; 541 } 542 543 /*---------------------------------------------------------------------------- 544 | Normalizes the subnormal double-precision floating-point value represented 545 | by the denormalized significand `aSig'. The normalized exponent and 546 | significand are stored at the locations pointed to by `zExpPtr' and 547 | `zSigPtr', respectively. 548 *----------------------------------------------------------------------------*/ 549 550 static void 551 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 552 { 553 int8_t shiftCount; 554 555 shiftCount = countLeadingZeros64( aSig ) - 11; 556 *zSigPtr = aSig<<shiftCount; 557 *zExpPtr = 1 - shiftCount; 558 559 } 560 561 /*---------------------------------------------------------------------------- 562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 563 | double-precision floating-point value, returning the result. After being 564 | shifted into the proper positions, the three fields are simply added 565 | together to form the result. This means that any integer portion of `zSig' 566 | will be added into the exponent. Since a properly normalized significand 567 | will have an integer portion equal to 1, the `zExp' input should be 1 less 568 | than the desired result exponent whenever `zSig' is a complete, normalized 569 | significand. 
570 *----------------------------------------------------------------------------*/ 571 572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 573 { 574 575 return make_float64( 576 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 577 578 } 579 580 /*---------------------------------------------------------------------------- 581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 582 | and significand `zSig', and returns the proper double-precision floating- 583 | point value corresponding to the abstract input. Ordinarily, the abstract 584 | value is simply rounded and packed into the double-precision format, with 585 | the inexact exception raised if the abstract input cannot be represented 586 | exactly. However, if the abstract value is too large, the overflow and 587 | inexact exceptions are raised and an infinity or maximal finite value is 588 | returned. If the abstract value is too small, the input value is rounded to 589 | a subnormal number, and the underflow and inexact exceptions are raised if 590 | the abstract input cannot be represented exactly as a subnormal double- 591 | precision floating-point number. 592 | The input significand `zSig' has its binary point between bits 62 593 | and 61, which is 10 bits to the left of the usual location. This shifted 594 | significand must be normalized or smaller. If `zSig' is not normalized, 595 | `zExp' must be 0; in that case, the result returned is a subnormal number, 596 | and it must not require rounding. In the usual case that `zSig' is 597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 598 | The handling of underflow and overflow follows the IEC/IEEE Standard for 599 | Binary Floating-Point Arithmetic. 
600 *----------------------------------------------------------------------------*/ 601 602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 603 float_status *status) 604 { 605 int8_t roundingMode; 606 flag roundNearestEven; 607 int roundIncrement, roundBits; 608 flag isTiny; 609 610 roundingMode = status->float_rounding_mode; 611 roundNearestEven = ( roundingMode == float_round_nearest_even ); 612 switch (roundingMode) { 613 case float_round_nearest_even: 614 case float_round_ties_away: 615 roundIncrement = 0x200; 616 break; 617 case float_round_to_zero: 618 roundIncrement = 0; 619 break; 620 case float_round_up: 621 roundIncrement = zSign ? 0 : 0x3ff; 622 break; 623 case float_round_down: 624 roundIncrement = zSign ? 0x3ff : 0; 625 break; 626 case float_round_to_odd: 627 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 628 break; 629 default: 630 abort(); 631 } 632 roundBits = zSig & 0x3FF; 633 if ( 0x7FD <= (uint16_t) zExp ) { 634 if ( ( 0x7FD < zExp ) 635 || ( ( zExp == 0x7FD ) 636 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 637 ) { 638 bool overflow_to_inf = roundingMode != float_round_to_odd && 639 roundIncrement != 0; 640 float_raise(float_flag_overflow | float_flag_inexact, status); 641 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 642 } 643 if ( zExp < 0 ) { 644 if (status->flush_to_zero) { 645 float_raise(float_flag_output_denormal, status); 646 return packFloat64(zSign, 0, 0); 647 } 648 isTiny = 649 (status->float_detect_tininess 650 == float_tininess_before_rounding) 651 || ( zExp < -1 ) 652 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 653 shift64RightJamming( zSig, - zExp, &zSig ); 654 zExp = 0; 655 roundBits = zSig & 0x3FF; 656 if (isTiny && roundBits) { 657 float_raise(float_flag_underflow, status); 658 } 659 if (roundingMode == float_round_to_odd) { 660 /* 661 * For round-to-odd case, the roundIncrement depends on 662 * zSig which just changed. 663 */ 664 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 665 } 666 } 667 } 668 if (roundBits) { 669 status->float_exception_flags |= float_flag_inexact; 670 } 671 zSig = ( zSig + roundIncrement )>>10; 672 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 673 if ( zSig == 0 ) zExp = 0; 674 return packFloat64( zSign, zExp, zSig ); 675 676 } 677 678 /*---------------------------------------------------------------------------- 679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 680 | and significand `zSig', and returns the proper double-precision floating- 681 | point value corresponding to the abstract input. This routine is just like 682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 684 | floating-point exponent. 685 *----------------------------------------------------------------------------*/ 686 687 static float64 688 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 689 float_status *status) 690 { 691 int8_t shiftCount; 692 693 shiftCount = countLeadingZeros64( zSig ) - 1; 694 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 695 status); 696 697 } 698 699 /*---------------------------------------------------------------------------- 700 | Returns the fraction bits of the extended double-precision floating-point 701 | value `a'. 702 *----------------------------------------------------------------------------*/ 703 704 static inline uint64_t extractFloatx80Frac( floatx80 a ) 705 { 706 707 return a.low; 708 709 } 710 711 /*---------------------------------------------------------------------------- 712 | Returns the exponent bits of the extended double-precision floating-point 713 | value `a'. 
714 *----------------------------------------------------------------------------*/ 715 716 static inline int32_t extractFloatx80Exp( floatx80 a ) 717 { 718 719 return a.high & 0x7FFF; 720 721 } 722 723 /*---------------------------------------------------------------------------- 724 | Returns the sign bit of the extended double-precision floating-point value 725 | `a'. 726 *----------------------------------------------------------------------------*/ 727 728 static inline flag extractFloatx80Sign( floatx80 a ) 729 { 730 731 return a.high>>15; 732 733 } 734 735 /*---------------------------------------------------------------------------- 736 | Normalizes the subnormal extended double-precision floating-point value 737 | represented by the denormalized significand `aSig'. The normalized exponent 738 | and significand are stored at the locations pointed to by `zExpPtr' and 739 | `zSigPtr', respectively. 740 *----------------------------------------------------------------------------*/ 741 742 static void 743 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 744 { 745 int8_t shiftCount; 746 747 shiftCount = countLeadingZeros64( aSig ); 748 *zSigPtr = aSig<<shiftCount; 749 *zExpPtr = 1 - shiftCount; 750 751 } 752 753 /*---------------------------------------------------------------------------- 754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 755 | extended double-precision floating-point value, returning the result. 
756 *----------------------------------------------------------------------------*/ 757 758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 759 { 760 floatx80 z; 761 762 z.low = zSig; 763 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 764 return z; 765 766 } 767 768 /*---------------------------------------------------------------------------- 769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 770 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 771 | and returns the proper extended double-precision floating-point value 772 | corresponding to the abstract input. Ordinarily, the abstract value is 773 | rounded and packed into the extended double-precision format, with the 774 | inexact exception raised if the abstract input cannot be represented 775 | exactly. However, if the abstract value is too large, the overflow and 776 | inexact exceptions are raised and an infinity or maximal finite value is 777 | returned. If the abstract value is too small, the input value is rounded to 778 | a subnormal number, and the underflow and inexact exceptions are raised if 779 | the abstract input cannot be represented exactly as a subnormal extended 780 | double-precision floating-point number. 781 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 782 | number of bits as single or double precision, respectively. Otherwise, the 783 | result is rounded to the full precision of the extended double-precision 784 | format. 785 | The input significand must be normalized or smaller. If the input 786 | significand is not normalized, `zExp' must be 0; in that case, the result 787 | returned is a subnormal number, and it must not require rounding. The 788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 789 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    /* Used only on the reduced-precision (32/64) path:
     *   roundMask      - the low bits of zSig0 that are discarded,
     *   roundIncrement - value added to zSig0 before the discarded bits are
     *                    cleared,
     *   roundBits      - the discarded bits of the current zSig0. */
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Discard the low 11 bits of zSig0; 0x400 is half of the lowest
         * retained bit. */
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        /* Discard the low 40 bits of zSig0; bit 39 is half of the lowest
         * retained bit. */
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other precision value is treated like 80. */
        goto precision80;
    }
    /* Fold zSig1 into bit 0 of zSig0 as a sticky bit; zSig1 is not consulted
     * again on this path. */
    zSig0 |= ( zSig1 != 0 );
    /* The nearest modes keep the half-unit increment chosen above; directed
     * modes use either 0 or the whole mask.  Modes not listed here (for
     * example float_round_to_odd, which roundAndPackFloat128 accepts) reach
     * abort(). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned comparison is true both for zExp >= 0x7FFE and for
     * zExp <= 0 (where zExp - 1 is negative and converts to a large
     * unsigned value). */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* roundMask keeps its reduced-precision value so the shared
             * overflow code can form ~roundMask as the significand. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny when detection is before rounding, when zExp < 0, or
             * when adding the increment does not wrap zSig0. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            /* Recompute the discarded bits after the shift. */
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* If the addition set bit 63, the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* roundIncrement is reused as the lowest retained bit.  On an
             * exact tie in nearest-even mode that bit is added to the mask
             * so that it is cleared as well. */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped past bit 63: bump the exponent and set the
     * significand to just its top bit. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie handling as in the subnormal branch above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full-precision path: zSig1 holds the bits below the 64-bit
     * significand.  For the nearest modes the increment is the top bit of
     * zSig1; the directed modes increment on any non-zero zSig1 when
     * rounding away from zero for this sign. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* As above: true for zExp >= 0x7FFE and for zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With a zero mask, ~roundMask below is an all-ones
             * significand. */
            roundMask = 0;
            /* Also entered by goto from the reduced-precision path, with
             * roundMask set by that path. */
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign return exponent
             * 0x7FFE with significand ~roundMask; the others return exponent
             * 0x7FFF with only the top significand bit set. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* NOTE(review): unlike the reduced-precision path above, this
             * branch does not test status->flush_to_zero - confirm that is
             * intended. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* Re-derive the increment from the shifted zSig1. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Clear bit 0 when every bit of zSig1 below its top bit is
                 * zero and the mode is nearest-even (exact tie). */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* If bit 63 is now set, the exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped the significand: bump the exponent and
             * set only the top bit. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear bit 0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                              flag zSign, int32_t zExp,
                                              uint64_t zSig0, uint64_t zSig1,
                                              float_status *status)
{
    int8_t shiftCount;

    /* If the upper word is empty, move the lower word up by 64 bits and
     * adjust the exponent to match. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift the 128-bit significand left by the leading-zero count of
     * zSig0, lowering the exponent by the same amount.
     * NOTE(review): the behaviour when zSig0 and zSig1 are both zero depends
     * on countLeadingZeros64 and shortShift128Left, defined elsewhere -
     * confirm callers never pass an all-zero significand. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
*----------------------------------------------------------------------------*/

static inline uint64_t extractFloat128Frac1( float128 a )
{

    /* The low fraction bits are the whole of the `low' member. */
    return a.low;

}

/*----------------------------------------------------------------------------
| Returns the most-significant 48 fraction bits of the quadruple-precision
| floating-point value `a'.
1023 *----------------------------------------------------------------------------*/ 1024 1025 static inline uint64_t extractFloat128Frac0( float128 a ) 1026 { 1027 1028 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1029 1030 } 1031 1032 /*---------------------------------------------------------------------------- 1033 | Returns the exponent bits of the quadruple-precision floating-point value 1034 | `a'. 1035 *----------------------------------------------------------------------------*/ 1036 1037 static inline int32_t extractFloat128Exp( float128 a ) 1038 { 1039 1040 return ( a.high>>48 ) & 0x7FFF; 1041 1042 } 1043 1044 /*---------------------------------------------------------------------------- 1045 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1046 *----------------------------------------------------------------------------*/ 1047 1048 static inline flag extractFloat128Sign( float128 a ) 1049 { 1050 1051 return a.high>>63; 1052 1053 } 1054 1055 /*---------------------------------------------------------------------------- 1056 | Normalizes the subnormal quadruple-precision floating-point value 1057 | represented by the denormalized significand formed by the concatenation of 1058 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1059 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1060 | significand are stored at the location pointed to by `zSig0Ptr', and the 1061 | least significant 64 bits of the normalized significand are stored at the 1062 | location pointed to by `zSig1Ptr'. 
1063 *----------------------------------------------------------------------------*/ 1064 1065 static void 1066 normalizeFloat128Subnormal( 1067 uint64_t aSig0, 1068 uint64_t aSig1, 1069 int32_t *zExpPtr, 1070 uint64_t *zSig0Ptr, 1071 uint64_t *zSig1Ptr 1072 ) 1073 { 1074 int8_t shiftCount; 1075 1076 if ( aSig0 == 0 ) { 1077 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1078 if ( shiftCount < 0 ) { 1079 *zSig0Ptr = aSig1>>( - shiftCount ); 1080 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1081 } 1082 else { 1083 *zSig0Ptr = aSig1<<shiftCount; 1084 *zSig1Ptr = 0; 1085 } 1086 *zExpPtr = - shiftCount - 63; 1087 } 1088 else { 1089 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1090 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1091 *zExpPtr = 1 - shiftCount; 1092 } 1093 1094 } 1095 1096 /*---------------------------------------------------------------------------- 1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1099 | floating-point value, returning the result. After being shifted into the 1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1101 | added together to form the most significant 32 bits of the result. This 1102 | means that any integer portion of `zSig0' will be added into the exponent. 1103 | Since a properly normalized significand will have an integer portion equal 1104 | to 1, the `zExp' input should be 1 less than the desired result exponent 1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1106 | significand. 
1107 *----------------------------------------------------------------------------*/ 1108 1109 static inline float128 1110 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1111 { 1112 float128 z; 1113 1114 z.low = zSig1; 1115 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1116 return z; 1117 1118 } 1119 1120 /*---------------------------------------------------------------------------- 1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1122 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1123 | and `zSig2', and returns the proper quadruple-precision floating-point value 1124 | corresponding to the abstract input. Ordinarily, the abstract value is 1125 | simply rounded and packed into the quadruple-precision format, with the 1126 | inexact exception raised if the abstract input cannot be represented 1127 | exactly. However, if the abstract value is too large, the overflow and 1128 | inexact exceptions are raised and an infinity or maximal finite value is 1129 | returned. If the abstract value is too small, the input value is rounded to 1130 | a subnormal number, and the underflow and inexact exceptions are raised if 1131 | the abstract input cannot be represented exactly as a subnormal quadruple- 1132 | precision floating-point number. 1133 | The input significand must be normalized or smaller. If the input 1134 | significand is not normalized, `zExp' must be 0; in that case, the result 1135 | returned is a subnormal number, and it must not require rounding. In the 1136 | usual case that the input significand is normalized, `zExp' must be 1 less 1137 | than the ``true'' floating-point exponent. The handling of underflow and 1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the retained significand.  `increment'
     * says whether to add one to zSig0:zSig1:
     *   nearest modes - top bit of zSig2 is set;
     *   to-zero       - never;
     *   up / down     - zSig2 is non-zero and the sign matches the
     *                   direction;
     *   to-odd        - zSig2 is non-zero and bit 0 of zSig1 is clear. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned comparison is true for zExp >= 0x7FFD and also for any
     * negative zExp (which converts to a large unsigned value). */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow: exponent too large, or exponent 0x7FFD with an
         * all-ones significand that is about to be incremented. */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* These modes return exponent 0x7FFE with an all-ones fraction;
             * the remaining modes return exponent 0x7FFF with a zero
             * fraction. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny when detection is before rounding, when zExp < -1, when
             * no increment is pending, or when the significand is below
             * all-ones (so an increment cannot carry out of it). */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /* Shift right by -zExp; the helper also updates zSig2. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* Re-derive the increment from the shifted significand. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie (every bit of zSig2 below its top bit is zero) in
         * nearest-even mode: clear bit 0 of zSig1. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
1264 *----------------------------------------------------------------------------*/ 1265 1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1267 uint64_t zSig0, uint64_t zSig1, 1268 float_status *status) 1269 { 1270 int8_t shiftCount; 1271 uint64_t zSig2; 1272 1273 if ( zSig0 == 0 ) { 1274 zSig0 = zSig1; 1275 zSig1 = 0; 1276 zExp -= 64; 1277 } 1278 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1279 if ( 0 <= shiftCount ) { 1280 zSig2 = 0; 1281 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1282 } 1283 else { 1284 shift128ExtraRightJamming( 1285 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1286 } 1287 zExp -= shiftCount; 1288 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1289 1290 } 1291 1292 /*---------------------------------------------------------------------------- 1293 | Returns the result of converting the 32-bit two's complement integer `a' 1294 | to the single-precision floating-point format. The conversion is performed 1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1296 *----------------------------------------------------------------------------*/ 1297 1298 float32 int32_to_float32(int32_t a, float_status *status) 1299 { 1300 flag zSign; 1301 1302 if ( a == 0 ) return float32_zero; 1303 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1304 zSign = ( a < 0 ); 1305 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1306 } 1307 1308 /*---------------------------------------------------------------------------- 1309 | Returns the result of converting the 32-bit two's complement integer `a' 1310 | to the double-precision floating-point format. The conversion is performed 1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1312 *----------------------------------------------------------------------------*/ 1313 1314 float64 int32_to_float64(int32_t a, float_status *status) 1315 { 1316 flag zSign; 1317 uint32_t absA; 1318 int8_t shiftCount; 1319 uint64_t zSig; 1320 1321 if ( a == 0 ) return float64_zero; 1322 zSign = ( a < 0 ); 1323 absA = zSign ? - a : a; 1324 shiftCount = countLeadingZeros32( absA ) + 21; 1325 zSig = absA; 1326 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1327 1328 } 1329 1330 /*---------------------------------------------------------------------------- 1331 | Returns the result of converting the 32-bit two's complement integer `a' 1332 | to the extended double-precision floating-point format. The conversion 1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1334 | Arithmetic. 1335 *----------------------------------------------------------------------------*/ 1336 1337 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1338 { 1339 flag zSign; 1340 uint32_t absA; 1341 int8_t shiftCount; 1342 uint64_t zSig; 1343 1344 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1345 zSign = ( a < 0 ); 1346 absA = zSign ? - a : a; 1347 shiftCount = countLeadingZeros32( absA ) + 32; 1348 zSig = absA; 1349 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1350 1351 } 1352 1353 /*---------------------------------------------------------------------------- 1354 | Returns the result of converting the 32-bit two's complement integer `a' to 1355 | the quadruple-precision floating-point format. The conversion is performed 1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1357 *----------------------------------------------------------------------------*/ 1358 1359 float128 int32_to_float128(int32_t a, float_status *status) 1360 { 1361 flag zSign; 1362 uint32_t absA; 1363 int8_t shiftCount; 1364 uint64_t zSig0; 1365 1366 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1367 zSign = ( a < 0 ); 1368 absA = zSign ? - a : a; 1369 shiftCount = countLeadingZeros32( absA ) + 17; 1370 zSig0 = absA; 1371 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1372 1373 } 1374 1375 /*---------------------------------------------------------------------------- 1376 | Returns the result of converting the 64-bit two's complement integer `a' 1377 | to the single-precision floating-point format. The conversion is performed 1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1379 *----------------------------------------------------------------------------*/ 1380 1381 float32 int64_to_float32(int64_t a, float_status *status) 1382 { 1383 flag zSign; 1384 uint64_t absA; 1385 int8_t shiftCount; 1386 1387 if ( a == 0 ) return float32_zero; 1388 zSign = ( a < 0 ); 1389 absA = zSign ? - a : a; 1390 shiftCount = countLeadingZeros64( absA ) - 40; 1391 if ( 0 <= shiftCount ) { 1392 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1393 } 1394 else { 1395 shiftCount += 7; 1396 if ( shiftCount < 0 ) { 1397 shift64RightJamming( absA, - shiftCount, &absA ); 1398 } 1399 else { 1400 absA <<= shiftCount; 1401 } 1402 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1403 } 1404 1405 } 1406 1407 /*---------------------------------------------------------------------------- 1408 | Returns the result of converting the 64-bit two's complement integer `a' 1409 | to the double-precision floating-point format. The conversion is performed 1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1411 *----------------------------------------------------------------------------*/ 1412 1413 float64 int64_to_float64(int64_t a, float_status *status) 1414 { 1415 flag zSign; 1416 1417 if ( a == 0 ) return float64_zero; 1418 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1419 return packFloat64( 1, 0x43E, 0 ); 1420 } 1421 zSign = ( a < 0 ); 1422 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1423 } 1424 1425 /*---------------------------------------------------------------------------- 1426 | Returns the result of converting the 64-bit two's complement integer `a' 1427 | to the extended double-precision floating-point format. The conversion 1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1429 | Arithmetic. 1430 *----------------------------------------------------------------------------*/ 1431 1432 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1433 { 1434 flag zSign; 1435 uint64_t absA; 1436 int8_t shiftCount; 1437 1438 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1439 zSign = ( a < 0 ); 1440 absA = zSign ? - a : a; 1441 shiftCount = countLeadingZeros64( absA ); 1442 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1443 1444 } 1445 1446 /*---------------------------------------------------------------------------- 1447 | Returns the result of converting the 64-bit two's complement integer `a' to 1448 | the quadruple-precision floating-point format. The conversion is performed 1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1450 *----------------------------------------------------------------------------*/ 1451 1452 float128 int64_to_float128(int64_t a, float_status *status) 1453 { 1454 flag zSign; 1455 uint64_t absA; 1456 int8_t shiftCount; 1457 int32_t zExp; 1458 uint64_t zSig0, zSig1; 1459 1460 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1461 zSign = ( a < 0 ); 1462 absA = zSign ? 
- a : a; 1463 shiftCount = countLeadingZeros64( absA ) + 49; 1464 zExp = 0x406E - shiftCount; 1465 if ( 64 <= shiftCount ) { 1466 zSig1 = 0; 1467 zSig0 = absA; 1468 shiftCount -= 64; 1469 } 1470 else { 1471 zSig1 = absA; 1472 zSig0 = 0; 1473 } 1474 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1475 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1476 1477 } 1478 1479 /*---------------------------------------------------------------------------- 1480 | Returns the result of converting the 64-bit unsigned integer `a' 1481 | to the single-precision floating-point format. The conversion is performed 1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1483 *----------------------------------------------------------------------------*/ 1484 1485 float32 uint64_to_float32(uint64_t a, float_status *status) 1486 { 1487 int shiftcount; 1488 1489 if (a == 0) { 1490 return float32_zero; 1491 } 1492 1493 /* Determine (left) shift needed to put first set bit into bit posn 23 1494 * (since packFloat32() expects the binary point between bits 23 and 22); 1495 * this is the fast case for smallish numbers. 1496 */ 1497 shiftcount = countLeadingZeros64(a) - 40; 1498 if (shiftcount >= 0) { 1499 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1500 } 1501 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1502 * expects the binary point between bits 30 and 29, hence the + 7. 1503 */ 1504 shiftcount += 7; 1505 if (shiftcount < 0) { 1506 shift64RightJamming(a, -shiftcount, &a); 1507 } else { 1508 a <<= shiftcount; 1509 } 1510 1511 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1512 } 1513 1514 /*---------------------------------------------------------------------------- 1515 | Returns the result of converting the 64-bit unsigned integer `a' 1516 | to the double-precision floating-point format. The conversion is performed 1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1518 *----------------------------------------------------------------------------*/ 1519 1520 float64 uint64_to_float64(uint64_t a, float_status *status) 1521 { 1522 int exp = 0x43C; 1523 int shiftcount; 1524 1525 if (a == 0) { 1526 return float64_zero; 1527 } 1528 1529 shiftcount = countLeadingZeros64(a) - 1; 1530 if (shiftcount < 0) { 1531 shift64RightJamming(a, -shiftcount, &a); 1532 } else { 1533 a <<= shiftcount; 1534 } 1535 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1536 } 1537 1538 /*---------------------------------------------------------------------------- 1539 | Returns the result of converting the 64-bit unsigned integer `a' 1540 | to the quadruple-precision floating-point format. The conversion is performed 1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1542 *----------------------------------------------------------------------------*/ 1543 1544 float128 uint64_to_float128(uint64_t a, float_status *status) 1545 { 1546 if (a == 0) { 1547 return float128_zero; 1548 } 1549 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1550 } 1551 1552 /*---------------------------------------------------------------------------- 1553 | Returns the result of converting the single-precision floating-point value 1554 | `a' to the 32-bit two's complement integer format. The conversion is 1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1556 | Arithmetic---which means in particular that the conversion is rounded 1557 | according to the current rounding mode. If `a' is a NaN, the largest 1558 | positive integer is returned. Otherwise, if the conversion overflows, the 1559 | largest integer with the same sign as `a' is returned. 
1560 *----------------------------------------------------------------------------*/ 1561 1562 int32_t float32_to_int32(float32 a, float_status *status) 1563 { 1564 flag aSign; 1565 int aExp; 1566 int shiftCount; 1567 uint32_t aSig; 1568 uint64_t aSig64; 1569 1570 a = float32_squash_input_denormal(a, status); 1571 aSig = extractFloat32Frac( a ); 1572 aExp = extractFloat32Exp( a ); 1573 aSign = extractFloat32Sign( a ); 1574 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1575 if ( aExp ) aSig |= 0x00800000; 1576 shiftCount = 0xAF - aExp; 1577 aSig64 = aSig; 1578 aSig64 <<= 32; 1579 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1580 return roundAndPackInt32(aSign, aSig64, status); 1581 1582 } 1583 1584 /*---------------------------------------------------------------------------- 1585 | Returns the result of converting the single-precision floating-point value 1586 | `a' to the 32-bit two's complement integer format. The conversion is 1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1588 | Arithmetic, except that the conversion is always rounded toward zero. 1589 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1590 | the conversion overflows, the largest integer with the same sign as `a' is 1591 | returned. 1592 *----------------------------------------------------------------------------*/ 1593 1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 1595 { 1596 flag aSign; 1597 int aExp; 1598 int shiftCount; 1599 uint32_t aSig; 1600 int32_t z; 1601 a = float32_squash_input_denormal(a, status); 1602 1603 aSig = extractFloat32Frac( a ); 1604 aExp = extractFloat32Exp( a ); 1605 aSign = extractFloat32Sign( a ); 1606 shiftCount = aExp - 0x9E; 1607 if ( 0 <= shiftCount ) { 1608 if ( float32_val(a) != 0xCF000000 ) { 1609 float_raise(float_flag_invalid, status); 1610 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1611 } 1612 return (int32_t) 0x80000000; 1613 } 1614 else if ( aExp <= 0x7E ) { 1615 if (aExp | aSig) { 1616 status->float_exception_flags |= float_flag_inexact; 1617 } 1618 return 0; 1619 } 1620 aSig = ( aSig | 0x00800000 )<<8; 1621 z = aSig>>( - shiftCount ); 1622 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1623 status->float_exception_flags |= float_flag_inexact; 1624 } 1625 if ( aSign ) z = - z; 1626 return z; 1627 1628 } 1629 1630 /*---------------------------------------------------------------------------- 1631 | Returns the result of converting the single-precision floating-point value 1632 | `a' to the 16-bit two's complement integer format. The conversion is 1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1634 | Arithmetic, except that the conversion is always rounded toward zero. 1635 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1636 | the conversion overflows, the largest integer with the same sign as `a' is 1637 | returned. 1638 *----------------------------------------------------------------------------*/ 1639 1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1641 { 1642 flag aSign; 1643 int aExp; 1644 int shiftCount; 1645 uint32_t aSig; 1646 int32_t z; 1647 1648 aSig = extractFloat32Frac( a ); 1649 aExp = extractFloat32Exp( a ); 1650 aSign = extractFloat32Sign( a ); 1651 shiftCount = aExp - 0x8E; 1652 if ( 0 <= shiftCount ) { 1653 if ( float32_val(a) != 0xC7000000 ) { 1654 float_raise(float_flag_invalid, status); 1655 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1656 return 0x7FFF; 1657 } 1658 } 1659 return (int32_t) 0xffff8000; 1660 } 1661 else if ( aExp <= 0x7E ) { 1662 if ( aExp | aSig ) { 1663 status->float_exception_flags |= float_flag_inexact; 1664 } 1665 return 0; 1666 } 1667 shiftCount -= 0x10; 1668 aSig = ( aSig | 0x00800000 )<<8; 1669 z = aSig>>( - shiftCount ); 1670 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1671 status->float_exception_flags |= float_flag_inexact; 1672 } 1673 if ( aSign ) { 1674 z = - z; 1675 } 1676 return z; 1677 1678 } 1679 1680 /*---------------------------------------------------------------------------- 1681 | Returns the result of converting the single-precision floating-point value 1682 | `a' to the 64-bit two's complement integer format. The conversion is 1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1684 | Arithmetic---which means in particular that the conversion is rounded 1685 | according to the current rounding mode. If `a' is a NaN, the largest 1686 | positive integer is returned. Otherwise, if the conversion overflows, the 1687 | largest integer with the same sign as `a' is returned. 1688 *----------------------------------------------------------------------------*/ 1689 1690 int64_t float32_to_int64(float32 a, float_status *status) 1691 { 1692 flag aSign; 1693 int aExp; 1694 int shiftCount; 1695 uint32_t aSig; 1696 uint64_t aSig64, aSigExtra; 1697 a = float32_squash_input_denormal(a, status); 1698 1699 aSig = extractFloat32Frac( a ); 1700 aExp = extractFloat32Exp( a ); 1701 aSign = extractFloat32Sign( a ); 1702 shiftCount = 0xBE - aExp; 1703 if ( shiftCount < 0 ) { 1704 float_raise(float_flag_invalid, status); 1705 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1706 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1707 } 1708 return (int64_t) LIT64( 0x8000000000000000 ); 1709 } 1710 if ( aExp ) aSig |= 0x00800000; 1711 aSig64 = aSig; 1712 aSig64 <<= 40; 1713 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1714 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1715 1716 } 1717 1718 /*---------------------------------------------------------------------------- 1719 | Returns the result of converting the single-precision floating-point value 1720 | `a' to the 64-bit unsigned integer format. The conversion is 1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1722 | Arithmetic---which means in particular that the conversion is rounded 1723 | according to the current rounding mode. If `a' is a NaN, the largest 1724 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1725 | largest unsigned integer is returned. If the 'a' is negative, the result 1726 | is rounded and zero is returned; values that do not round to zero will 1727 | raise the inexact exception flag. 
1728 *----------------------------------------------------------------------------*/ 1729 1730 uint64_t float32_to_uint64(float32 a, float_status *status) 1731 { 1732 flag aSign; 1733 int aExp; 1734 int shiftCount; 1735 uint32_t aSig; 1736 uint64_t aSig64, aSigExtra; 1737 a = float32_squash_input_denormal(a, status); 1738 1739 aSig = extractFloat32Frac(a); 1740 aExp = extractFloat32Exp(a); 1741 aSign = extractFloat32Sign(a); 1742 if ((aSign) && (aExp > 126)) { 1743 float_raise(float_flag_invalid, status); 1744 if (float32_is_any_nan(a)) { 1745 return LIT64(0xFFFFFFFFFFFFFFFF); 1746 } else { 1747 return 0; 1748 } 1749 } 1750 shiftCount = 0xBE - aExp; 1751 if (aExp) { 1752 aSig |= 0x00800000; 1753 } 1754 if (shiftCount < 0) { 1755 float_raise(float_flag_invalid, status); 1756 return LIT64(0xFFFFFFFFFFFFFFFF); 1757 } 1758 1759 aSig64 = aSig; 1760 aSig64 <<= 40; 1761 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1762 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1763 } 1764 1765 /*---------------------------------------------------------------------------- 1766 | Returns the result of converting the single-precision floating-point value 1767 | `a' to the 64-bit unsigned integer format. The conversion is 1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1769 | Arithmetic, except that the conversion is always rounded toward zero. If 1770 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1771 | conversion overflows, the largest unsigned integer is returned. If the 1772 | 'a' is negative, the result is rounded and zero is returned; values that do 1773 | not round to zero will raise the inexact flag. 
1774 *----------------------------------------------------------------------------*/ 1775 1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 1777 { 1778 signed char current_rounding_mode = status->float_rounding_mode; 1779 set_float_rounding_mode(float_round_to_zero, status); 1780 int64_t v = float32_to_uint64(a, status); 1781 set_float_rounding_mode(current_rounding_mode, status); 1782 return v; 1783 } 1784 1785 /*---------------------------------------------------------------------------- 1786 | Returns the result of converting the single-precision floating-point value 1787 | `a' to the 64-bit two's complement integer format. The conversion is 1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1789 | Arithmetic, except that the conversion is always rounded toward zero. If 1790 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1791 | conversion overflows, the largest integer with the same sign as `a' is 1792 | returned. 1793 *----------------------------------------------------------------------------*/ 1794 1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 1796 { 1797 flag aSign; 1798 int aExp; 1799 int shiftCount; 1800 uint32_t aSig; 1801 uint64_t aSig64; 1802 int64_t z; 1803 a = float32_squash_input_denormal(a, status); 1804 1805 aSig = extractFloat32Frac( a ); 1806 aExp = extractFloat32Exp( a ); 1807 aSign = extractFloat32Sign( a ); 1808 shiftCount = aExp - 0xBE; 1809 if ( 0 <= shiftCount ) { 1810 if ( float32_val(a) != 0xDF000000 ) { 1811 float_raise(float_flag_invalid, status); 1812 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1813 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1814 } 1815 } 1816 return (int64_t) LIT64( 0x8000000000000000 ); 1817 } 1818 else if ( aExp <= 0x7E ) { 1819 if (aExp | aSig) { 1820 status->float_exception_flags |= float_flag_inexact; 1821 } 1822 return 0; 1823 } 1824 aSig64 = aSig | 0x00800000; 1825 aSig64 <<= 40; 1826 z = aSig64>>( - shiftCount ); 1827 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1828 status->float_exception_flags |= float_flag_inexact; 1829 } 1830 if ( aSign ) z = - z; 1831 return z; 1832 1833 } 1834 1835 /*---------------------------------------------------------------------------- 1836 | Returns the result of converting the single-precision floating-point value 1837 | `a' to the double-precision floating-point format. The conversion is 1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1839 | Arithmetic. 1840 *----------------------------------------------------------------------------*/ 1841 1842 float64 float32_to_float64(float32 a, float_status *status) 1843 { 1844 flag aSign; 1845 int aExp; 1846 uint32_t aSig; 1847 a = float32_squash_input_denormal(a, status); 1848 1849 aSig = extractFloat32Frac( a ); 1850 aExp = extractFloat32Exp( a ); 1851 aSign = extractFloat32Sign( a ); 1852 if ( aExp == 0xFF ) { 1853 if (aSig) { 1854 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1855 } 1856 return packFloat64( aSign, 0x7FF, 0 ); 1857 } 1858 if ( aExp == 0 ) { 1859 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1860 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1861 --aExp; 1862 } 1863 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1864 1865 } 1866 1867 /*---------------------------------------------------------------------------- 1868 | Returns the result of converting the single-precision floating-point value 1869 | `a' to the extended double-precision floating-point format. 
The conversion 1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1871 | Arithmetic. 1872 *----------------------------------------------------------------------------*/ 1873 1874 floatx80 float32_to_floatx80(float32 a, float_status *status) 1875 { 1876 flag aSign; 1877 int aExp; 1878 uint32_t aSig; 1879 1880 a = float32_squash_input_denormal(a, status); 1881 aSig = extractFloat32Frac( a ); 1882 aExp = extractFloat32Exp( a ); 1883 aSign = extractFloat32Sign( a ); 1884 if ( aExp == 0xFF ) { 1885 if (aSig) { 1886 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1887 } 1888 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1889 } 1890 if ( aExp == 0 ) { 1891 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1892 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1893 } 1894 aSig |= 0x00800000; 1895 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1896 1897 } 1898 1899 /*---------------------------------------------------------------------------- 1900 | Returns the result of converting the single-precision floating-point value 1901 | `a' to the double-precision floating-point format. The conversion is 1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1903 | Arithmetic. 
1904 *----------------------------------------------------------------------------*/ 1905 1906 float128 float32_to_float128(float32 a, float_status *status) 1907 { 1908 flag aSign; 1909 int aExp; 1910 uint32_t aSig; 1911 1912 a = float32_squash_input_denormal(a, status); 1913 aSig = extractFloat32Frac( a ); 1914 aExp = extractFloat32Exp( a ); 1915 aSign = extractFloat32Sign( a ); 1916 if ( aExp == 0xFF ) { 1917 if (aSig) { 1918 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1919 } 1920 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1921 } 1922 if ( aExp == 0 ) { 1923 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1924 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1925 --aExp; 1926 } 1927 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1928 1929 } 1930 1931 /*---------------------------------------------------------------------------- 1932 | Rounds the single-precision floating-point value `a' to an integer, and 1933 | returns the result as a single-precision floating-point value. The 1934 | operation is performed according to the IEC/IEEE Standard for Binary 1935 | Floating-Point Arithmetic. 
1936 *----------------------------------------------------------------------------*/ 1937 1938 float32 float32_round_to_int(float32 a, float_status *status) 1939 { 1940 flag aSign; 1941 int aExp; 1942 uint32_t lastBitMask, roundBitsMask; 1943 uint32_t z; 1944 a = float32_squash_input_denormal(a, status); 1945 1946 aExp = extractFloat32Exp( a ); 1947 if ( 0x96 <= aExp ) { 1948 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1949 return propagateFloat32NaN(a, a, status); 1950 } 1951 return a; 1952 } 1953 if ( aExp <= 0x7E ) { 1954 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1955 status->float_exception_flags |= float_flag_inexact; 1956 aSign = extractFloat32Sign( a ); 1957 switch (status->float_rounding_mode) { 1958 case float_round_nearest_even: 1959 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1960 return packFloat32( aSign, 0x7F, 0 ); 1961 } 1962 break; 1963 case float_round_ties_away: 1964 if (aExp == 0x7E) { 1965 return packFloat32(aSign, 0x7F, 0); 1966 } 1967 break; 1968 case float_round_down: 1969 return make_float32(aSign ? 0xBF800000 : 0); 1970 case float_round_up: 1971 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 1972 } 1973 return packFloat32( aSign, 0, 0 ); 1974 } 1975 lastBitMask = 1; 1976 lastBitMask <<= 0x96 - aExp; 1977 roundBitsMask = lastBitMask - 1; 1978 z = float32_val(a); 1979 switch (status->float_rounding_mode) { 1980 case float_round_nearest_even: 1981 z += lastBitMask>>1; 1982 if ((z & roundBitsMask) == 0) { 1983 z &= ~lastBitMask; 1984 } 1985 break; 1986 case float_round_ties_away: 1987 z += lastBitMask >> 1; 1988 break; 1989 case float_round_to_zero: 1990 break; 1991 case float_round_up: 1992 if (!extractFloat32Sign(make_float32(z))) { 1993 z += roundBitsMask; 1994 } 1995 break; 1996 case float_round_down: 1997 if (extractFloat32Sign(make_float32(z))) { 1998 z += roundBitsMask; 1999 } 2000 break; 2001 default: 2002 abort(); 2003 } 2004 z &= ~ roundBitsMask; 2005 if (z != float32_val(a)) { 2006 status->float_exception_flags |= float_flag_inexact; 2007 } 2008 return make_float32(z); 2009 2010 } 2011 2012 /*---------------------------------------------------------------------------- 2013 | Returns the result of adding the absolute values of the single-precision 2014 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2015 | before being returned. `zSign' is ignored if the result is a NaN. 2016 | The addition is performed according to the IEC/IEEE Standard for Binary 2017 | Floating-Point Arithmetic. 
2018 *----------------------------------------------------------------------------*/ 2019 2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, 2021 float_status *status) 2022 { 2023 int aExp, bExp, zExp; 2024 uint32_t aSig, bSig, zSig; 2025 int expDiff; 2026 2027 aSig = extractFloat32Frac( a ); 2028 aExp = extractFloat32Exp( a ); 2029 bSig = extractFloat32Frac( b ); 2030 bExp = extractFloat32Exp( b ); 2031 expDiff = aExp - bExp; 2032 aSig <<= 6; 2033 bSig <<= 6; 2034 if ( 0 < expDiff ) { 2035 if ( aExp == 0xFF ) { 2036 if (aSig) { 2037 return propagateFloat32NaN(a, b, status); 2038 } 2039 return a; 2040 } 2041 if ( bExp == 0 ) { 2042 --expDiff; 2043 } 2044 else { 2045 bSig |= 0x20000000; 2046 } 2047 shift32RightJamming( bSig, expDiff, &bSig ); 2048 zExp = aExp; 2049 } 2050 else if ( expDiff < 0 ) { 2051 if ( bExp == 0xFF ) { 2052 if (bSig) { 2053 return propagateFloat32NaN(a, b, status); 2054 } 2055 return packFloat32( zSign, 0xFF, 0 ); 2056 } 2057 if ( aExp == 0 ) { 2058 ++expDiff; 2059 } 2060 else { 2061 aSig |= 0x20000000; 2062 } 2063 shift32RightJamming( aSig, - expDiff, &aSig ); 2064 zExp = bExp; 2065 } 2066 else { 2067 if ( aExp == 0xFF ) { 2068 if (aSig | bSig) { 2069 return propagateFloat32NaN(a, b, status); 2070 } 2071 return a; 2072 } 2073 if ( aExp == 0 ) { 2074 if (status->flush_to_zero) { 2075 if (aSig | bSig) { 2076 float_raise(float_flag_output_denormal, status); 2077 } 2078 return packFloat32(zSign, 0, 0); 2079 } 2080 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 2081 } 2082 zSig = 0x40000000 + aSig + bSig; 2083 zExp = aExp; 2084 goto roundAndPack; 2085 } 2086 aSig |= 0x20000000; 2087 zSig = ( aSig + bSig )<<1; 2088 --zExp; 2089 if ( (int32_t) zSig < 0 ) { 2090 zSig = aSig + bSig; 2091 ++zExp; 2092 } 2093 roundAndPack: 2094 return roundAndPackFloat32(zSign, zExp, zSig, status); 2095 2096 } 2097 2098 /*---------------------------------------------------------------------------- 2099 | Returns the result of subtracting the 
absolute values of the single- 2100 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2101 | difference is negated before being returned. `zSign' is ignored if the 2102 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2103 | Standard for Binary Floating-Point Arithmetic. 2104 *----------------------------------------------------------------------------*/ 2105 2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, 2107 float_status *status) 2108 { 2109 int aExp, bExp, zExp; 2110 uint32_t aSig, bSig, zSig; 2111 int expDiff; 2112 2113 aSig = extractFloat32Frac( a ); 2114 aExp = extractFloat32Exp( a ); 2115 bSig = extractFloat32Frac( b ); 2116 bExp = extractFloat32Exp( b ); 2117 expDiff = aExp - bExp; 2118 aSig <<= 7; 2119 bSig <<= 7; 2120 if ( 0 < expDiff ) goto aExpBigger; 2121 if ( expDiff < 0 ) goto bExpBigger; 2122 if ( aExp == 0xFF ) { 2123 if (aSig | bSig) { 2124 return propagateFloat32NaN(a, b, status); 2125 } 2126 float_raise(float_flag_invalid, status); 2127 return float32_default_nan(status); 2128 } 2129 if ( aExp == 0 ) { 2130 aExp = 1; 2131 bExp = 1; 2132 } 2133 if ( bSig < aSig ) goto aBigger; 2134 if ( aSig < bSig ) goto bBigger; 2135 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); 2136 bExpBigger: 2137 if ( bExp == 0xFF ) { 2138 if (bSig) { 2139 return propagateFloat32NaN(a, b, status); 2140 } 2141 return packFloat32( zSign ^ 1, 0xFF, 0 ); 2142 } 2143 if ( aExp == 0 ) { 2144 ++expDiff; 2145 } 2146 else { 2147 aSig |= 0x40000000; 2148 } 2149 shift32RightJamming( aSig, - expDiff, &aSig ); 2150 bSig |= 0x40000000; 2151 bBigger: 2152 zSig = bSig - aSig; 2153 zExp = bExp; 2154 zSign ^= 1; 2155 goto normalizeRoundAndPack; 2156 aExpBigger: 2157 if ( aExp == 0xFF ) { 2158 if (aSig) { 2159 return propagateFloat32NaN(a, b, status); 2160 } 2161 return a; 2162 } 2163 if ( bExp == 0 ) { 2164 --expDiff; 2165 } 2166 else { 2167 bSig |= 0x40000000; 2168 } 2169 shift32RightJamming( bSig, 
expDiff, &bSig ); 2170 aSig |= 0x40000000; 2171 aBigger: 2172 zSig = aSig - bSig; 2173 zExp = aExp; 2174 normalizeRoundAndPack: 2175 --zExp; 2176 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); 2177 2178 } 2179 2180 /*---------------------------------------------------------------------------- 2181 | Returns the result of adding the single-precision floating-point values `a' 2182 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2183 | Binary Floating-Point Arithmetic. 2184 *----------------------------------------------------------------------------*/ 2185 2186 float32 float32_add(float32 a, float32 b, float_status *status) 2187 { 2188 flag aSign, bSign; 2189 a = float32_squash_input_denormal(a, status); 2190 b = float32_squash_input_denormal(b, status); 2191 2192 aSign = extractFloat32Sign( a ); 2193 bSign = extractFloat32Sign( b ); 2194 if ( aSign == bSign ) { 2195 return addFloat32Sigs(a, b, aSign, status); 2196 } 2197 else { 2198 return subFloat32Sigs(a, b, aSign, status); 2199 } 2200 2201 } 2202 2203 /*---------------------------------------------------------------------------- 2204 | Returns the result of subtracting the single-precision floating-point values 2205 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2206 | for Binary Floating-Point Arithmetic. 
2207 *----------------------------------------------------------------------------*/ 2208 2209 float32 float32_sub(float32 a, float32 b, float_status *status) 2210 { 2211 flag aSign, bSign; 2212 a = float32_squash_input_denormal(a, status); 2213 b = float32_squash_input_denormal(b, status); 2214 2215 aSign = extractFloat32Sign( a ); 2216 bSign = extractFloat32Sign( b ); 2217 if ( aSign == bSign ) { 2218 return subFloat32Sigs(a, b, aSign, status); 2219 } 2220 else { 2221 return addFloat32Sigs(a, b, aSign, status); 2222 } 2223 2224 } 2225 2226 /*---------------------------------------------------------------------------- 2227 | Returns the result of multiplying the single-precision floating-point values 2228 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2229 | for Binary Floating-Point Arithmetic. 2230 *----------------------------------------------------------------------------*/ 2231 2232 float32 float32_mul(float32 a, float32 b, float_status *status) 2233 { 2234 flag aSign, bSign, zSign; 2235 int aExp, bExp, zExp; 2236 uint32_t aSig, bSig; 2237 uint64_t zSig64; 2238 uint32_t zSig; 2239 2240 a = float32_squash_input_denormal(a, status); 2241 b = float32_squash_input_denormal(b, status); 2242 2243 aSig = extractFloat32Frac( a ); 2244 aExp = extractFloat32Exp( a ); 2245 aSign = extractFloat32Sign( a ); 2246 bSig = extractFloat32Frac( b ); 2247 bExp = extractFloat32Exp( b ); 2248 bSign = extractFloat32Sign( b ); 2249 zSign = aSign ^ bSign; 2250 if ( aExp == 0xFF ) { 2251 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2252 return propagateFloat32NaN(a, b, status); 2253 } 2254 if ( ( bExp | bSig ) == 0 ) { 2255 float_raise(float_flag_invalid, status); 2256 return float32_default_nan(status); 2257 } 2258 return packFloat32( zSign, 0xFF, 0 ); 2259 } 2260 if ( bExp == 0xFF ) { 2261 if (bSig) { 2262 return propagateFloat32NaN(a, b, status); 2263 } 2264 if ( ( aExp | aSig ) == 0 ) { 2265 float_raise(float_flag_invalid, status); 2266 return 
float32_default_nan(status); 2267 } 2268 return packFloat32( zSign, 0xFF, 0 ); 2269 } 2270 if ( aExp == 0 ) { 2271 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2272 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2273 } 2274 if ( bExp == 0 ) { 2275 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2276 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2277 } 2278 zExp = aExp + bExp - 0x7F; 2279 aSig = ( aSig | 0x00800000 )<<7; 2280 bSig = ( bSig | 0x00800000 )<<8; 2281 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2282 zSig = zSig64; 2283 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2284 zSig <<= 1; 2285 --zExp; 2286 } 2287 return roundAndPackFloat32(zSign, zExp, zSig, status); 2288 2289 } 2290 2291 /*---------------------------------------------------------------------------- 2292 | Returns the result of dividing the single-precision floating-point value `a' 2293 | by the corresponding value `b'. The operation is performed according to the 2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2295 *----------------------------------------------------------------------------*/ 2296 2297 float32 float32_div(float32 a, float32 b, float_status *status) 2298 { 2299 flag aSign, bSign, zSign; 2300 int aExp, bExp, zExp; 2301 uint32_t aSig, bSig, zSig; 2302 a = float32_squash_input_denormal(a, status); 2303 b = float32_squash_input_denormal(b, status); 2304 2305 aSig = extractFloat32Frac( a ); 2306 aExp = extractFloat32Exp( a ); 2307 aSign = extractFloat32Sign( a ); 2308 bSig = extractFloat32Frac( b ); 2309 bExp = extractFloat32Exp( b ); 2310 bSign = extractFloat32Sign( b ); 2311 zSign = aSign ^ bSign; 2312 if ( aExp == 0xFF ) { 2313 if (aSig) { 2314 return propagateFloat32NaN(a, b, status); 2315 } 2316 if ( bExp == 0xFF ) { 2317 if (bSig) { 2318 return propagateFloat32NaN(a, b, status); 2319 } 2320 float_raise(float_flag_invalid, status); 2321 return float32_default_nan(status); 2322 } 2323 return packFloat32( zSign, 0xFF, 0 ); 2324 } 2325 if ( bExp == 0xFF ) { 2326 if (bSig) { 2327 return propagateFloat32NaN(a, b, status); 2328 } 2329 return packFloat32( zSign, 0, 0 ); 2330 } 2331 if ( bExp == 0 ) { 2332 if ( bSig == 0 ) { 2333 if ( ( aExp | aSig ) == 0 ) { 2334 float_raise(float_flag_invalid, status); 2335 return float32_default_nan(status); 2336 } 2337 float_raise(float_flag_divbyzero, status); 2338 return packFloat32( zSign, 0xFF, 0 ); 2339 } 2340 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2341 } 2342 if ( aExp == 0 ) { 2343 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2344 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2345 } 2346 zExp = aExp - bExp + 0x7D; 2347 aSig = ( aSig | 0x00800000 )<<7; 2348 bSig = ( bSig | 0x00800000 )<<8; 2349 if ( bSig <= ( aSig + aSig ) ) { 2350 aSig >>= 1; 2351 ++zExp; 2352 } 2353 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2354 if ( ( zSig & 0x3F ) == 0 ) { 2355 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2356 } 2357 return roundAndPackFloat32(zSign, zExp, zSig, status); 2358 2359 } 2360 2361 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code: a NaN operand is propagated; an infinite
| `a' or a zero `b' raises invalid and returns the default NaN; an infinite
| `b' or a zero `a' returns `a' unchanged.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* `a' is an infinity: the remainder is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        /* `b' is an infinity: `a' is returned unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* `b' is zero: the remainder is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: one 64-by-32 division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* `a' is returned unchanged when its exponent is more than one
             * below that of `b'. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: work in 64 bits, consuming 62 bits of
         * expDiff per loop iteration using a quotient estimate that is
         * reduced by 2 (clamped at 0). */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Subtract bSig until aSig goes negative (as a signed value), keeping
     * the last non-negative remainder in alternateASig. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Choose between the two candidate remainders; when their sum is zero,
     * the low bit of q decides. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}

/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/

float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
                       float_status *status)
{
    flag aSign, bSign, cSign, zSign;
    int aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);
    c = float32_squash_input_denormal(c, status);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* True when the product is (zero * infinity) in either operand order. */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
    }

    if (infzero) {
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }

    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (status->float_rounding_mode == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat32
             * wants one less than the true exponent.
             */
            cExp -= 2;
            cSig = (cSig | 0x00800000) << 7;
            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
        }
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig, status);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Bring c's significand to the same layout as pSig64: fraction shifted
     * up by (62 - 23) with the explicit bit set at position 62.
     */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (status->float_rounding_mode == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    if (flags & float_muladd_halve_result) {
        zExp--;
    }

    /* Reduce the 64-bit significand to 32 bits (jamming lost bits) for
     * the final round-and-pack.
     */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64, status);
}


/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases visible in the code: NaNs are propagated; +infinity and
| zeros of either sign are returned as-is (a positive subnormal-range zero
| returns float32_zero); any other negative input raises invalid and returns
| the default NaN.
*----------------------------------------------------------------------------*/

float32 float32_sqrt(float32 a, float_status *status)
{
    flag aSign;
    int aExp, zExp;
    uint32_t aSig, zSig;
    uint64_t rem, term;
    a = float32_squash_input_denormal(a, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if (aSig) {
            return propagateFloat32NaN(a, float32_zero, status);
        }
        if ( ! aSign ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aSign ) {
        if ( ( aExp | aSig ) == 0 ) return a;
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return float32_zero;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* Only when the low seven bits of the estimate are small is the exact
     * remainder computed and the estimate corrected downwards. */
    if ( ( zSig & 0x7F ) <= 5 ) {
        if ( zSig < 2 ) {
            /* The "+ 2" above wrapped around: use the largest value. */
            zSig = 0x7FFFFFFF;
            goto roundAndPack;
        }
        aSig >>= aExp & 1;
        term = ( (uint64_t) zSig ) * zSig;
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
        while ( (int64_t) rem < 0 ) {
            --zSig;
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
        }
        /* Sticky bit: a non-zero remainder means the root is inexact. */
        zSig |= ( rem != 0 );
    }
    shift32RightJamming( zSig, 1, &zSig );
 roundAndPack:
    return roundAndPackFloat32(0, zExp, zSig, status);

}

/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
2762 *----------------------------------------------------------------------------*/ 2763 2764 static const float64 float32_exp2_coefficients[15] = 2765 { 2766 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2767 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2768 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2769 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2770 const_float64( 0x3f81111111111111ll ), /* 5 */ 2771 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2772 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2773 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2774 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2775 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2776 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2777 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2778 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2779 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2780 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2781 }; 2782 2783 float32 float32_exp2(float32 a, float_status *status) 2784 { 2785 flag aSign; 2786 int aExp; 2787 uint32_t aSig; 2788 float64 r, x, xn; 2789 int i; 2790 a = float32_squash_input_denormal(a, status); 2791 2792 aSig = extractFloat32Frac( a ); 2793 aExp = extractFloat32Exp( a ); 2794 aSign = extractFloat32Sign( a ); 2795 2796 if ( aExp == 0xFF) { 2797 if (aSig) { 2798 return propagateFloat32NaN(a, float32_zero, status); 2799 } 2800 return (aSign) ? 
float32_zero : a; 2801 } 2802 if (aExp == 0) { 2803 if (aSig == 0) return float32_one; 2804 } 2805 2806 float_raise(float_flag_inexact, status); 2807 2808 /* ******************************* */ 2809 /* using float64 for approximation */ 2810 /* ******************************* */ 2811 x = float32_to_float64(a, status); 2812 x = float64_mul(x, float64_ln2, status); 2813 2814 xn = x; 2815 r = float64_one; 2816 for (i = 0 ; i < 15 ; i++) { 2817 float64 f; 2818 2819 f = float64_mul(xn, float32_exp2_coefficients[i], status); 2820 r = float64_add(r, f, status); 2821 2822 xn = float64_mul(xn, x, status); 2823 } 2824 2825 return float64_to_float32(r, status); 2826 } 2827 2828 /*---------------------------------------------------------------------------- 2829 | Returns the binary log of the single-precision floating-point value `a'. 2830 | The operation is performed according to the IEC/IEEE Standard for Binary 2831 | Floating-Point Arithmetic. 2832 *----------------------------------------------------------------------------*/ 2833 float32 float32_log2(float32 a, float_status *status) 2834 { 2835 flag aSign, zSign; 2836 int aExp; 2837 uint32_t aSig, zSig, i; 2838 2839 a = float32_squash_input_denormal(a, status); 2840 aSig = extractFloat32Frac( a ); 2841 aExp = extractFloat32Exp( a ); 2842 aSign = extractFloat32Sign( a ); 2843 2844 if ( aExp == 0 ) { 2845 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2846 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2847 } 2848 if ( aSign ) { 2849 float_raise(float_flag_invalid, status); 2850 return float32_default_nan(status); 2851 } 2852 if ( aExp == 0xFF ) { 2853 if (aSig) { 2854 return propagateFloat32NaN(a, float32_zero, status); 2855 } 2856 return a; 2857 } 2858 2859 aExp -= 0x7F; 2860 aSig |= 0x00800000; 2861 zSign = aExp < 0; 2862 zSig = aExp << 23; 2863 2864 for (i = 1 << 22; i > 0; i >>= 1) { 2865 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2866 if ( aSig & 0x01000000 ) { 2867 aSig >>= 1; 2868 zSig |= i; 2869 } 2870 } 2871 
2872 if ( zSign ) 2873 zSig = -zSig; 2874 2875 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 2876 } 2877 2878 /*---------------------------------------------------------------------------- 2879 | Returns 1 if the single-precision floating-point value `a' is equal to 2880 | the corresponding value `b', and 0 otherwise. The invalid exception is 2881 | raised if either operand is a NaN. Otherwise, the comparison is performed 2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2883 *----------------------------------------------------------------------------*/ 2884 2885 int float32_eq(float32 a, float32 b, float_status *status) 2886 { 2887 uint32_t av, bv; 2888 a = float32_squash_input_denormal(a, status); 2889 b = float32_squash_input_denormal(b, status); 2890 2891 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2892 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2893 ) { 2894 float_raise(float_flag_invalid, status); 2895 return 0; 2896 } 2897 av = float32_val(a); 2898 bv = float32_val(b); 2899 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2900 } 2901 2902 /*---------------------------------------------------------------------------- 2903 | Returns 1 if the single-precision floating-point value `a' is less than 2904 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2905 | exception is raised if either operand is a NaN. The comparison is performed 2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2907 *----------------------------------------------------------------------------*/ 2908 2909 int float32_le(float32 a, float32 b, float_status *status) 2910 { 2911 flag aSign, bSign; 2912 uint32_t av, bv; 2913 a = float32_squash_input_denormal(a, status); 2914 b = float32_squash_input_denormal(b, status); 2915 2916 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2917 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2918 ) { 2919 float_raise(float_flag_invalid, status); 2920 return 0; 2921 } 2922 aSign = extractFloat32Sign( a ); 2923 bSign = extractFloat32Sign( b ); 2924 av = float32_val(a); 2925 bv = float32_val(b); 2926 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2927 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2928 2929 } 2930 2931 /*---------------------------------------------------------------------------- 2932 | Returns 1 if the single-precision floating-point value `a' is less than 2933 | the corresponding value `b', and 0 otherwise. The invalid exception is 2934 | raised if either operand is a NaN. The comparison is performed according 2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2936 *----------------------------------------------------------------------------*/ 2937 2938 int float32_lt(float32 a, float32 b, float_status *status) 2939 { 2940 flag aSign, bSign; 2941 uint32_t av, bv; 2942 a = float32_squash_input_denormal(a, status); 2943 b = float32_squash_input_denormal(b, status); 2944 2945 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2946 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2947 ) { 2948 float_raise(float_flag_invalid, status); 2949 return 0; 2950 } 2951 aSign = extractFloat32Sign( a ); 2952 bSign = extractFloat32Sign( b ); 2953 av = float32_val(a); 2954 bv = float32_val(b); 2955 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2956 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2957 2958 } 2959 2960 /*---------------------------------------------------------------------------- 2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2962 | be compared, and 0 otherwise. The invalid exception is raised if either 2963 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2964 | Standard for Binary Floating-Point Arithmetic. 2965 *----------------------------------------------------------------------------*/ 2966 2967 int float32_unordered(float32 a, float32 b, float_status *status) 2968 { 2969 a = float32_squash_input_denormal(a, status); 2970 b = float32_squash_input_denormal(b, status); 2971 2972 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2973 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2974 ) { 2975 float_raise(float_flag_invalid, status); 2976 return 1; 2977 } 2978 return 0; 2979 } 2980 2981 /*---------------------------------------------------------------------------- 2982 | Returns 1 if the single-precision floating-point value `a' is equal to 2983 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2984 | exception. 
The comparison is performed according to the IEC/IEEE Standard 2985 | for Binary Floating-Point Arithmetic. 2986 *----------------------------------------------------------------------------*/ 2987 2988 int float32_eq_quiet(float32 a, float32 b, float_status *status) 2989 { 2990 a = float32_squash_input_denormal(a, status); 2991 b = float32_squash_input_denormal(b, status); 2992 2993 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2994 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2995 ) { 2996 if (float32_is_signaling_nan(a, status) 2997 || float32_is_signaling_nan(b, status)) { 2998 float_raise(float_flag_invalid, status); 2999 } 3000 return 0; 3001 } 3002 return ( float32_val(a) == float32_val(b) ) || 3003 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3004 } 3005 3006 /*---------------------------------------------------------------------------- 3007 | Returns 1 if the single-precision floating-point value `a' is less than or 3008 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3009 | cause an exception. Otherwise, the comparison is performed according to the 3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3011 *----------------------------------------------------------------------------*/ 3012 3013 int float32_le_quiet(float32 a, float32 b, float_status *status) 3014 { 3015 flag aSign, bSign; 3016 uint32_t av, bv; 3017 a = float32_squash_input_denormal(a, status); 3018 b = float32_squash_input_denormal(b, status); 3019 3020 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3021 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3022 ) { 3023 if (float32_is_signaling_nan(a, status) 3024 || float32_is_signaling_nan(b, status)) { 3025 float_raise(float_flag_invalid, status); 3026 } 3027 return 0; 3028 } 3029 aSign = extractFloat32Sign( a ); 3030 bSign = extractFloat32Sign( b ); 3031 av = float32_val(a); 3032 bv = float32_val(b); 3033 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3034 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3035 3036 } 3037 3038 /*---------------------------------------------------------------------------- 3039 | Returns 1 if the single-precision floating-point value `a' is less than 3040 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3041 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3042 | Standard for Binary Floating-Point Arithmetic. 
3043 *----------------------------------------------------------------------------*/ 3044 3045 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3046 { 3047 flag aSign, bSign; 3048 uint32_t av, bv; 3049 a = float32_squash_input_denormal(a, status); 3050 b = float32_squash_input_denormal(b, status); 3051 3052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3054 ) { 3055 if (float32_is_signaling_nan(a, status) 3056 || float32_is_signaling_nan(b, status)) { 3057 float_raise(float_flag_invalid, status); 3058 } 3059 return 0; 3060 } 3061 aSign = extractFloat32Sign( a ); 3062 bSign = extractFloat32Sign( b ); 3063 av = float32_val(a); 3064 bv = float32_val(b); 3065 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3066 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3067 3068 } 3069 3070 /*---------------------------------------------------------------------------- 3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3072 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3073 | comparison is performed according to the IEC/IEEE Standard for Binary 3074 | Floating-Point Arithmetic. 
3075 *----------------------------------------------------------------------------*/ 3076 3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3078 { 3079 a = float32_squash_input_denormal(a, status); 3080 b = float32_squash_input_denormal(b, status); 3081 3082 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3083 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3084 ) { 3085 if (float32_is_signaling_nan(a, status) 3086 || float32_is_signaling_nan(b, status)) { 3087 float_raise(float_flag_invalid, status); 3088 } 3089 return 1; 3090 } 3091 return 0; 3092 } 3093 3094 /*---------------------------------------------------------------------------- 3095 | Returns the result of converting the double-precision floating-point value 3096 | `a' to the 32-bit two's complement integer format. The conversion is 3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3098 | Arithmetic---which means in particular that the conversion is rounded 3099 | according to the current rounding mode. If `a' is a NaN, the largest 3100 | positive integer is returned. Otherwise, if the conversion overflows, the 3101 | largest integer with the same sign as `a' is returned. 
3102 *----------------------------------------------------------------------------*/ 3103 3104 int32_t float64_to_int32(float64 a, float_status *status) 3105 { 3106 flag aSign; 3107 int aExp; 3108 int shiftCount; 3109 uint64_t aSig; 3110 a = float64_squash_input_denormal(a, status); 3111 3112 aSig = extractFloat64Frac( a ); 3113 aExp = extractFloat64Exp( a ); 3114 aSign = extractFloat64Sign( a ); 3115 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3116 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3117 shiftCount = 0x42C - aExp; 3118 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3119 return roundAndPackInt32(aSign, aSig, status); 3120 3121 } 3122 3123 /*---------------------------------------------------------------------------- 3124 | Returns the result of converting the double-precision floating-point value 3125 | `a' to the 32-bit two's complement integer format. The conversion is 3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3127 | Arithmetic, except that the conversion is always rounded toward zero. 3128 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3129 | the conversion overflows, the largest integer with the same sign as `a' is 3130 | returned. 
3131 *----------------------------------------------------------------------------*/ 3132 3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3134 { 3135 flag aSign; 3136 int aExp; 3137 int shiftCount; 3138 uint64_t aSig, savedASig; 3139 int32_t z; 3140 a = float64_squash_input_denormal(a, status); 3141 3142 aSig = extractFloat64Frac( a ); 3143 aExp = extractFloat64Exp( a ); 3144 aSign = extractFloat64Sign( a ); 3145 if ( 0x41E < aExp ) { 3146 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3147 goto invalid; 3148 } 3149 else if ( aExp < 0x3FF ) { 3150 if (aExp || aSig) { 3151 status->float_exception_flags |= float_flag_inexact; 3152 } 3153 return 0; 3154 } 3155 aSig |= LIT64( 0x0010000000000000 ); 3156 shiftCount = 0x433 - aExp; 3157 savedASig = aSig; 3158 aSig >>= shiftCount; 3159 z = aSig; 3160 if ( aSign ) z = - z; 3161 if ( ( z < 0 ) ^ aSign ) { 3162 invalid: 3163 float_raise(float_flag_invalid, status); 3164 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3165 } 3166 if ( ( aSig<<shiftCount ) != savedASig ) { 3167 status->float_exception_flags |= float_flag_inexact; 3168 } 3169 return z; 3170 3171 } 3172 3173 /*---------------------------------------------------------------------------- 3174 | Returns the result of converting the double-precision floating-point value 3175 | `a' to the 16-bit two's complement integer format. The conversion is 3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3177 | Arithmetic, except that the conversion is always rounded toward zero. 3178 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3179 | the conversion overflows, the largest integer with the same sign as `a' is 3180 | returned. 
3181 *----------------------------------------------------------------------------*/ 3182 3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3184 { 3185 flag aSign; 3186 int aExp; 3187 int shiftCount; 3188 uint64_t aSig, savedASig; 3189 int32_t z; 3190 3191 aSig = extractFloat64Frac( a ); 3192 aExp = extractFloat64Exp( a ); 3193 aSign = extractFloat64Sign( a ); 3194 if ( 0x40E < aExp ) { 3195 if ( ( aExp == 0x7FF ) && aSig ) { 3196 aSign = 0; 3197 } 3198 goto invalid; 3199 } 3200 else if ( aExp < 0x3FF ) { 3201 if ( aExp || aSig ) { 3202 status->float_exception_flags |= float_flag_inexact; 3203 } 3204 return 0; 3205 } 3206 aSig |= LIT64( 0x0010000000000000 ); 3207 shiftCount = 0x433 - aExp; 3208 savedASig = aSig; 3209 aSig >>= shiftCount; 3210 z = aSig; 3211 if ( aSign ) { 3212 z = - z; 3213 } 3214 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3215 invalid: 3216 float_raise(float_flag_invalid, status); 3217 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3218 } 3219 if ( ( aSig<<shiftCount ) != savedASig ) { 3220 status->float_exception_flags |= float_flag_inexact; 3221 } 3222 return z; 3223 } 3224 3225 /*---------------------------------------------------------------------------- 3226 | Returns the result of converting the double-precision floating-point value 3227 | `a' to the 64-bit two's complement integer format. The conversion is 3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3229 | Arithmetic---which means in particular that the conversion is rounded 3230 | according to the current rounding mode. If `a' is a NaN, the largest 3231 | positive integer is returned. Otherwise, if the conversion overflows, the 3232 | largest integer with the same sign as `a' is returned. 
3233 *----------------------------------------------------------------------------*/ 3234 3235 int64_t float64_to_int64(float64 a, float_status *status) 3236 { 3237 flag aSign; 3238 int aExp; 3239 int shiftCount; 3240 uint64_t aSig, aSigExtra; 3241 a = float64_squash_input_denormal(a, status); 3242 3243 aSig = extractFloat64Frac( a ); 3244 aExp = extractFloat64Exp( a ); 3245 aSign = extractFloat64Sign( a ); 3246 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3247 shiftCount = 0x433 - aExp; 3248 if ( shiftCount <= 0 ) { 3249 if ( 0x43E < aExp ) { 3250 float_raise(float_flag_invalid, status); 3251 if ( ! aSign 3252 || ( ( aExp == 0x7FF ) 3253 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3254 ) { 3255 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3256 } 3257 return (int64_t) LIT64( 0x8000000000000000 ); 3258 } 3259 aSigExtra = 0; 3260 aSig <<= - shiftCount; 3261 } 3262 else { 3263 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3264 } 3265 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3266 3267 } 3268 3269 /*---------------------------------------------------------------------------- 3270 | Returns the result of converting the double-precision floating-point value 3271 | `a' to the 64-bit two's complement integer format. The conversion is 3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3273 | Arithmetic, except that the conversion is always rounded toward zero. 3274 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3275 | the conversion overflows, the largest integer with the same sign as `a' is 3276 | returned. 
3277 *----------------------------------------------------------------------------*/ 3278 3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3280 { 3281 flag aSign; 3282 int aExp; 3283 int shiftCount; 3284 uint64_t aSig; 3285 int64_t z; 3286 a = float64_squash_input_denormal(a, status); 3287 3288 aSig = extractFloat64Frac( a ); 3289 aExp = extractFloat64Exp( a ); 3290 aSign = extractFloat64Sign( a ); 3291 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3292 shiftCount = aExp - 0x433; 3293 if ( 0 <= shiftCount ) { 3294 if ( 0x43E <= aExp ) { 3295 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3296 float_raise(float_flag_invalid, status); 3297 if ( ! aSign 3298 || ( ( aExp == 0x7FF ) 3299 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3300 ) { 3301 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3302 } 3303 } 3304 return (int64_t) LIT64( 0x8000000000000000 ); 3305 } 3306 z = aSig<<shiftCount; 3307 } 3308 else { 3309 if ( aExp < 0x3FE ) { 3310 if (aExp | aSig) { 3311 status->float_exception_flags |= float_flag_inexact; 3312 } 3313 return 0; 3314 } 3315 z = aSig>>( - shiftCount ); 3316 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3317 status->float_exception_flags |= float_flag_inexact; 3318 } 3319 } 3320 if ( aSign ) z = - z; 3321 return z; 3322 3323 } 3324 3325 /*---------------------------------------------------------------------------- 3326 | Returns the result of converting the double-precision floating-point value 3327 | `a' to the single-precision floating-point format. The conversion is 3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3329 | Arithmetic. 
3330 *----------------------------------------------------------------------------*/ 3331 3332 float32 float64_to_float32(float64 a, float_status *status) 3333 { 3334 flag aSign; 3335 int aExp; 3336 uint64_t aSig; 3337 uint32_t zSig; 3338 a = float64_squash_input_denormal(a, status); 3339 3340 aSig = extractFloat64Frac( a ); 3341 aExp = extractFloat64Exp( a ); 3342 aSign = extractFloat64Sign( a ); 3343 if ( aExp == 0x7FF ) { 3344 if (aSig) { 3345 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3346 } 3347 return packFloat32( aSign, 0xFF, 0 ); 3348 } 3349 shift64RightJamming( aSig, 22, &aSig ); 3350 zSig = aSig; 3351 if ( aExp || zSig ) { 3352 zSig |= 0x40000000; 3353 aExp -= 0x381; 3354 } 3355 return roundAndPackFloat32(aSign, aExp, zSig, status); 3356 3357 } 3358 3359 3360 /*---------------------------------------------------------------------------- 3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3362 | half-precision floating-point value, returning the result. After being 3363 | shifted into the proper positions, the three fields are simply added 3364 | together to form the result. This means that any integer portion of `zSig' 3365 | will be added into the exponent. Since a properly normalized significand 3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3367 | than the desired result exponent whenever `zSig' is a complete, normalized 3368 | significand. 
3369 *----------------------------------------------------------------------------*/ 3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3371 { 3372 return make_float16( 3373 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3374 } 3375 3376 /*---------------------------------------------------------------------------- 3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3378 | and significand `zSig', and returns the proper half-precision floating- 3379 | point value corresponding to the abstract input. Ordinarily, the abstract 3380 | value is simply rounded and packed into the half-precision format, with 3381 | the inexact exception raised if the abstract input cannot be represented 3382 | exactly. However, if the abstract value is too large, the overflow and 3383 | inexact exceptions are raised and an infinity or maximal finite value is 3384 | returned. If the abstract value is too small, the input value is rounded to 3385 | a subnormal number, and the underflow and inexact exceptions are raised if 3386 | the abstract input cannot be represented exactly as a subnormal half- 3387 | precision floating-point number. 3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3389 | ARM-style "alternative representation", which omits the NaN and Inf 3390 | encodings in order to raise the maximum representable exponent by one. 3391 | The input significand `zSig' has its binary point between bits 22 3392 | and 23, which is 13 bits to the left of the usual location. This shifted 3393 | significand must be normalized or smaller. If `zSig' is not normalized, 3394 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3395 | and it must not require rounding. In the usual case that `zSig' is 3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3397 | Note the slightly odd position of the binary point in zSig compared with the 3398 | other roundAndPackFloat functions. This should probably be fixed if we 3399 | need to implement more float16 routines than just conversion. 3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3401 | Binary Floating-Point Arithmetic. 3402 *----------------------------------------------------------------------------*/ 3403 3404 static float16 roundAndPackFloat16(flag zSign, int zExp, 3405 uint32_t zSig, flag ieee, 3406 float_status *status) 3407 { 3408 int maxexp = ieee ? 29 : 30; 3409 uint32_t mask; 3410 uint32_t increment; 3411 bool rounding_bumps_exp; 3412 bool is_tiny = false; 3413 3414 /* Calculate the mask of bits of the mantissa which are not 3415 * representable in half-precision and will be lost. 3416 */ 3417 if (zExp < 1) { 3418 /* Will be denormal in halfprec */ 3419 mask = 0x00ffffff; 3420 if (zExp >= -11) { 3421 mask >>= 11 + zExp; 3422 } 3423 } else { 3424 /* Normal number in halfprec */ 3425 mask = 0x00001fff; 3426 } 3427 3428 switch (status->float_rounding_mode) { 3429 case float_round_nearest_even: 3430 increment = (mask + 1) >> 1; 3431 if ((zSig & mask) == increment) { 3432 increment = zSig & (increment << 1); 3433 } 3434 break; 3435 case float_round_ties_away: 3436 increment = (mask + 1) >> 1; 3437 break; 3438 case float_round_up: 3439 increment = zSign ? 0 : mask; 3440 break; 3441 case float_round_down: 3442 increment = zSign ? 
mask : 0; 3443 break; 3444 default: /* round_to_zero */ 3445 increment = 0; 3446 break; 3447 } 3448 3449 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3450 3451 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3452 if (ieee) { 3453 float_raise(float_flag_overflow | float_flag_inexact, status); 3454 return packFloat16(zSign, 0x1f, 0); 3455 } else { 3456 float_raise(float_flag_invalid, status); 3457 return packFloat16(zSign, 0x1f, 0x3ff); 3458 } 3459 } 3460 3461 if (zExp < 0) { 3462 /* Note that flush-to-zero does not affect half-precision results */ 3463 is_tiny = 3464 (status->float_detect_tininess == float_tininess_before_rounding) 3465 || (zExp < -1) 3466 || (!rounding_bumps_exp); 3467 } 3468 if (zSig & mask) { 3469 float_raise(float_flag_inexact, status); 3470 if (is_tiny) { 3471 float_raise(float_flag_underflow, status); 3472 } 3473 } 3474 3475 zSig += increment; 3476 if (rounding_bumps_exp) { 3477 zSig >>= 1; 3478 zExp++; 3479 } 3480 3481 if (zExp < -10) { 3482 return packFloat16(zSign, 0, 0); 3483 } 3484 if (zExp < 0) { 3485 zSig >>= -zExp; 3486 zExp = 0; 3487 } 3488 return packFloat16(zSign, zExp, zSig >> 13); 3489 } 3490 3491 /*---------------------------------------------------------------------------- 3492 | If `a' is denormal and we are in flush-to-zero mode then set the 3493 | input-denormal exception and return zero. Otherwise just return the value. 
3494 *----------------------------------------------------------------------------*/ 3495 float16 float16_squash_input_denormal(float16 a, float_status *status) 3496 { 3497 if (status->flush_inputs_to_zero) { 3498 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3499 float_raise(float_flag_input_denormal, status); 3500 return make_float16(float16_val(a) & 0x8000); 3501 } 3502 } 3503 return a; 3504 } 3505 3506 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3507 uint32_t *zSigPtr) 3508 { 3509 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3510 *zSigPtr = aSig << shiftCount; 3511 *zExpPtr = 1 - shiftCount; 3512 } 3513 3514 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3515 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3516 3517 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3518 { 3519 flag aSign; 3520 int aExp; 3521 uint32_t aSig; 3522 3523 aSign = extractFloat16Sign(a); 3524 aExp = extractFloat16Exp(a); 3525 aSig = extractFloat16Frac(a); 3526 3527 if (aExp == 0x1f && ieee) { 3528 if (aSig) { 3529 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3530 } 3531 return packFloat32(aSign, 0xff, 0); 3532 } 3533 if (aExp == 0) { 3534 if (aSig == 0) { 3535 return packFloat32(aSign, 0, 0); 3536 } 3537 3538 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3539 aExp--; 3540 } 3541 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3542 } 3543 3544 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3545 { 3546 flag aSign; 3547 int aExp; 3548 uint32_t aSig; 3549 3550 a = float32_squash_input_denormal(a, status); 3551 3552 aSig = extractFloat32Frac( a ); 3553 aExp = extractFloat32Exp( a ); 3554 aSign = extractFloat32Sign( a ); 3555 if ( aExp == 0xFF ) { 3556 if (aSig) { 3557 /* Input is a NaN */ 3558 if (!ieee) { 3559 float_raise(float_flag_invalid, status); 3560 return packFloat16(aSign, 0, 0); 3561 } 3562 return 
commonNaNToFloat16( 3563 float32ToCommonNaN(a, status), status); 3564 } 3565 /* Infinity */ 3566 if (!ieee) { 3567 float_raise(float_flag_invalid, status); 3568 return packFloat16(aSign, 0x1f, 0x3ff); 3569 } 3570 return packFloat16(aSign, 0x1f, 0); 3571 } 3572 if (aExp == 0 && aSig == 0) { 3573 return packFloat16(aSign, 0, 0); 3574 } 3575 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3576 * even if the input is denormal; however this is harmless because 3577 * the largest possible single-precision denormal is still smaller 3578 * than the smallest representable half-precision denormal, and so we 3579 * will end up ignoring aSig and returning via the "always return zero" 3580 * codepath. 3581 */ 3582 aSig |= 0x00800000; 3583 aExp -= 0x71; 3584 3585 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3586 } 3587 3588 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3589 { 3590 flag aSign; 3591 int aExp; 3592 uint32_t aSig; 3593 3594 aSign = extractFloat16Sign(a); 3595 aExp = extractFloat16Exp(a); 3596 aSig = extractFloat16Frac(a); 3597 3598 if (aExp == 0x1f && ieee) { 3599 if (aSig) { 3600 return commonNaNToFloat64( 3601 float16ToCommonNaN(a, status), status); 3602 } 3603 return packFloat64(aSign, 0x7ff, 0); 3604 } 3605 if (aExp == 0) { 3606 if (aSig == 0) { 3607 return packFloat64(aSign, 0, 0); 3608 } 3609 3610 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3611 aExp--; 3612 } 3613 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3614 } 3615 3616 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3617 { 3618 flag aSign; 3619 int aExp; 3620 uint64_t aSig; 3621 uint32_t zSig; 3622 3623 a = float64_squash_input_denormal(a, status); 3624 3625 aSig = extractFloat64Frac(a); 3626 aExp = extractFloat64Exp(a); 3627 aSign = extractFloat64Sign(a); 3628 if (aExp == 0x7FF) { 3629 if (aSig) { 3630 /* Input is a NaN */ 3631 if (!ieee) { 3632 float_raise(float_flag_invalid, status); 3633 return 
packFloat16(aSign, 0, 0); 3634 } 3635 return commonNaNToFloat16( 3636 float64ToCommonNaN(a, status), status); 3637 } 3638 /* Infinity */ 3639 if (!ieee) { 3640 float_raise(float_flag_invalid, status); 3641 return packFloat16(aSign, 0x1f, 0x3ff); 3642 } 3643 return packFloat16(aSign, 0x1f, 0); 3644 } 3645 shift64RightJamming(aSig, 29, &aSig); 3646 zSig = aSig; 3647 if (aExp == 0 && zSig == 0) { 3648 return packFloat16(aSign, 0, 0); 3649 } 3650 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3651 * even if the input is denormal; however this is harmless because 3652 * the largest possible single-precision denormal is still smaller 3653 * than the smallest representable half-precision denormal, and so we 3654 * will end up ignoring aSig and returning via the "always return zero" 3655 * codepath. 3656 */ 3657 zSig |= 0x00800000; 3658 aExp -= 0x3F1; 3659 3660 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3661 } 3662 3663 /*---------------------------------------------------------------------------- 3664 | Returns the result of converting the double-precision floating-point value 3665 | `a' to the extended double-precision floating-point format. The conversion 3666 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3667 | Arithmetic. 
3668 *----------------------------------------------------------------------------*/ 3669 3670 floatx80 float64_to_floatx80(float64 a, float_status *status) 3671 { 3672 flag aSign; 3673 int aExp; 3674 uint64_t aSig; 3675 3676 a = float64_squash_input_denormal(a, status); 3677 aSig = extractFloat64Frac( a ); 3678 aExp = extractFloat64Exp( a ); 3679 aSign = extractFloat64Sign( a ); 3680 if ( aExp == 0x7FF ) { 3681 if (aSig) { 3682 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3683 } 3684 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3685 } 3686 if ( aExp == 0 ) { 3687 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3688 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3689 } 3690 return 3691 packFloatx80( 3692 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3693 3694 } 3695 3696 /*---------------------------------------------------------------------------- 3697 | Returns the result of converting the double-precision floating-point value 3698 | `a' to the quadruple-precision floating-point format. The conversion is 3699 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3700 | Arithmetic. 
3701 *----------------------------------------------------------------------------*/ 3702 3703 float128 float64_to_float128(float64 a, float_status *status) 3704 { 3705 flag aSign; 3706 int aExp; 3707 uint64_t aSig, zSig0, zSig1; 3708 3709 a = float64_squash_input_denormal(a, status); 3710 aSig = extractFloat64Frac( a ); 3711 aExp = extractFloat64Exp( a ); 3712 aSign = extractFloat64Sign( a ); 3713 if ( aExp == 0x7FF ) { 3714 if (aSig) { 3715 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3716 } 3717 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3718 } 3719 if ( aExp == 0 ) { 3720 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3721 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3722 --aExp; 3723 } 3724 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3725 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3726 3727 } 3728 3729 /*---------------------------------------------------------------------------- 3730 | Rounds the double-precision floating-point value `a' to an integer, and 3731 | returns the result as a double-precision floating-point value. The 3732 | operation is performed according to the IEC/IEEE Standard for Binary 3733 | Floating-Point Arithmetic. 
3734 *----------------------------------------------------------------------------*/ 3735 3736 float64 float64_round_to_int(float64 a, float_status *status) 3737 { 3738 flag aSign; 3739 int aExp; 3740 uint64_t lastBitMask, roundBitsMask; 3741 uint64_t z; 3742 a = float64_squash_input_denormal(a, status); 3743 3744 aExp = extractFloat64Exp( a ); 3745 if ( 0x433 <= aExp ) { 3746 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3747 return propagateFloat64NaN(a, a, status); 3748 } 3749 return a; 3750 } 3751 if ( aExp < 0x3FF ) { 3752 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3753 status->float_exception_flags |= float_flag_inexact; 3754 aSign = extractFloat64Sign( a ); 3755 switch (status->float_rounding_mode) { 3756 case float_round_nearest_even: 3757 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3758 return packFloat64( aSign, 0x3FF, 0 ); 3759 } 3760 break; 3761 case float_round_ties_away: 3762 if (aExp == 0x3FE) { 3763 return packFloat64(aSign, 0x3ff, 0); 3764 } 3765 break; 3766 case float_round_down: 3767 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3768 case float_round_up: 3769 return make_float64( 3770 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3771 } 3772 return packFloat64( aSign, 0, 0 ); 3773 } 3774 lastBitMask = 1; 3775 lastBitMask <<= 0x433 - aExp; 3776 roundBitsMask = lastBitMask - 1; 3777 z = float64_val(a); 3778 switch (status->float_rounding_mode) { 3779 case float_round_nearest_even: 3780 z += lastBitMask >> 1; 3781 if ((z & roundBitsMask) == 0) { 3782 z &= ~lastBitMask; 3783 } 3784 break; 3785 case float_round_ties_away: 3786 z += lastBitMask >> 1; 3787 break; 3788 case float_round_to_zero: 3789 break; 3790 case float_round_up: 3791 if (!extractFloat64Sign(make_float64(z))) { 3792 z += roundBitsMask; 3793 } 3794 break; 3795 case float_round_down: 3796 if (extractFloat64Sign(make_float64(z))) { 3797 z += roundBitsMask; 3798 } 3799 break; 3800 default: 3801 abort(); 3802 } 3803 z &= ~ roundBitsMask; 3804 if (z != float64_val(a)) { 3805 status->float_exception_flags |= float_flag_inexact; 3806 } 3807 return make_float64(z); 3808 3809 } 3810 3811 float64 float64_trunc_to_int(float64 a, float_status *status) 3812 { 3813 int oldmode; 3814 float64 res; 3815 oldmode = status->float_rounding_mode; 3816 status->float_rounding_mode = float_round_to_zero; 3817 res = float64_round_to_int(a, status); 3818 status->float_rounding_mode = oldmode; 3819 return res; 3820 } 3821 3822 /*---------------------------------------------------------------------------- 3823 | Returns the result of adding the absolute values of the double-precision 3824 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3825 | before being returned. `zSign' is ignored if the result is a NaN. 3826 | The addition is performed according to the IEC/IEEE Standard for Binary 3827 | Floating-Point Arithmetic. 
3828 *----------------------------------------------------------------------------*/ 3829 3830 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3831 float_status *status) 3832 { 3833 int aExp, bExp, zExp; 3834 uint64_t aSig, bSig, zSig; 3835 int expDiff; 3836 3837 aSig = extractFloat64Frac( a ); 3838 aExp = extractFloat64Exp( a ); 3839 bSig = extractFloat64Frac( b ); 3840 bExp = extractFloat64Exp( b ); 3841 expDiff = aExp - bExp; 3842 aSig <<= 9; 3843 bSig <<= 9; 3844 if ( 0 < expDiff ) { 3845 if ( aExp == 0x7FF ) { 3846 if (aSig) { 3847 return propagateFloat64NaN(a, b, status); 3848 } 3849 return a; 3850 } 3851 if ( bExp == 0 ) { 3852 --expDiff; 3853 } 3854 else { 3855 bSig |= LIT64( 0x2000000000000000 ); 3856 } 3857 shift64RightJamming( bSig, expDiff, &bSig ); 3858 zExp = aExp; 3859 } 3860 else if ( expDiff < 0 ) { 3861 if ( bExp == 0x7FF ) { 3862 if (bSig) { 3863 return propagateFloat64NaN(a, b, status); 3864 } 3865 return packFloat64( zSign, 0x7FF, 0 ); 3866 } 3867 if ( aExp == 0 ) { 3868 ++expDiff; 3869 } 3870 else { 3871 aSig |= LIT64( 0x2000000000000000 ); 3872 } 3873 shift64RightJamming( aSig, - expDiff, &aSig ); 3874 zExp = bExp; 3875 } 3876 else { 3877 if ( aExp == 0x7FF ) { 3878 if (aSig | bSig) { 3879 return propagateFloat64NaN(a, b, status); 3880 } 3881 return a; 3882 } 3883 if ( aExp == 0 ) { 3884 if (status->flush_to_zero) { 3885 if (aSig | bSig) { 3886 float_raise(float_flag_output_denormal, status); 3887 } 3888 return packFloat64(zSign, 0, 0); 3889 } 3890 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3891 } 3892 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3893 zExp = aExp; 3894 goto roundAndPack; 3895 } 3896 aSig |= LIT64( 0x2000000000000000 ); 3897 zSig = ( aSig + bSig )<<1; 3898 --zExp; 3899 if ( (int64_t) zSig < 0 ) { 3900 zSig = aSig + bSig; 3901 ++zExp; 3902 } 3903 roundAndPack: 3904 return roundAndPackFloat64(zSign, zExp, zSig, status); 3905 3906 } 3907 3908 
/*---------------------------------------------------------------------------- 3909 | Returns the result of subtracting the absolute values of the double- 3910 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3911 | difference is negated before being returned. `zSign' is ignored if the 3912 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3913 | Standard for Binary Floating-Point Arithmetic. 3914 *----------------------------------------------------------------------------*/ 3915 3916 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, 3917 float_status *status) 3918 { 3919 int aExp, bExp, zExp; 3920 uint64_t aSig, bSig, zSig; 3921 int expDiff; 3922 3923 aSig = extractFloat64Frac( a ); 3924 aExp = extractFloat64Exp( a ); 3925 bSig = extractFloat64Frac( b ); 3926 bExp = extractFloat64Exp( b ); 3927 expDiff = aExp - bExp; 3928 aSig <<= 10; 3929 bSig <<= 10; 3930 if ( 0 < expDiff ) goto aExpBigger; 3931 if ( expDiff < 0 ) goto bExpBigger; 3932 if ( aExp == 0x7FF ) { 3933 if (aSig | bSig) { 3934 return propagateFloat64NaN(a, b, status); 3935 } 3936 float_raise(float_flag_invalid, status); 3937 return float64_default_nan(status); 3938 } 3939 if ( aExp == 0 ) { 3940 aExp = 1; 3941 bExp = 1; 3942 } 3943 if ( bSig < aSig ) goto aBigger; 3944 if ( aSig < bSig ) goto bBigger; 3945 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); 3946 bExpBigger: 3947 if ( bExp == 0x7FF ) { 3948 if (bSig) { 3949 return propagateFloat64NaN(a, b, status); 3950 } 3951 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3952 } 3953 if ( aExp == 0 ) { 3954 ++expDiff; 3955 } 3956 else { 3957 aSig |= LIT64( 0x4000000000000000 ); 3958 } 3959 shift64RightJamming( aSig, - expDiff, &aSig ); 3960 bSig |= LIT64( 0x4000000000000000 ); 3961 bBigger: 3962 zSig = bSig - aSig; 3963 zExp = bExp; 3964 zSign ^= 1; 3965 goto normalizeRoundAndPack; 3966 aExpBigger: 3967 if ( aExp == 0x7FF ) { 3968 if (aSig) { 3969 return propagateFloat64NaN(a, 
b, status); 3970 } 3971 return a; 3972 } 3973 if ( bExp == 0 ) { 3974 --expDiff; 3975 } 3976 else { 3977 bSig |= LIT64( 0x4000000000000000 ); 3978 } 3979 shift64RightJamming( bSig, expDiff, &bSig ); 3980 aSig |= LIT64( 0x4000000000000000 ); 3981 aBigger: 3982 zSig = aSig - bSig; 3983 zExp = aExp; 3984 normalizeRoundAndPack: 3985 --zExp; 3986 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); 3987 3988 } 3989 3990 /*---------------------------------------------------------------------------- 3991 | Returns the result of adding the double-precision floating-point values `a' 3992 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3993 | Binary Floating-Point Arithmetic. 3994 *----------------------------------------------------------------------------*/ 3995 3996 float64 float64_add(float64 a, float64 b, float_status *status) 3997 { 3998 flag aSign, bSign; 3999 a = float64_squash_input_denormal(a, status); 4000 b = float64_squash_input_denormal(b, status); 4001 4002 aSign = extractFloat64Sign( a ); 4003 bSign = extractFloat64Sign( b ); 4004 if ( aSign == bSign ) { 4005 return addFloat64Sigs(a, b, aSign, status); 4006 } 4007 else { 4008 return subFloat64Sigs(a, b, aSign, status); 4009 } 4010 4011 } 4012 4013 /*---------------------------------------------------------------------------- 4014 | Returns the result of subtracting the double-precision floating-point values 4015 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4016 | for Binary Floating-Point Arithmetic. 
4017 *----------------------------------------------------------------------------*/ 4018 4019 float64 float64_sub(float64 a, float64 b, float_status *status) 4020 { 4021 flag aSign, bSign; 4022 a = float64_squash_input_denormal(a, status); 4023 b = float64_squash_input_denormal(b, status); 4024 4025 aSign = extractFloat64Sign( a ); 4026 bSign = extractFloat64Sign( b ); 4027 if ( aSign == bSign ) { 4028 return subFloat64Sigs(a, b, aSign, status); 4029 } 4030 else { 4031 return addFloat64Sigs(a, b, aSign, status); 4032 } 4033 4034 } 4035 4036 /*---------------------------------------------------------------------------- 4037 | Returns the result of multiplying the double-precision floating-point values 4038 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4039 | for Binary Floating-Point Arithmetic. 4040 *----------------------------------------------------------------------------*/ 4041 4042 float64 float64_mul(float64 a, float64 b, float_status *status) 4043 { 4044 flag aSign, bSign, zSign; 4045 int aExp, bExp, zExp; 4046 uint64_t aSig, bSig, zSig0, zSig1; 4047 4048 a = float64_squash_input_denormal(a, status); 4049 b = float64_squash_input_denormal(b, status); 4050 4051 aSig = extractFloat64Frac( a ); 4052 aExp = extractFloat64Exp( a ); 4053 aSign = extractFloat64Sign( a ); 4054 bSig = extractFloat64Frac( b ); 4055 bExp = extractFloat64Exp( b ); 4056 bSign = extractFloat64Sign( b ); 4057 zSign = aSign ^ bSign; 4058 if ( aExp == 0x7FF ) { 4059 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4060 return propagateFloat64NaN(a, b, status); 4061 } 4062 if ( ( bExp | bSig ) == 0 ) { 4063 float_raise(float_flag_invalid, status); 4064 return float64_default_nan(status); 4065 } 4066 return packFloat64( zSign, 0x7FF, 0 ); 4067 } 4068 if ( bExp == 0x7FF ) { 4069 if (bSig) { 4070 return propagateFloat64NaN(a, b, status); 4071 } 4072 if ( ( aExp | aSig ) == 0 ) { 4073 float_raise(float_flag_invalid, status); 4074 return float64_default_nan(status); 
4075 } 4076 return packFloat64( zSign, 0x7FF, 0 ); 4077 } 4078 if ( aExp == 0 ) { 4079 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4080 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4081 } 4082 if ( bExp == 0 ) { 4083 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4084 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4085 } 4086 zExp = aExp + bExp - 0x3FF; 4087 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4088 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4089 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4090 zSig0 |= ( zSig1 != 0 ); 4091 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4092 zSig0 <<= 1; 4093 --zExp; 4094 } 4095 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4096 4097 } 4098 4099 /*---------------------------------------------------------------------------- 4100 | Returns the result of dividing the double-precision floating-point value `a' 4101 | by the corresponding value `b'. The operation is performed according to 4102 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4103 *----------------------------------------------------------------------------*/ 4104 4105 float64 float64_div(float64 a, float64 b, float_status *status) 4106 { 4107 flag aSign, bSign, zSign; 4108 int aExp, bExp, zExp; 4109 uint64_t aSig, bSig, zSig; 4110 uint64_t rem0, rem1; 4111 uint64_t term0, term1; 4112 a = float64_squash_input_denormal(a, status); 4113 b = float64_squash_input_denormal(b, status); 4114 4115 aSig = extractFloat64Frac( a ); 4116 aExp = extractFloat64Exp( a ); 4117 aSign = extractFloat64Sign( a ); 4118 bSig = extractFloat64Frac( b ); 4119 bExp = extractFloat64Exp( b ); 4120 bSign = extractFloat64Sign( b ); 4121 zSign = aSign ^ bSign; 4122 if ( aExp == 0x7FF ) { 4123 if (aSig) { 4124 return propagateFloat64NaN(a, b, status); 4125 } 4126 if ( bExp == 0x7FF ) { 4127 if (bSig) { 4128 return propagateFloat64NaN(a, b, status); 4129 } 4130 float_raise(float_flag_invalid, status); 4131 return float64_default_nan(status); 4132 } 4133 return packFloat64( zSign, 0x7FF, 0 ); 4134 } 4135 if ( bExp == 0x7FF ) { 4136 if (bSig) { 4137 return propagateFloat64NaN(a, b, status); 4138 } 4139 return packFloat64( zSign, 0, 0 ); 4140 } 4141 if ( bExp == 0 ) { 4142 if ( bSig == 0 ) { 4143 if ( ( aExp | aSig ) == 0 ) { 4144 float_raise(float_flag_invalid, status); 4145 return float64_default_nan(status); 4146 } 4147 float_raise(float_flag_divbyzero, status); 4148 return packFloat64( zSign, 0x7FF, 0 ); 4149 } 4150 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4151 } 4152 if ( aExp == 0 ) { 4153 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4154 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4155 } 4156 zExp = aExp - bExp + 0x3FD; 4157 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4158 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4159 if ( bSig <= ( aSig + aSig ) ) { 4160 aSig >>= 1; 4161 ++zExp; 4162 } 4163 zSig = estimateDiv128To64( aSig, 0, bSig ); 4164 if ( ( zSig & 0x1FF ) <= 2 ) { 4165 mul64To128( bSig, zSig, &term0, &term1 ); 4166 sub128( 
aSig, 0, term0, term1, &rem0, &rem1 ); 4167 while ( (int64_t) rem0 < 0 ) { 4168 --zSig; 4169 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4170 } 4171 zSig |= ( rem1 != 0 ); 4172 } 4173 return roundAndPackFloat64(zSign, zExp, zSig, status); 4174 4175 } 4176 4177 /*---------------------------------------------------------------------------- 4178 | Returns the remainder of the double-precision floating-point value `a' 4179 | with respect to the corresponding value `b'. The operation is performed 4180 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4181 *----------------------------------------------------------------------------*/ 4182 4183 float64 float64_rem(float64 a, float64 b, float_status *status) 4184 { 4185 flag aSign, zSign; 4186 int aExp, bExp, expDiff; 4187 uint64_t aSig, bSig; 4188 uint64_t q, alternateASig; 4189 int64_t sigMean; 4190 4191 a = float64_squash_input_denormal(a, status); 4192 b = float64_squash_input_denormal(b, status); 4193 aSig = extractFloat64Frac( a ); 4194 aExp = extractFloat64Exp( a ); 4195 aSign = extractFloat64Sign( a ); 4196 bSig = extractFloat64Frac( b ); 4197 bExp = extractFloat64Exp( b ); 4198 if ( aExp == 0x7FF ) { 4199 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4200 return propagateFloat64NaN(a, b, status); 4201 } 4202 float_raise(float_flag_invalid, status); 4203 return float64_default_nan(status); 4204 } 4205 if ( bExp == 0x7FF ) { 4206 if (bSig) { 4207 return propagateFloat64NaN(a, b, status); 4208 } 4209 return a; 4210 } 4211 if ( bExp == 0 ) { 4212 if ( bSig == 0 ) { 4213 float_raise(float_flag_invalid, status); 4214 return float64_default_nan(status); 4215 } 4216 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4217 } 4218 if ( aExp == 0 ) { 4219 if ( aSig == 0 ) return a; 4220 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4221 } 4222 expDiff = aExp - bExp; 4223 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4224 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4225 if ( expDiff < 0 
) { 4226 if ( expDiff < -1 ) return a; 4227 aSig >>= 1; 4228 } 4229 q = ( bSig <= aSig ); 4230 if ( q ) aSig -= bSig; 4231 expDiff -= 64; 4232 while ( 0 < expDiff ) { 4233 q = estimateDiv128To64( aSig, 0, bSig ); 4234 q = ( 2 < q ) ? q - 2 : 0; 4235 aSig = - ( ( bSig>>2 ) * q ); 4236 expDiff -= 62; 4237 } 4238 expDiff += 64; 4239 if ( 0 < expDiff ) { 4240 q = estimateDiv128To64( aSig, 0, bSig ); 4241 q = ( 2 < q ) ? q - 2 : 0; 4242 q >>= 64 - expDiff; 4243 bSig >>= 2; 4244 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4245 } 4246 else { 4247 aSig >>= 2; 4248 bSig >>= 2; 4249 } 4250 do { 4251 alternateASig = aSig; 4252 ++q; 4253 aSig -= bSig; 4254 } while ( 0 <= (int64_t) aSig ); 4255 sigMean = aSig + alternateASig; 4256 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4257 aSig = alternateASig; 4258 } 4259 zSign = ( (int64_t) aSig < 0 ); 4260 if ( zSign ) aSig = - aSig; 4261 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4262 4263 } 4264 4265 /*---------------------------------------------------------------------------- 4266 | Returns the result of multiplying the double-precision floating-point values 4267 | `a' and `b' then adding 'c', with no intermediate rounding step after the 4268 | multiplication. The operation is performed according to the IEC/IEEE 4269 | Standard for Binary Floating-Point Arithmetic 754-2008. 4270 | The flags argument allows the caller to select negation of the 4271 | addend, the intermediate product, or the final result. (The difference 4272 | between this and having the caller do a separate negation is that negating 4273 | externally will flip the sign bit on NaNs.) 
4274 *----------------------------------------------------------------------------*/ 4275 4276 float64 float64_muladd(float64 a, float64 b, float64 c, int flags, 4277 float_status *status) 4278 { 4279 flag aSign, bSign, cSign, zSign; 4280 int aExp, bExp, cExp, pExp, zExp, expDiff; 4281 uint64_t aSig, bSig, cSig; 4282 flag pInf, pZero, pSign; 4283 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; 4284 int shiftcount; 4285 flag signflip, infzero; 4286 4287 a = float64_squash_input_denormal(a, status); 4288 b = float64_squash_input_denormal(b, status); 4289 c = float64_squash_input_denormal(c, status); 4290 aSig = extractFloat64Frac(a); 4291 aExp = extractFloat64Exp(a); 4292 aSign = extractFloat64Sign(a); 4293 bSig = extractFloat64Frac(b); 4294 bExp = extractFloat64Exp(b); 4295 bSign = extractFloat64Sign(b); 4296 cSig = extractFloat64Frac(c); 4297 cExp = extractFloat64Exp(c); 4298 cSign = extractFloat64Sign(c); 4299 4300 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || 4301 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); 4302 4303 /* It is implementation-defined whether the cases of (0,inf,qnan) 4304 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 4305 * they return if they do), so we have to hand this information 4306 * off to the target-specific pick-a-NaN routine. 4307 */ 4308 if (((aExp == 0x7ff) && aSig) || 4309 ((bExp == 0x7ff) && bSig) || 4310 ((cExp == 0x7ff) && cSig)) { 4311 return propagateFloat64MulAddNaN(a, b, c, infzero, status); 4312 } 4313 4314 if (infzero) { 4315 float_raise(float_flag_invalid, status); 4316 return float64_default_nan(status); 4317 } 4318 4319 if (flags & float_muladd_negate_c) { 4320 cSign ^= 1; 4321 } 4322 4323 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 4324 4325 /* Work out the sign and type of the product */ 4326 pSign = aSign ^ bSign; 4327 if (flags & float_muladd_negate_product) { 4328 pSign ^= 1; 4329 } 4330 pInf = (aExp == 0x7ff) || (bExp == 0x7ff); 4331 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 4332 4333 if (cExp == 0x7ff) { 4334 if (pInf && (pSign ^ cSign)) { 4335 /* addition of opposite-signed infinities => InvalidOperation */ 4336 float_raise(float_flag_invalid, status); 4337 return float64_default_nan(status); 4338 } 4339 /* Otherwise generate an infinity of the same sign */ 4340 return packFloat64(cSign ^ signflip, 0x7ff, 0); 4341 } 4342 4343 if (pInf) { 4344 return packFloat64(pSign ^ signflip, 0x7ff, 0); 4345 } 4346 4347 if (pZero) { 4348 if (cExp == 0) { 4349 if (cSig == 0) { 4350 /* Adding two exact zeroes */ 4351 if (pSign == cSign) { 4352 zSign = pSign; 4353 } else if (status->float_rounding_mode == float_round_down) { 4354 zSign = 1; 4355 } else { 4356 zSign = 0; 4357 } 4358 return packFloat64(zSign ^ signflip, 0, 0); 4359 } 4360 /* Exact zero plus a denorm */ 4361 if (status->flush_to_zero) { 4362 float_raise(float_flag_output_denormal, status); 4363 return packFloat64(cSign ^ signflip, 0, 0); 4364 } 4365 } 4366 /* Zero plus something non-zero : just return the something */ 4367 if (flags & float_muladd_halve_result) { 4368 if (cExp == 0) { 4369 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4370 } 4371 /* Subtract one to halve, and one again because roundAndPackFloat64 4372 * wants one less than the true exponent. 
4373 */ 4374 cExp -= 2; 4375 cSig = (cSig | 0x0010000000000000ULL) << 10; 4376 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status); 4377 } 4378 return packFloat64(cSign ^ signflip, cExp, cSig); 4379 } 4380 4381 if (aExp == 0) { 4382 normalizeFloat64Subnormal(aSig, &aExp, &aSig); 4383 } 4384 if (bExp == 0) { 4385 normalizeFloat64Subnormal(bSig, &bExp, &bSig); 4386 } 4387 4388 /* Calculate the actual result a * b + c */ 4389 4390 /* Multiply first; this is easy. */ 4391 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff 4392 * because we want the true exponent, not the "one-less-than" 4393 * flavour that roundAndPackFloat64() takes. 4394 */ 4395 pExp = aExp + bExp - 0x3fe; 4396 aSig = (aSig | LIT64(0x0010000000000000))<<10; 4397 bSig = (bSig | LIT64(0x0010000000000000))<<11; 4398 mul64To128(aSig, bSig, &pSig0, &pSig1); 4399 if ((int64_t)(pSig0 << 1) >= 0) { 4400 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); 4401 pExp--; 4402 } 4403 4404 zSign = pSign ^ signflip; 4405 4406 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit 4407 * bit in position 126. 4408 */ 4409 if (cExp == 0) { 4410 if (!cSig) { 4411 /* Throw out the special case of c being an exact zero now */ 4412 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); 4413 if (flags & float_muladd_halve_result) { 4414 pExp--; 4415 } 4416 return roundAndPackFloat64(zSign, pExp - 1, 4417 pSig1, status); 4418 } 4419 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4420 } 4421 4422 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the 4423 * significand of the addend, with the explicit bit in position 126. 
4424 */ 4425 cSig0 = cSig << (126 - 64 - 52); 4426 cSig1 = 0; 4427 cSig0 |= LIT64(0x4000000000000000); 4428 expDiff = pExp - cExp; 4429 4430 if (pSign == cSign) { 4431 /* Addition */ 4432 if (expDiff > 0) { 4433 /* scale c to match p */ 4434 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4435 zExp = pExp; 4436 } else if (expDiff < 0) { 4437 /* scale p to match c */ 4438 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4439 zExp = cExp; 4440 } else { 4441 /* no scaling needed */ 4442 zExp = cExp; 4443 } 4444 /* Add significands and make sure explicit bit ends up in posn 126 */ 4445 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4446 if ((int64_t)zSig0 < 0) { 4447 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); 4448 } else { 4449 zExp--; 4450 } 4451 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); 4452 if (flags & float_muladd_halve_result) { 4453 zExp--; 4454 } 4455 return roundAndPackFloat64(zSign, zExp, zSig1, status); 4456 } else { 4457 /* Subtraction */ 4458 if (expDiff > 0) { 4459 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4460 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4461 zExp = pExp; 4462 } else if (expDiff < 0) { 4463 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4464 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4465 zExp = cExp; 4466 zSign ^= 1; 4467 } else { 4468 zExp = pExp; 4469 if (lt128(cSig0, cSig1, pSig0, pSig1)) { 4470 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4471 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { 4472 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4473 zSign ^= 1; 4474 } else { 4475 /* Exact zero */ 4476 zSign = signflip; 4477 if (status->float_rounding_mode == float_round_down) { 4478 zSign ^= 1; 4479 } 4480 return packFloat64(zSign, 0, 0); 4481 } 4482 } 4483 --zExp; 4484 /* Do the equivalent of normalizeRoundAndPackFloat64() but 4485 * starting with the significand in a pair of uint64_t. 
4486 */ 4487 if (zSig0) { 4488 shiftcount = countLeadingZeros64(zSig0) - 1; 4489 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); 4490 if (zSig1) { 4491 zSig0 |= 1; 4492 } 4493 zExp -= shiftcount; 4494 } else { 4495 shiftcount = countLeadingZeros64(zSig1); 4496 if (shiftcount == 0) { 4497 zSig0 = (zSig1 >> 1) | (zSig1 & 1); 4498 zExp -= 63; 4499 } else { 4500 shiftcount--; 4501 zSig0 = zSig1 << shiftcount; 4502 zExp -= (shiftcount + 64); 4503 } 4504 } 4505 if (flags & float_muladd_halve_result) { 4506 zExp--; 4507 } 4508 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4509 } 4510 } 4511 4512 /*---------------------------------------------------------------------------- 4513 | Returns the square root of the double-precision floating-point value `a'. 4514 | The operation is performed according to the IEC/IEEE Standard for Binary 4515 | Floating-Point Arithmetic. 4516 *----------------------------------------------------------------------------*/ 4517 4518 float64 float64_sqrt(float64 a, float_status *status) 4519 { 4520 flag aSign; 4521 int aExp, zExp; 4522 uint64_t aSig, zSig, doubleZSig; 4523 uint64_t rem0, rem1, term0, term1; 4524 a = float64_squash_input_denormal(a, status); 4525 4526 aSig = extractFloat64Frac( a ); 4527 aExp = extractFloat64Exp( a ); 4528 aSign = extractFloat64Sign( a ); 4529 if ( aExp == 0x7FF ) { 4530 if (aSig) { 4531 return propagateFloat64NaN(a, a, status); 4532 } 4533 if ( ! 
aSign ) return a; 4534 float_raise(float_flag_invalid, status); 4535 return float64_default_nan(status); 4536 } 4537 if ( aSign ) { 4538 if ( ( aExp | aSig ) == 0 ) return a; 4539 float_raise(float_flag_invalid, status); 4540 return float64_default_nan(status); 4541 } 4542 if ( aExp == 0 ) { 4543 if ( aSig == 0 ) return float64_zero; 4544 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4545 } 4546 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4547 aSig |= LIT64( 0x0010000000000000 ); 4548 zSig = estimateSqrt32( aExp, aSig>>21 ); 4549 aSig <<= 9 - ( aExp & 1 ); 4550 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4551 if ( ( zSig & 0x1FF ) <= 5 ) { 4552 doubleZSig = zSig<<1; 4553 mul64To128( zSig, zSig, &term0, &term1 ); 4554 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4555 while ( (int64_t) rem0 < 0 ) { 4556 --zSig; 4557 doubleZSig -= 2; 4558 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4559 } 4560 zSig |= ( ( rem0 | rem1 ) != 0 ); 4561 } 4562 return roundAndPackFloat64(0, zExp, zSig, status); 4563 4564 } 4565 4566 /*---------------------------------------------------------------------------- 4567 | Returns the binary log of the double-precision floating-point value `a'. 4568 | The operation is performed according to the IEC/IEEE Standard for Binary 4569 | Floating-Point Arithmetic. 
4570 *----------------------------------------------------------------------------*/ 4571 float64 float64_log2(float64 a, float_status *status) 4572 { 4573 flag aSign, zSign; 4574 int aExp; 4575 uint64_t aSig, aSig0, aSig1, zSig, i; 4576 a = float64_squash_input_denormal(a, status); 4577 4578 aSig = extractFloat64Frac( a ); 4579 aExp = extractFloat64Exp( a ); 4580 aSign = extractFloat64Sign( a ); 4581 4582 if ( aExp == 0 ) { 4583 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4584 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4585 } 4586 if ( aSign ) { 4587 float_raise(float_flag_invalid, status); 4588 return float64_default_nan(status); 4589 } 4590 if ( aExp == 0x7FF ) { 4591 if (aSig) { 4592 return propagateFloat64NaN(a, float64_zero, status); 4593 } 4594 return a; 4595 } 4596 4597 aExp -= 0x3FF; 4598 aSig |= LIT64( 0x0010000000000000 ); 4599 zSign = aExp < 0; 4600 zSig = (uint64_t)aExp << 52; 4601 for (i = 1LL << 51; i > 0; i >>= 1) { 4602 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4603 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4604 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4605 aSig >>= 1; 4606 zSig |= i; 4607 } 4608 } 4609 4610 if ( zSign ) 4611 zSig = -zSig; 4612 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4613 } 4614 4615 /*---------------------------------------------------------------------------- 4616 | Returns 1 if the double-precision floating-point value `a' is equal to the 4617 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4618 | if either operand is a NaN. Otherwise, the comparison is performed 4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4620 *----------------------------------------------------------------------------*/ 4621 4622 int float64_eq(float64 a, float64 b, float_status *status) 4623 { 4624 uint64_t av, bv; 4625 a = float64_squash_input_denormal(a, status); 4626 b = float64_squash_input_denormal(b, status); 4627 4628 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4629 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4630 ) { 4631 float_raise(float_flag_invalid, status); 4632 return 0; 4633 } 4634 av = float64_val(a); 4635 bv = float64_val(b); 4636 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4637 4638 } 4639 4640 /*---------------------------------------------------------------------------- 4641 | Returns 1 if the double-precision floating-point value `a' is less than or 4642 | equal to the corresponding value `b', and 0 otherwise. The invalid 4643 | exception is raised if either operand is a NaN. The comparison is performed 4644 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4645 *----------------------------------------------------------------------------*/ 4646 4647 int float64_le(float64 a, float64 b, float_status *status) 4648 { 4649 flag aSign, bSign; 4650 uint64_t av, bv; 4651 a = float64_squash_input_denormal(a, status); 4652 b = float64_squash_input_denormal(b, status); 4653 4654 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4655 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4656 ) { 4657 float_raise(float_flag_invalid, status); 4658 return 0; 4659 } 4660 aSign = extractFloat64Sign( a ); 4661 bSign = extractFloat64Sign( b ); 4662 av = float64_val(a); 4663 bv = float64_val(b); 4664 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4665 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4666 4667 } 4668 4669 /*---------------------------------------------------------------------------- 4670 | Returns 1 if the double-precision floating-point value `a' is less than 4671 | the corresponding value `b', and 0 otherwise. The invalid exception is 4672 | raised if either operand is a NaN. The comparison is performed according 4673 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4674 *----------------------------------------------------------------------------*/ 4675 4676 int float64_lt(float64 a, float64 b, float_status *status) 4677 { 4678 flag aSign, bSign; 4679 uint64_t av, bv; 4680 4681 a = float64_squash_input_denormal(a, status); 4682 b = float64_squash_input_denormal(b, status); 4683 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4684 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4685 ) { 4686 float_raise(float_flag_invalid, status); 4687 return 0; 4688 } 4689 aSign = extractFloat64Sign( a ); 4690 bSign = extractFloat64Sign( b ); 4691 av = float64_val(a); 4692 bv = float64_val(b); 4693 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4694 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4695 4696 } 4697 4698 /*---------------------------------------------------------------------------- 4699 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4700 | be compared, and 0 otherwise. The invalid exception is raised if either 4701 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4702 | Standard for Binary Floating-Point Arithmetic. 4703 *----------------------------------------------------------------------------*/ 4704 4705 int float64_unordered(float64 a, float64 b, float_status *status) 4706 { 4707 a = float64_squash_input_denormal(a, status); 4708 b = float64_squash_input_denormal(b, status); 4709 4710 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4711 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4712 ) { 4713 float_raise(float_flag_invalid, status); 4714 return 1; 4715 } 4716 return 0; 4717 } 4718 4719 /*---------------------------------------------------------------------------- 4720 | Returns 1 if the double-precision floating-point value `a' is equal to the 4721 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4722 | exception.The comparison is performed according to the IEC/IEEE Standard 4723 | for Binary Floating-Point Arithmetic. 4724 *----------------------------------------------------------------------------*/ 4725 4726 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4727 { 4728 uint64_t av, bv; 4729 a = float64_squash_input_denormal(a, status); 4730 b = float64_squash_input_denormal(b, status); 4731 4732 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4733 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4734 ) { 4735 if (float64_is_signaling_nan(a, status) 4736 || float64_is_signaling_nan(b, status)) { 4737 float_raise(float_flag_invalid, status); 4738 } 4739 return 0; 4740 } 4741 av = float64_val(a); 4742 bv = float64_val(b); 4743 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4744 4745 } 4746 4747 /*---------------------------------------------------------------------------- 4748 | Returns 1 if the double-precision floating-point value `a' is less than or 4749 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4750 | cause an exception. Otherwise, the comparison is performed according to the 4751 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4752 *----------------------------------------------------------------------------*/ 4753 4754 int float64_le_quiet(float64 a, float64 b, float_status *status) 4755 { 4756 flag aSign, bSign; 4757 uint64_t av, bv; 4758 a = float64_squash_input_denormal(a, status); 4759 b = float64_squash_input_denormal(b, status); 4760 4761 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4762 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4763 ) { 4764 if (float64_is_signaling_nan(a, status) 4765 || float64_is_signaling_nan(b, status)) { 4766 float_raise(float_flag_invalid, status); 4767 } 4768 return 0; 4769 } 4770 aSign = extractFloat64Sign( a ); 4771 bSign = extractFloat64Sign( b ); 4772 av = float64_val(a); 4773 bv = float64_val(b); 4774 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4775 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4776 4777 } 4778 4779 /*---------------------------------------------------------------------------- 4780 | Returns 1 if the double-precision floating-point value `a' is less than 4781 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4782 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4783 | Standard for Binary Floating-Point Arithmetic. 
4784 *----------------------------------------------------------------------------*/ 4785 4786 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4787 { 4788 flag aSign, bSign; 4789 uint64_t av, bv; 4790 a = float64_squash_input_denormal(a, status); 4791 b = float64_squash_input_denormal(b, status); 4792 4793 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4794 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4795 ) { 4796 if (float64_is_signaling_nan(a, status) 4797 || float64_is_signaling_nan(b, status)) { 4798 float_raise(float_flag_invalid, status); 4799 } 4800 return 0; 4801 } 4802 aSign = extractFloat64Sign( a ); 4803 bSign = extractFloat64Sign( b ); 4804 av = float64_val(a); 4805 bv = float64_val(b); 4806 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4807 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4808 4809 } 4810 4811 /*---------------------------------------------------------------------------- 4812 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4813 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4814 | comparison is performed according to the IEC/IEEE Standard for Binary 4815 | Floating-Point Arithmetic. 
4816 *----------------------------------------------------------------------------*/ 4817 4818 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4819 { 4820 a = float64_squash_input_denormal(a, status); 4821 b = float64_squash_input_denormal(b, status); 4822 4823 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4824 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4825 ) { 4826 if (float64_is_signaling_nan(a, status) 4827 || float64_is_signaling_nan(b, status)) { 4828 float_raise(float_flag_invalid, status); 4829 } 4830 return 1; 4831 } 4832 return 0; 4833 } 4834 4835 /*---------------------------------------------------------------------------- 4836 | Returns the result of converting the extended double-precision floating- 4837 | point value `a' to the 32-bit two's complement integer format. The 4838 | conversion is performed according to the IEC/IEEE Standard for Binary 4839 | Floating-Point Arithmetic---which means in particular that the conversion 4840 | is rounded according to the current rounding mode. If `a' is a NaN, the 4841 | largest positive integer is returned. Otherwise, if the conversion 4842 | overflows, the largest integer with the same sign as `a' is returned. 
4843 *----------------------------------------------------------------------------*/ 4844 4845 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4846 { 4847 flag aSign; 4848 int32_t aExp, shiftCount; 4849 uint64_t aSig; 4850 4851 if (floatx80_invalid_encoding(a)) { 4852 float_raise(float_flag_invalid, status); 4853 return 1 << 31; 4854 } 4855 aSig = extractFloatx80Frac( a ); 4856 aExp = extractFloatx80Exp( a ); 4857 aSign = extractFloatx80Sign( a ); 4858 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4859 shiftCount = 0x4037 - aExp; 4860 if ( shiftCount <= 0 ) shiftCount = 1; 4861 shift64RightJamming( aSig, shiftCount, &aSig ); 4862 return roundAndPackInt32(aSign, aSig, status); 4863 4864 } 4865 4866 /*---------------------------------------------------------------------------- 4867 | Returns the result of converting the extended double-precision floating- 4868 | point value `a' to the 32-bit two's complement integer format. The 4869 | conversion is performed according to the IEC/IEEE Standard for Binary 4870 | Floating-Point Arithmetic, except that the conversion is always rounded 4871 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4872 | Otherwise, if the conversion overflows, the largest integer with the same 4873 | sign as `a' is returned. 
4874 *----------------------------------------------------------------------------*/ 4875 4876 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4877 { 4878 flag aSign; 4879 int32_t aExp, shiftCount; 4880 uint64_t aSig, savedASig; 4881 int32_t z; 4882 4883 if (floatx80_invalid_encoding(a)) { 4884 float_raise(float_flag_invalid, status); 4885 return 1 << 31; 4886 } 4887 aSig = extractFloatx80Frac( a ); 4888 aExp = extractFloatx80Exp( a ); 4889 aSign = extractFloatx80Sign( a ); 4890 if ( 0x401E < aExp ) { 4891 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4892 goto invalid; 4893 } 4894 else if ( aExp < 0x3FFF ) { 4895 if (aExp || aSig) { 4896 status->float_exception_flags |= float_flag_inexact; 4897 } 4898 return 0; 4899 } 4900 shiftCount = 0x403E - aExp; 4901 savedASig = aSig; 4902 aSig >>= shiftCount; 4903 z = aSig; 4904 if ( aSign ) z = - z; 4905 if ( ( z < 0 ) ^ aSign ) { 4906 invalid: 4907 float_raise(float_flag_invalid, status); 4908 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4909 } 4910 if ( ( aSig<<shiftCount ) != savedASig ) { 4911 status->float_exception_flags |= float_flag_inexact; 4912 } 4913 return z; 4914 4915 } 4916 4917 /*---------------------------------------------------------------------------- 4918 | Returns the result of converting the extended double-precision floating- 4919 | point value `a' to the 64-bit two's complement integer format. The 4920 | conversion is performed according to the IEC/IEEE Standard for Binary 4921 | Floating-Point Arithmetic---which means in particular that the conversion 4922 | is rounded according to the current rounding mode. If `a' is a NaN, 4923 | the largest positive integer is returned. Otherwise, if the conversion 4924 | overflows, the largest integer with the same sign as `a' is returned. 
4925 *----------------------------------------------------------------------------*/ 4926 4927 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4928 { 4929 flag aSign; 4930 int32_t aExp, shiftCount; 4931 uint64_t aSig, aSigExtra; 4932 4933 if (floatx80_invalid_encoding(a)) { 4934 float_raise(float_flag_invalid, status); 4935 return 1ULL << 63; 4936 } 4937 aSig = extractFloatx80Frac( a ); 4938 aExp = extractFloatx80Exp( a ); 4939 aSign = extractFloatx80Sign( a ); 4940 shiftCount = 0x403E - aExp; 4941 if ( shiftCount <= 0 ) { 4942 if ( shiftCount ) { 4943 float_raise(float_flag_invalid, status); 4944 if ( ! aSign 4945 || ( ( aExp == 0x7FFF ) 4946 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4947 ) { 4948 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4949 } 4950 return (int64_t) LIT64( 0x8000000000000000 ); 4951 } 4952 aSigExtra = 0; 4953 } 4954 else { 4955 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4956 } 4957 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4958 4959 } 4960 4961 /*---------------------------------------------------------------------------- 4962 | Returns the result of converting the extended double-precision floating- 4963 | point value `a' to the 64-bit two's complement integer format. The 4964 | conversion is performed according to the IEC/IEEE Standard for Binary 4965 | Floating-Point Arithmetic, except that the conversion is always rounded 4966 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4967 | Otherwise, if the conversion overflows, the largest integer with the same 4968 | sign as `a' is returned. 
4969 *----------------------------------------------------------------------------*/ 4970 4971 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4972 { 4973 flag aSign; 4974 int32_t aExp, shiftCount; 4975 uint64_t aSig; 4976 int64_t z; 4977 4978 if (floatx80_invalid_encoding(a)) { 4979 float_raise(float_flag_invalid, status); 4980 return 1ULL << 63; 4981 } 4982 aSig = extractFloatx80Frac( a ); 4983 aExp = extractFloatx80Exp( a ); 4984 aSign = extractFloatx80Sign( a ); 4985 shiftCount = aExp - 0x403E; 4986 if ( 0 <= shiftCount ) { 4987 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4988 if ( ( a.high != 0xC03E ) || aSig ) { 4989 float_raise(float_flag_invalid, status); 4990 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4991 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4992 } 4993 } 4994 return (int64_t) LIT64( 0x8000000000000000 ); 4995 } 4996 else if ( aExp < 0x3FFF ) { 4997 if (aExp | aSig) { 4998 status->float_exception_flags |= float_flag_inexact; 4999 } 5000 return 0; 5001 } 5002 z = aSig>>( - shiftCount ); 5003 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 5004 status->float_exception_flags |= float_flag_inexact; 5005 } 5006 if ( aSign ) z = - z; 5007 return z; 5008 5009 } 5010 5011 /*---------------------------------------------------------------------------- 5012 | Returns the result of converting the extended double-precision floating- 5013 | point value `a' to the single-precision floating-point format. The 5014 | conversion is performed according to the IEC/IEEE Standard for Binary 5015 | Floating-Point Arithmetic. 
5016 *----------------------------------------------------------------------------*/ 5017 5018 float32 floatx80_to_float32(floatx80 a, float_status *status) 5019 { 5020 flag aSign; 5021 int32_t aExp; 5022 uint64_t aSig; 5023 5024 if (floatx80_invalid_encoding(a)) { 5025 float_raise(float_flag_invalid, status); 5026 return float32_default_nan(status); 5027 } 5028 aSig = extractFloatx80Frac( a ); 5029 aExp = extractFloatx80Exp( a ); 5030 aSign = extractFloatx80Sign( a ); 5031 if ( aExp == 0x7FFF ) { 5032 if ( (uint64_t) ( aSig<<1 ) ) { 5033 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5034 } 5035 return packFloat32( aSign, 0xFF, 0 ); 5036 } 5037 shift64RightJamming( aSig, 33, &aSig ); 5038 if ( aExp || aSig ) aExp -= 0x3F81; 5039 return roundAndPackFloat32(aSign, aExp, aSig, status); 5040 5041 } 5042 5043 /*---------------------------------------------------------------------------- 5044 | Returns the result of converting the extended double-precision floating- 5045 | point value `a' to the double-precision floating-point format. The 5046 | conversion is performed according to the IEC/IEEE Standard for Binary 5047 | Floating-Point Arithmetic. 
5048 *----------------------------------------------------------------------------*/ 5049 5050 float64 floatx80_to_float64(floatx80 a, float_status *status) 5051 { 5052 flag aSign; 5053 int32_t aExp; 5054 uint64_t aSig, zSig; 5055 5056 if (floatx80_invalid_encoding(a)) { 5057 float_raise(float_flag_invalid, status); 5058 return float64_default_nan(status); 5059 } 5060 aSig = extractFloatx80Frac( a ); 5061 aExp = extractFloatx80Exp( a ); 5062 aSign = extractFloatx80Sign( a ); 5063 if ( aExp == 0x7FFF ) { 5064 if ( (uint64_t) ( aSig<<1 ) ) { 5065 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5066 } 5067 return packFloat64( aSign, 0x7FF, 0 ); 5068 } 5069 shift64RightJamming( aSig, 1, &zSig ); 5070 if ( aExp || aSig ) aExp -= 0x3C01; 5071 return roundAndPackFloat64(aSign, aExp, zSig, status); 5072 5073 } 5074 5075 /*---------------------------------------------------------------------------- 5076 | Returns the result of converting the extended double-precision floating- 5077 | point value `a' to the quadruple-precision floating-point format. The 5078 | conversion is performed according to the IEC/IEEE Standard for Binary 5079 | Floating-Point Arithmetic. 
5080 *----------------------------------------------------------------------------*/ 5081 5082 float128 floatx80_to_float128(floatx80 a, float_status *status) 5083 { 5084 flag aSign; 5085 int aExp; 5086 uint64_t aSig, zSig0, zSig1; 5087 5088 if (floatx80_invalid_encoding(a)) { 5089 float_raise(float_flag_invalid, status); 5090 return float128_default_nan(status); 5091 } 5092 aSig = extractFloatx80Frac( a ); 5093 aExp = extractFloatx80Exp( a ); 5094 aSign = extractFloatx80Sign( a ); 5095 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5096 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5097 } 5098 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5099 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5100 5101 } 5102 5103 /*---------------------------------------------------------------------------- 5104 | Rounds the extended double-precision floating-point value `a' 5105 | to the precision provided by floatx80_rounding_precision and returns the 5106 | result as an extended double-precision floating-point value. 5107 | The operation is performed according to the IEC/IEEE Standard for Binary 5108 | Floating-Point Arithmetic. 5109 *----------------------------------------------------------------------------*/ 5110 5111 floatx80 floatx80_round(floatx80 a, float_status *status) 5112 { 5113 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5114 extractFloatx80Sign(a), 5115 extractFloatx80Exp(a), 5116 extractFloatx80Frac(a), 0, status); 5117 } 5118 5119 /*---------------------------------------------------------------------------- 5120 | Rounds the extended double-precision floating-point value `a' to an integer, 5121 | and returns the result as an extended quadruple-precision floating-point 5122 | value. The operation is performed according to the IEC/IEEE Standard for 5123 | Binary Floating-Point Arithmetic. 
5124 *----------------------------------------------------------------------------*/ 5125 5126 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5127 { 5128 flag aSign; 5129 int32_t aExp; 5130 uint64_t lastBitMask, roundBitsMask; 5131 floatx80 z; 5132 5133 if (floatx80_invalid_encoding(a)) { 5134 float_raise(float_flag_invalid, status); 5135 return floatx80_default_nan(status); 5136 } 5137 aExp = extractFloatx80Exp( a ); 5138 if ( 0x403E <= aExp ) { 5139 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5140 return propagateFloatx80NaN(a, a, status); 5141 } 5142 return a; 5143 } 5144 if ( aExp < 0x3FFF ) { 5145 if ( ( aExp == 0 ) 5146 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5147 return a; 5148 } 5149 status->float_exception_flags |= float_flag_inexact; 5150 aSign = extractFloatx80Sign( a ); 5151 switch (status->float_rounding_mode) { 5152 case float_round_nearest_even: 5153 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5154 ) { 5155 return 5156 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5157 } 5158 break; 5159 case float_round_ties_away: 5160 if (aExp == 0x3FFE) { 5161 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5162 } 5163 break; 5164 case float_round_down: 5165 return 5166 aSign ? 5167 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5168 : packFloatx80( 0, 0, 0 ); 5169 case float_round_up: 5170 return 5171 aSign ? 
packFloatx80( 1, 0, 0 ) 5172 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5173 } 5174 return packFloatx80( aSign, 0, 0 ); 5175 } 5176 lastBitMask = 1; 5177 lastBitMask <<= 0x403E - aExp; 5178 roundBitsMask = lastBitMask - 1; 5179 z = a; 5180 switch (status->float_rounding_mode) { 5181 case float_round_nearest_even: 5182 z.low += lastBitMask>>1; 5183 if ((z.low & roundBitsMask) == 0) { 5184 z.low &= ~lastBitMask; 5185 } 5186 break; 5187 case float_round_ties_away: 5188 z.low += lastBitMask >> 1; 5189 break; 5190 case float_round_to_zero: 5191 break; 5192 case float_round_up: 5193 if (!extractFloatx80Sign(z)) { 5194 z.low += roundBitsMask; 5195 } 5196 break; 5197 case float_round_down: 5198 if (extractFloatx80Sign(z)) { 5199 z.low += roundBitsMask; 5200 } 5201 break; 5202 default: 5203 abort(); 5204 } 5205 z.low &= ~ roundBitsMask; 5206 if ( z.low == 0 ) { 5207 ++z.high; 5208 z.low = LIT64( 0x8000000000000000 ); 5209 } 5210 if (z.low != a.low) { 5211 status->float_exception_flags |= float_flag_inexact; 5212 } 5213 return z; 5214 5215 } 5216 5217 /*---------------------------------------------------------------------------- 5218 | Returns the result of adding the absolute values of the extended double- 5219 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5220 | negated before being returned. `zSign' is ignored if the result is a NaN. 5221 | The addition is performed according to the IEC/IEEE Standard for Binary 5222 | Floating-Point Arithmetic. 
5223 *----------------------------------------------------------------------------*/ 5224 5225 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5226 float_status *status) 5227 { 5228 int32_t aExp, bExp, zExp; 5229 uint64_t aSig, bSig, zSig0, zSig1; 5230 int32_t expDiff; 5231 5232 aSig = extractFloatx80Frac( a ); 5233 aExp = extractFloatx80Exp( a ); 5234 bSig = extractFloatx80Frac( b ); 5235 bExp = extractFloatx80Exp( b ); 5236 expDiff = aExp - bExp; 5237 if ( 0 < expDiff ) { 5238 if ( aExp == 0x7FFF ) { 5239 if ((uint64_t)(aSig << 1)) { 5240 return propagateFloatx80NaN(a, b, status); 5241 } 5242 return a; 5243 } 5244 if ( bExp == 0 ) --expDiff; 5245 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5246 zExp = aExp; 5247 } 5248 else if ( expDiff < 0 ) { 5249 if ( bExp == 0x7FFF ) { 5250 if ((uint64_t)(bSig << 1)) { 5251 return propagateFloatx80NaN(a, b, status); 5252 } 5253 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5254 } 5255 if ( aExp == 0 ) ++expDiff; 5256 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5257 zExp = bExp; 5258 } 5259 else { 5260 if ( aExp == 0x7FFF ) { 5261 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5262 return propagateFloatx80NaN(a, b, status); 5263 } 5264 return a; 5265 } 5266 zSig1 = 0; 5267 zSig0 = aSig + bSig; 5268 if ( aExp == 0 ) { 5269 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5270 goto roundAndPack; 5271 } 5272 zExp = aExp; 5273 goto shiftRight1; 5274 } 5275 zSig0 = aSig + bSig; 5276 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5277 shiftRight1: 5278 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5279 zSig0 |= LIT64( 0x8000000000000000 ); 5280 ++zExp; 5281 roundAndPack: 5282 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5283 zSign, zExp, zSig0, zSig1, status); 5284 } 5285 5286 /*---------------------------------------------------------------------------- 5287 | Returns the result of subtracting the absolute values of the 
extended 5288 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5289 | difference is negated before being returned. `zSign' is ignored if the 5290 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5291 | Standard for Binary Floating-Point Arithmetic. 5292 *----------------------------------------------------------------------------*/ 5293 5294 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5295 float_status *status) 5296 { 5297 int32_t aExp, bExp, zExp; 5298 uint64_t aSig, bSig, zSig0, zSig1; 5299 int32_t expDiff; 5300 5301 aSig = extractFloatx80Frac( a ); 5302 aExp = extractFloatx80Exp( a ); 5303 bSig = extractFloatx80Frac( b ); 5304 bExp = extractFloatx80Exp( b ); 5305 expDiff = aExp - bExp; 5306 if ( 0 < expDiff ) goto aExpBigger; 5307 if ( expDiff < 0 ) goto bExpBigger; 5308 if ( aExp == 0x7FFF ) { 5309 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5310 return propagateFloatx80NaN(a, b, status); 5311 } 5312 float_raise(float_flag_invalid, status); 5313 return floatx80_default_nan(status); 5314 } 5315 if ( aExp == 0 ) { 5316 aExp = 1; 5317 bExp = 1; 5318 } 5319 zSig1 = 0; 5320 if ( bSig < aSig ) goto aBigger; 5321 if ( aSig < bSig ) goto bBigger; 5322 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5323 bExpBigger: 5324 if ( bExp == 0x7FFF ) { 5325 if ((uint64_t)(bSig << 1)) { 5326 return propagateFloatx80NaN(a, b, status); 5327 } 5328 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5329 } 5330 if ( aExp == 0 ) ++expDiff; 5331 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5332 bBigger: 5333 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5334 zExp = bExp; 5335 zSign ^= 1; 5336 goto normalizeRoundAndPack; 5337 aExpBigger: 5338 if ( aExp == 0x7FFF ) { 5339 if ((uint64_t)(aSig << 1)) { 5340 return propagateFloatx80NaN(a, b, status); 5341 } 5342 return a; 5343 } 5344 if ( bExp == 0 ) --expDiff; 5345 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5346 aBigger: 5347 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5348 zExp = aExp; 5349 normalizeRoundAndPack: 5350 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5351 zSign, zExp, zSig0, zSig1, status); 5352 } 5353 5354 /*---------------------------------------------------------------------------- 5355 | Returns the result of adding the extended double-precision floating-point 5356 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5357 | Standard for Binary Floating-Point Arithmetic. 5358 *----------------------------------------------------------------------------*/ 5359 5360 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5361 { 5362 flag aSign, bSign; 5363 5364 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5365 float_raise(float_flag_invalid, status); 5366 return floatx80_default_nan(status); 5367 } 5368 aSign = extractFloatx80Sign( a ); 5369 bSign = extractFloatx80Sign( b ); 5370 if ( aSign == bSign ) { 5371 return addFloatx80Sigs(a, b, aSign, status); 5372 } 5373 else { 5374 return subFloatx80Sigs(a, b, aSign, status); 5375 } 5376 5377 } 5378 5379 /*---------------------------------------------------------------------------- 5380 | Returns the result of subtracting the extended double-precision floating- 5381 | point values `a' and `b'. The operation is performed according to the 5382 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5383 *----------------------------------------------------------------------------*/ 5384 5385 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5386 { 5387 flag aSign, bSign; 5388 5389 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5390 float_raise(float_flag_invalid, status); 5391 return floatx80_default_nan(status); 5392 } 5393 aSign = extractFloatx80Sign( a ); 5394 bSign = extractFloatx80Sign( b ); 5395 if ( aSign == bSign ) { 5396 return subFloatx80Sigs(a, b, aSign, status); 5397 } 5398 else { 5399 return addFloatx80Sigs(a, b, aSign, status); 5400 } 5401 5402 } 5403 5404 /*---------------------------------------------------------------------------- 5405 | Returns the result of multiplying the extended double-precision floating- 5406 | point values `a' and `b'. The operation is performed according to the 5407 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5408 *----------------------------------------------------------------------------*/ 5409 5410 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5411 { 5412 flag aSign, bSign, zSign; 5413 int32_t aExp, bExp, zExp; 5414 uint64_t aSig, bSig, zSig0, zSig1; 5415 5416 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5417 float_raise(float_flag_invalid, status); 5418 return floatx80_default_nan(status); 5419 } 5420 aSig = extractFloatx80Frac( a ); 5421 aExp = extractFloatx80Exp( a ); 5422 aSign = extractFloatx80Sign( a ); 5423 bSig = extractFloatx80Frac( b ); 5424 bExp = extractFloatx80Exp( b ); 5425 bSign = extractFloatx80Sign( b ); 5426 zSign = aSign ^ bSign; 5427 if ( aExp == 0x7FFF ) { 5428 if ( (uint64_t) ( aSig<<1 ) 5429 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5430 return propagateFloatx80NaN(a, b, status); 5431 } 5432 if ( ( bExp | bSig ) == 0 ) goto invalid; 5433 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5434 } 5435 if ( bExp == 0x7FFF ) { 5436 if ((uint64_t)(bSig << 1)) { 5437 
return propagateFloatx80NaN(a, b, status); 5438 } 5439 if ( ( aExp | aSig ) == 0 ) { 5440 invalid: 5441 float_raise(float_flag_invalid, status); 5442 return floatx80_default_nan(status); 5443 } 5444 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5445 } 5446 if ( aExp == 0 ) { 5447 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5448 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5449 } 5450 if ( bExp == 0 ) { 5451 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5452 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5453 } 5454 zExp = aExp + bExp - 0x3FFE; 5455 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5456 if ( 0 < (int64_t) zSig0 ) { 5457 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5458 --zExp; 5459 } 5460 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5461 zSign, zExp, zSig0, zSig1, status); 5462 } 5463 5464 /*---------------------------------------------------------------------------- 5465 | Returns the result of dividing the extended double-precision floating-point 5466 | value `a' by the corresponding value `b'. The operation is performed 5467 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5468 *----------------------------------------------------------------------------*/ 5469 5470 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5471 { 5472 flag aSign, bSign, zSign; 5473 int32_t aExp, bExp, zExp; 5474 uint64_t aSig, bSig, zSig0, zSig1; 5475 uint64_t rem0, rem1, rem2, term0, term1, term2; 5476 5477 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5478 float_raise(float_flag_invalid, status); 5479 return floatx80_default_nan(status); 5480 } 5481 aSig = extractFloatx80Frac( a ); 5482 aExp = extractFloatx80Exp( a ); 5483 aSign = extractFloatx80Sign( a ); 5484 bSig = extractFloatx80Frac( b ); 5485 bExp = extractFloatx80Exp( b ); 5486 bSign = extractFloatx80Sign( b ); 5487 zSign = aSign ^ bSign; 5488 if ( aExp == 0x7FFF ) { 5489 if ((uint64_t)(aSig << 1)) { 5490 return propagateFloatx80NaN(a, b, status); 5491 } 5492 if ( bExp == 0x7FFF ) { 5493 if ((uint64_t)(bSig << 1)) { 5494 return propagateFloatx80NaN(a, b, status); 5495 } 5496 goto invalid; 5497 } 5498 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5499 } 5500 if ( bExp == 0x7FFF ) { 5501 if ((uint64_t)(bSig << 1)) { 5502 return propagateFloatx80NaN(a, b, status); 5503 } 5504 return packFloatx80( zSign, 0, 0 ); 5505 } 5506 if ( bExp == 0 ) { 5507 if ( bSig == 0 ) { 5508 if ( ( aExp | aSig ) == 0 ) { 5509 invalid: 5510 float_raise(float_flag_invalid, status); 5511 return floatx80_default_nan(status); 5512 } 5513 float_raise(float_flag_divbyzero, status); 5514 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5515 } 5516 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5517 } 5518 if ( aExp == 0 ) { 5519 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5520 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5521 } 5522 zExp = aExp - bExp + 0x3FFE; 5523 rem1 = 0; 5524 if ( bSig <= aSig ) { 5525 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5526 ++zExp; 5527 } 5528 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5529 mul64To128( 
bSig, zSig0, &term0, &term1 ); 5530 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5531 while ( (int64_t) rem0 < 0 ) { 5532 --zSig0; 5533 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5534 } 5535 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5536 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5537 mul64To128( bSig, zSig1, &term1, &term2 ); 5538 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5539 while ( (int64_t) rem1 < 0 ) { 5540 --zSig1; 5541 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5542 } 5543 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5544 } 5545 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5546 zSign, zExp, zSig0, zSig1, status); 5547 } 5548 5549 /*---------------------------------------------------------------------------- 5550 | Returns the remainder of the extended double-precision floating-point value 5551 | `a' with respect to the corresponding value `b'. The operation is performed 5552 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5553 *----------------------------------------------------------------------------*/ 5554 5555 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5556 { 5557 flag aSign, zSign; 5558 int32_t aExp, bExp, expDiff; 5559 uint64_t aSig0, aSig1, bSig; 5560 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5561 5562 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5563 float_raise(float_flag_invalid, status); 5564 return floatx80_default_nan(status); 5565 } 5566 aSig0 = extractFloatx80Frac( a ); 5567 aExp = extractFloatx80Exp( a ); 5568 aSign = extractFloatx80Sign( a ); 5569 bSig = extractFloatx80Frac( b ); 5570 bExp = extractFloatx80Exp( b ); 5571 if ( aExp == 0x7FFF ) { 5572 if ( (uint64_t) ( aSig0<<1 ) 5573 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5574 return propagateFloatx80NaN(a, b, status); 5575 } 5576 goto invalid; 5577 } 5578 if ( bExp == 0x7FFF ) { 5579 if ((uint64_t)(bSig << 1)) { 5580 return propagateFloatx80NaN(a, b, status); 
5581 } 5582 return a; 5583 } 5584 if ( bExp == 0 ) { 5585 if ( bSig == 0 ) { 5586 invalid: 5587 float_raise(float_flag_invalid, status); 5588 return floatx80_default_nan(status); 5589 } 5590 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5591 } 5592 if ( aExp == 0 ) { 5593 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5594 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5595 } 5596 bSig |= LIT64( 0x8000000000000000 ); 5597 zSign = aSign; 5598 expDiff = aExp - bExp; 5599 aSig1 = 0; 5600 if ( expDiff < 0 ) { 5601 if ( expDiff < -1 ) return a; 5602 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5603 expDiff = 0; 5604 } 5605 q = ( bSig <= aSig0 ); 5606 if ( q ) aSig0 -= bSig; 5607 expDiff -= 64; 5608 while ( 0 < expDiff ) { 5609 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5610 q = ( 2 < q ) ? q - 2 : 0; 5611 mul64To128( bSig, q, &term0, &term1 ); 5612 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5613 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5614 expDiff -= 62; 5615 } 5616 expDiff += 64; 5617 if ( 0 < expDiff ) { 5618 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5619 q = ( 2 < q ) ? q - 2 : 0; 5620 q >>= 64 - expDiff; 5621 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5622 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5623 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5624 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5625 ++q; 5626 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5627 } 5628 } 5629 else { 5630 term1 = 0; 5631 term0 = bSig; 5632 } 5633 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5634 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5635 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5636 && ( q & 1 ) ) 5637 ) { 5638 aSig0 = alternateASig0; 5639 aSig1 = alternateASig1; 5640 zSign = ! 
zSign; 5641 } 5642 return 5643 normalizeRoundAndPackFloatx80( 5644 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5645 5646 } 5647 5648 /*---------------------------------------------------------------------------- 5649 | Returns the square root of the extended double-precision floating-point 5650 | value `a'. The operation is performed according to the IEC/IEEE Standard 5651 | for Binary Floating-Point Arithmetic. 5652 *----------------------------------------------------------------------------*/ 5653 5654 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5655 { 5656 flag aSign; 5657 int32_t aExp, zExp; 5658 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5659 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5660 5661 if (floatx80_invalid_encoding(a)) { 5662 float_raise(float_flag_invalid, status); 5663 return floatx80_default_nan(status); 5664 } 5665 aSig0 = extractFloatx80Frac( a ); 5666 aExp = extractFloatx80Exp( a ); 5667 aSign = extractFloatx80Sign( a ); 5668 if ( aExp == 0x7FFF ) { 5669 if ((uint64_t)(aSig0 << 1)) { 5670 return propagateFloatx80NaN(a, a, status); 5671 } 5672 if ( ! 
aSign ) return a; 5673 goto invalid; 5674 } 5675 if ( aSign ) { 5676 if ( ( aExp | aSig0 ) == 0 ) return a; 5677 invalid: 5678 float_raise(float_flag_invalid, status); 5679 return floatx80_default_nan(status); 5680 } 5681 if ( aExp == 0 ) { 5682 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5683 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5684 } 5685 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5686 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5687 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5688 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5689 doubleZSig0 = zSig0<<1; 5690 mul64To128( zSig0, zSig0, &term0, &term1 ); 5691 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5692 while ( (int64_t) rem0 < 0 ) { 5693 --zSig0; 5694 doubleZSig0 -= 2; 5695 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5696 } 5697 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5698 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5699 if ( zSig1 == 0 ) zSig1 = 1; 5700 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5701 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5702 mul64To128( zSig1, zSig1, &term2, &term3 ); 5703 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5704 while ( (int64_t) rem1 < 0 ) { 5705 --zSig1; 5706 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5707 term3 |= 1; 5708 term2 |= doubleZSig0; 5709 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5710 } 5711 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5712 } 5713 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5714 zSig0 |= doubleZSig0; 5715 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5716 0, zExp, zSig0, zSig1, status); 5717 } 5718 5719 /*---------------------------------------------------------------------------- 5720 | Returns 1 if the extended double-precision floating-point value `a' is equal 5721 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{
    int unordered;

    /* Invalid encodings and NaNs of either kind all signal invalid here. */
    unordered = floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b);
    if (!unordered) {
        unordered = extractFloatx80Exp(a) == 0x7FFF
                    && (uint64_t) (extractFloatx80Frac(a) << 1) != 0;
    }
    if (!unordered) {
        unordered = extractFloatx80Exp(b) == 0x7FFF
                    && (uint64_t) (extractFloatx80Frac(b) << 1) != 0;
    }
    if (unordered) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (a.low != b.low) {
        return 0;
    }
    if (a.high == b.high) {
        return 1;
    }
    /* Sign/exponent words differ: still equal when both values are zero. */
    return a.low == 0 && (uint16_t) ((a.high | b.high) << 1) == 0;
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise. The
| invalid exception is raised if either operand is a NaN. The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5753 *----------------------------------------------------------------------------*/ 5754 5755 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5756 { 5757 flag aSign, bSign; 5758 5759 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5760 || (extractFloatx80Exp(a) == 0x7FFF 5761 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5762 || (extractFloatx80Exp(b) == 0x7FFF 5763 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5764 ) { 5765 float_raise(float_flag_invalid, status); 5766 return 0; 5767 } 5768 aSign = extractFloatx80Sign( a ); 5769 bSign = extractFloatx80Sign( b ); 5770 if ( aSign != bSign ) { 5771 return 5772 aSign 5773 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5774 == 0 ); 5775 } 5776 return 5777 aSign ? le128( b.high, b.low, a.high, a.low ) 5778 : le128( a.high, a.low, b.high, b.low ); 5779 5780 } 5781 5782 /*---------------------------------------------------------------------------- 5783 | Returns 1 if the extended double-precision floating-point value `a' is 5784 | less than the corresponding value `b', and 0 otherwise. The invalid 5785 | exception is raised if either operand is a NaN. The comparison is performed 5786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5787 *----------------------------------------------------------------------------*/ 5788 5789 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5790 { 5791 flag aSign, bSign; 5792 5793 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5794 || (extractFloatx80Exp(a) == 0x7FFF 5795 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5796 || (extractFloatx80Exp(b) == 0x7FFF 5797 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5798 ) { 5799 float_raise(float_flag_invalid, status); 5800 return 0; 5801 } 5802 aSign = extractFloatx80Sign( a ); 5803 bSign = extractFloatx80Sign( b ); 5804 if ( aSign != bSign ) { 5805 return 5806 aSign 5807 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5808 != 0 ); 5809 } 5810 return 5811 aSign ? lt128( b.high, b.low, a.high, a.low ) 5812 : lt128( a.high, a.low, b.high, b.low ); 5813 5814 } 5815 5816 /*---------------------------------------------------------------------------- 5817 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5818 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5819 | either operand is a NaN. The comparison is performed according to the 5820 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5821 *----------------------------------------------------------------------------*/ 5822 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5823 { 5824 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5825 || (extractFloatx80Exp(a) == 0x7FFF 5826 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5827 || (extractFloatx80Exp(b) == 0x7FFF 5828 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5829 ) { 5830 float_raise(float_flag_invalid, status); 5831 return 1; 5832 } 5833 return 0; 5834 } 5835 5836 /*---------------------------------------------------------------------------- 5837 | Returns 1 if the extended double-precision floating-point value `a' is 5838 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5839 | cause an exception. The comparison is performed according to the IEC/IEEE 5840 | Standard for Binary Floating-Point Arithmetic. 5841 *----------------------------------------------------------------------------*/ 5842 5843 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5844 { 5845 5846 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5847 float_raise(float_flag_invalid, status); 5848 return 0; 5849 } 5850 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5851 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5852 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5853 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5854 ) { 5855 if (floatx80_is_signaling_nan(a, status) 5856 || floatx80_is_signaling_nan(b, status)) { 5857 float_raise(float_flag_invalid, status); 5858 } 5859 return 0; 5860 } 5861 return 5862 ( a.low == b.low ) 5863 && ( ( a.high == b.high ) 5864 || ( ( a.low == 0 ) 5865 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5866 ); 5867 5868 } 5869 5870 /*---------------------------------------------------------------------------- 5871 | Returns 1 if the extended double-precision floating-point value `a' is less 5872 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag signA, signB;
    int aIsNaN, bIsNaN;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    aIsNaN = extractFloatx80Exp(a) == 0x7FFF
             && (uint64_t) (extractFloatx80Frac(a) << 1) != 0;
    bIsNaN = extractFloatx80Exp(b) == 0x7FFF
             && (uint64_t) (extractFloatx80Frac(b) << 1) != 0;
    if (aIsNaN || bIsNaN) {
        /* Only a signaling NaN raises invalid; the answer is 0 either way. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    signA = extractFloatx80Sign(a);
    signB = extractFloatx80Sign(b);
    if (signA != signB) {
        /* A negative `a' is below a positive `b'; otherwise both must be zero. */
        if (signA) {
            return 1;
        }
        return (((uint16_t) ((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }
    /* Same sign: compare magnitudes, swapping the operands when negative. */
    if (signA) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/ 5916 5917 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5918 { 5919 flag aSign, bSign; 5920 5921 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5922 float_raise(float_flag_invalid, status); 5923 return 0; 5924 } 5925 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5926 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5927 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5928 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5929 ) { 5930 if (floatx80_is_signaling_nan(a, status) 5931 || floatx80_is_signaling_nan(b, status)) { 5932 float_raise(float_flag_invalid, status); 5933 } 5934 return 0; 5935 } 5936 aSign = extractFloatx80Sign( a ); 5937 bSign = extractFloatx80Sign( b ); 5938 if ( aSign != bSign ) { 5939 return 5940 aSign 5941 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5942 != 0 ); 5943 } 5944 return 5945 aSign ? lt128( b.high, b.low, a.high, a.low ) 5946 : lt128( a.high, a.low, b.high, b.low ); 5947 5948 } 5949 5950 /*---------------------------------------------------------------------------- 5951 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5952 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5953 | The comparison is performed according to the IEC/IEEE Standard for Binary 5954 | Floating-Point Arithmetic. 
5955 *----------------------------------------------------------------------------*/ 5956 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5957 { 5958 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5959 float_raise(float_flag_invalid, status); 5960 return 1; 5961 } 5962 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5963 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5964 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5965 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5966 ) { 5967 if (floatx80_is_signaling_nan(a, status) 5968 || floatx80_is_signaling_nan(b, status)) { 5969 float_raise(float_flag_invalid, status); 5970 } 5971 return 1; 5972 } 5973 return 0; 5974 } 5975 5976 /*---------------------------------------------------------------------------- 5977 | Returns the result of converting the quadruple-precision floating-point 5978 | value `a' to the 32-bit two's complement integer format. The conversion 5979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5980 | Arithmetic---which means in particular that the conversion is rounded 5981 | according to the current rounding mode. If `a' is a NaN, the largest 5982 | positive integer is returned. Otherwise, if the conversion overflows, the 5983 | largest integer with the same sign as `a' is returned. 
5984 *----------------------------------------------------------------------------*/ 5985 5986 int32_t float128_to_int32(float128 a, float_status *status) 5987 { 5988 flag aSign; 5989 int32_t aExp, shiftCount; 5990 uint64_t aSig0, aSig1; 5991 5992 aSig1 = extractFloat128Frac1( a ); 5993 aSig0 = extractFloat128Frac0( a ); 5994 aExp = extractFloat128Exp( a ); 5995 aSign = extractFloat128Sign( a ); 5996 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5997 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5998 aSig0 |= ( aSig1 != 0 ); 5999 shiftCount = 0x4028 - aExp; 6000 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 6001 return roundAndPackInt32(aSign, aSig0, status); 6002 6003 } 6004 6005 /*---------------------------------------------------------------------------- 6006 | Returns the result of converting the quadruple-precision floating-point 6007 | value `a' to the 32-bit two's complement integer format. The conversion 6008 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6009 | Arithmetic, except that the conversion is always rounded toward zero. If 6010 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 6011 | conversion overflows, the largest integer with the same sign as `a' is 6012 | returned. 
6013 *----------------------------------------------------------------------------*/ 6014 6015 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6016 { 6017 flag aSign; 6018 int32_t aExp, shiftCount; 6019 uint64_t aSig0, aSig1, savedASig; 6020 int32_t z; 6021 6022 aSig1 = extractFloat128Frac1( a ); 6023 aSig0 = extractFloat128Frac0( a ); 6024 aExp = extractFloat128Exp( a ); 6025 aSign = extractFloat128Sign( a ); 6026 aSig0 |= ( aSig1 != 0 ); 6027 if ( 0x401E < aExp ) { 6028 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6029 goto invalid; 6030 } 6031 else if ( aExp < 0x3FFF ) { 6032 if (aExp || aSig0) { 6033 status->float_exception_flags |= float_flag_inexact; 6034 } 6035 return 0; 6036 } 6037 aSig0 |= LIT64( 0x0001000000000000 ); 6038 shiftCount = 0x402F - aExp; 6039 savedASig = aSig0; 6040 aSig0 >>= shiftCount; 6041 z = aSig0; 6042 if ( aSign ) z = - z; 6043 if ( ( z < 0 ) ^ aSign ) { 6044 invalid: 6045 float_raise(float_flag_invalid, status); 6046 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6047 } 6048 if ( ( aSig0<<shiftCount ) != savedASig ) { 6049 status->float_exception_flags |= float_flag_inexact; 6050 } 6051 return z; 6052 6053 } 6054 6055 /*---------------------------------------------------------------------------- 6056 | Returns the result of converting the quadruple-precision floating-point 6057 | value `a' to the 64-bit two's complement integer format. The conversion 6058 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6059 | Arithmetic---which means in particular that the conversion is rounded 6060 | according to the current rounding mode. If `a' is a NaN, the largest 6061 | positive integer is returned. Otherwise, if the conversion overflows, the 6062 | largest integer with the same sign as `a' is returned. 
6063 *----------------------------------------------------------------------------*/ 6064 6065 int64_t float128_to_int64(float128 a, float_status *status) 6066 { 6067 flag aSign; 6068 int32_t aExp, shiftCount; 6069 uint64_t aSig0, aSig1; 6070 6071 aSig1 = extractFloat128Frac1( a ); 6072 aSig0 = extractFloat128Frac0( a ); 6073 aExp = extractFloat128Exp( a ); 6074 aSign = extractFloat128Sign( a ); 6075 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6076 shiftCount = 0x402F - aExp; 6077 if ( shiftCount <= 0 ) { 6078 if ( 0x403E < aExp ) { 6079 float_raise(float_flag_invalid, status); 6080 if ( ! aSign 6081 || ( ( aExp == 0x7FFF ) 6082 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6083 ) 6084 ) { 6085 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6086 } 6087 return (int64_t) LIT64( 0x8000000000000000 ); 6088 } 6089 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6090 } 6091 else { 6092 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6093 } 6094 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6095 6096 } 6097 6098 /*---------------------------------------------------------------------------- 6099 | Returns the result of converting the quadruple-precision floating-point 6100 | value `a' to the 64-bit two's complement integer format. The conversion 6101 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6102 | Arithmetic, except that the conversion is always rounded toward zero. 6103 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6104 | the conversion overflows, the largest integer with the same sign as `a' is 6105 | returned. 
6106 *----------------------------------------------------------------------------*/ 6107 6108 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6109 { 6110 flag aSign; 6111 int32_t aExp, shiftCount; 6112 uint64_t aSig0, aSig1; 6113 int64_t z; 6114 6115 aSig1 = extractFloat128Frac1( a ); 6116 aSig0 = extractFloat128Frac0( a ); 6117 aExp = extractFloat128Exp( a ); 6118 aSign = extractFloat128Sign( a ); 6119 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6120 shiftCount = aExp - 0x402F; 6121 if ( 0 < shiftCount ) { 6122 if ( 0x403E <= aExp ) { 6123 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6124 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6125 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6126 if (aSig1) { 6127 status->float_exception_flags |= float_flag_inexact; 6128 } 6129 } 6130 else { 6131 float_raise(float_flag_invalid, status); 6132 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6133 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6134 } 6135 } 6136 return (int64_t) LIT64( 0x8000000000000000 ); 6137 } 6138 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6139 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6140 status->float_exception_flags |= float_flag_inexact; 6141 } 6142 } 6143 else { 6144 if ( aExp < 0x3FFF ) { 6145 if ( aExp | aSig0 | aSig1 ) { 6146 status->float_exception_flags |= float_flag_inexact; 6147 } 6148 return 0; 6149 } 6150 z = aSig0>>( - shiftCount ); 6151 if ( aSig1 6152 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6153 status->float_exception_flags |= float_flag_inexact; 6154 } 6155 } 6156 if ( aSign ) z = - z; 6157 return z; 6158 6159 } 6160 6161 /*---------------------------------------------------------------------------- 6162 | Returns the result of converting the quadruple-precision floating-point value 6163 | `a' to the 64-bit unsigned integer format. 
The conversion is 6164 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6165 | Arithmetic---which means in particular that the conversion is rounded 6166 | according to the current rounding mode. If `a' is a NaN, the largest 6167 | positive integer is returned. If the conversion overflows, the 6168 | largest unsigned integer is returned. If 'a' is negative, the value is 6169 | rounded and zero is returned; negative values that do not round to zero 6170 | will raise the inexact exception. 6171 *----------------------------------------------------------------------------*/ 6172 6173 uint64_t float128_to_uint64(float128 a, float_status *status) 6174 { 6175 flag aSign; 6176 int aExp; 6177 int shiftCount; 6178 uint64_t aSig0, aSig1; 6179 6180 aSig0 = extractFloat128Frac0(a); 6181 aSig1 = extractFloat128Frac1(a); 6182 aExp = extractFloat128Exp(a); 6183 aSign = extractFloat128Sign(a); 6184 if (aSign && (aExp > 0x3FFE)) { 6185 float_raise(float_flag_invalid, status); 6186 if (float128_is_any_nan(a)) { 6187 return LIT64(0xFFFFFFFFFFFFFFFF); 6188 } else { 6189 return 0; 6190 } 6191 } 6192 if (aExp) { 6193 aSig0 |= LIT64(0x0001000000000000); 6194 } 6195 shiftCount = 0x402F - aExp; 6196 if (shiftCount <= 0) { 6197 if (0x403E < aExp) { 6198 float_raise(float_flag_invalid, status); 6199 return LIT64(0xFFFFFFFFFFFFFFFF); 6200 } 6201 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6202 } else { 6203 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6204 } 6205 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6206 } 6207 6208 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6209 { 6210 uint64_t v; 6211 signed char current_rounding_mode = status->float_rounding_mode; 6212 6213 set_float_rounding_mode(float_round_to_zero, status); 6214 v = float128_to_uint64(a, status); 6215 set_float_rounding_mode(current_rounding_mode, status); 6216 6217 return v; 6218 } 6219 6220 
/*---------------------------------------------------------------------------- 6221 | Returns the result of converting the quadruple-precision floating-point 6222 | value `a' to the 32-bit unsigned integer format. The conversion 6223 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6224 | Arithmetic except that the conversion is always rounded toward zero. 6225 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6226 | if the conversion overflows, the largest unsigned integer is returned. 6227 | If 'a' is negative, the value is rounded and zero is returned; negative 6228 | values that do not round to zero will raise the inexact exception. 6229 *----------------------------------------------------------------------------*/ 6230 6231 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6232 { 6233 uint64_t v; 6234 uint32_t res; 6235 int old_exc_flags = get_float_exception_flags(status); 6236 6237 v = float128_to_uint64_round_to_zero(a, status); 6238 if (v > 0xffffffff) { 6239 res = 0xffffffff; 6240 } else { 6241 return v; 6242 } 6243 set_float_exception_flags(old_exc_flags, status); 6244 float_raise(float_flag_invalid, status); 6245 return res; 6246 } 6247 6248 /*---------------------------------------------------------------------------- 6249 | Returns the result of converting the quadruple-precision floating-point 6250 | value `a' to the single-precision floating-point format. The conversion 6251 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6252 | Arithmetic. 
6253 *----------------------------------------------------------------------------*/ 6254 6255 float32 float128_to_float32(float128 a, float_status *status) 6256 { 6257 flag aSign; 6258 int32_t aExp; 6259 uint64_t aSig0, aSig1; 6260 uint32_t zSig; 6261 6262 aSig1 = extractFloat128Frac1( a ); 6263 aSig0 = extractFloat128Frac0( a ); 6264 aExp = extractFloat128Exp( a ); 6265 aSign = extractFloat128Sign( a ); 6266 if ( aExp == 0x7FFF ) { 6267 if ( aSig0 | aSig1 ) { 6268 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6269 } 6270 return packFloat32( aSign, 0xFF, 0 ); 6271 } 6272 aSig0 |= ( aSig1 != 0 ); 6273 shift64RightJamming( aSig0, 18, &aSig0 ); 6274 zSig = aSig0; 6275 if ( aExp || zSig ) { 6276 zSig |= 0x40000000; 6277 aExp -= 0x3F81; 6278 } 6279 return roundAndPackFloat32(aSign, aExp, zSig, status); 6280 6281 } 6282 6283 /*---------------------------------------------------------------------------- 6284 | Returns the result of converting the quadruple-precision floating-point 6285 | value `a' to the double-precision floating-point format. The conversion 6286 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6287 | Arithmetic. 
6288 *----------------------------------------------------------------------------*/ 6289 6290 float64 float128_to_float64(float128 a, float_status *status) 6291 { 6292 flag aSign; 6293 int32_t aExp; 6294 uint64_t aSig0, aSig1; 6295 6296 aSig1 = extractFloat128Frac1( a ); 6297 aSig0 = extractFloat128Frac0( a ); 6298 aExp = extractFloat128Exp( a ); 6299 aSign = extractFloat128Sign( a ); 6300 if ( aExp == 0x7FFF ) { 6301 if ( aSig0 | aSig1 ) { 6302 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6303 } 6304 return packFloat64( aSign, 0x7FF, 0 ); 6305 } 6306 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6307 aSig0 |= ( aSig1 != 0 ); 6308 if ( aExp || aSig0 ) { 6309 aSig0 |= LIT64( 0x4000000000000000 ); 6310 aExp -= 0x3C01; 6311 } 6312 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6313 6314 } 6315 6316 /*---------------------------------------------------------------------------- 6317 | Returns the result of converting the quadruple-precision floating-point 6318 | value `a' to the extended double-precision floating-point format. The 6319 | conversion is performed according to the IEC/IEEE Standard for Binary 6320 | Floating-Point Arithmetic. 
6321 *----------------------------------------------------------------------------*/ 6322 6323 floatx80 float128_to_floatx80(float128 a, float_status *status) 6324 { 6325 flag aSign; 6326 int32_t aExp; 6327 uint64_t aSig0, aSig1; 6328 6329 aSig1 = extractFloat128Frac1( a ); 6330 aSig0 = extractFloat128Frac0( a ); 6331 aExp = extractFloat128Exp( a ); 6332 aSign = extractFloat128Sign( a ); 6333 if ( aExp == 0x7FFF ) { 6334 if ( aSig0 | aSig1 ) { 6335 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6336 } 6337 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6338 } 6339 if ( aExp == 0 ) { 6340 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6341 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6342 } 6343 else { 6344 aSig0 |= LIT64( 0x0001000000000000 ); 6345 } 6346 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6347 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6348 6349 } 6350 6351 /*---------------------------------------------------------------------------- 6352 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6353 | returns the result as a quadruple-precision floating-point value. The 6354 | operation is performed according to the IEC/IEEE Standard for Binary 6355 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

/*
 * Three exponent ranges are handled separately:
 *   aExp >= 0x402F : the rounding position lies in the low word (z.low);
 *   aExp <  0x3FFF : the result is a signed zero or a signed one;
 *   otherwise      : the rounding position lies in the high word (z.high).
 * The inexact flag is set whenever the returned value differs from `a'.
 */
float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Returned as is, except that a NaN is propagated. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * The shift is split in two so that aExp == 0x402F (a total shift of
         * 64) produces lastBitMask == 0 instead of an out-of-range shift.
         * lastBitMask == 0 means the last integer bit is bit 0 of z.high.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit; on an exact tie clear the last bit. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The whole low word is fraction; its top bit is the half. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values move away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values move away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Drop the fraction bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Zero of either sign is returned unchanged, without inexact. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * Modes without a case here (including round-to-zero) fall
             * through to the signed-zero return below.
             */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a nonzero fraction rounds to one. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Exponent 0x3FFE always rounds to one in this mode. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The rounding position is in the high word; the low word is dropped. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie (no bits left in either word): clear the last bit. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A nonzero low word acts as a sticky bit before adding. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                /* A nonzero low word acts as a sticky bit before adding. */
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
| before being returned. `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    /* Split both operands into exponent and 128-bit fraction (Sig0 = high
       word, Sig1 = low word). */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent field. */
        if ( aExp == 0x7FFF ) {
            /* Maximum exponent: NaN when the fraction is non-zero,
               otherwise `a' itself is returned unchanged. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent field: no leading-one bit is ORed into `b';
               instead the alignment shift is one position shorter. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `b' to `a'; shifted-out bits are collected in zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* Mirror image of the case above: `b' has the larger exponent. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponent fields. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: the raw fraction sum is packed
               directly with a zero exponent, unless flush_to_zero is set,
               in which case a signed zero is returned and a non-zero sum
               raises float_flag_output_denormal. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        /* The two leading-one bits (0x0001... each) sum to 0x0002...;
           the result always needs the one-bit right shift below. */
        zSig2 = 0;
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: exactly one leading-one bit has not been included
       yet (the larger operand's).  ORing it into aSig0 is sufficient in both
       branches because only the sum is used from here on. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /* Tentatively use exponent zExp - 1 with the sum unshifted; if the sum
       reached 0x0002000000000000 restore zExp and shift right by one. */
    --zExp;
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'. If `zSign' is 1, the
| difference is negated before being returned. `zSign' is ignored if the
| result is a NaN. The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Work with both fractions shifted left by 14 bits; the matching
       "- 14" is applied to the exponent at normalizeRoundAndPack. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponent fields from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Both fractions are zero with the maximum exponent: raise invalid
           and return the default NaN. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Zero exponent fields are treated as exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    /* Order the two 128-bit fractions, high word first. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero result is negative only when the
       rounding mode is float_round_down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* No leading-one bit for `a'; shorten the alignment shift instead. */
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    /* |a| > |b|: compute a - b, keeping zSign. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6672 *----------------------------------------------------------------------------*/ 6673 6674 float128 float128_add(float128 a, float128 b, float_status *status) 6675 { 6676 flag aSign, bSign; 6677 6678 aSign = extractFloat128Sign( a ); 6679 bSign = extractFloat128Sign( b ); 6680 if ( aSign == bSign ) { 6681 return addFloat128Sigs(a, b, aSign, status); 6682 } 6683 else { 6684 return subFloat128Sigs(a, b, aSign, status); 6685 } 6686 6687 } 6688 6689 /*---------------------------------------------------------------------------- 6690 | Returns the result of subtracting the quadruple-precision floating-point 6691 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6692 | Standard for Binary Floating-Point Arithmetic. 6693 *----------------------------------------------------------------------------*/ 6694 6695 float128 float128_sub(float128 a, float128 b, float_status *status) 6696 { 6697 flag aSign, bSign; 6698 6699 aSign = extractFloat128Sign( a ); 6700 bSign = extractFloat128Sign( b ); 6701 if ( aSign == bSign ) { 6702 return subFloat128Sigs(a, b, aSign, status); 6703 } 6704 else { 6705 return addFloat128Sigs(a, b, aSign, status); 6706 } 6707 6708 } 6709 6710 /*---------------------------------------------------------------------------- 6711 | Returns the result of multiplying the quadruple-precision floating-point 6712 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6713 | Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_mul(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    /* The product's sign is the XOR of the operand signs. */
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' has the maximum exponent: propagate a NaN if either operand
           is one. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `a' has a zero fraction here; an all-zero `b' is invalid. */
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero operand gives a signed zero; otherwise normalize the
           zero-exponent operand before multiplying. */
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    zExp = aExp + bExp - 0x4000;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* `b' is shifted left by 16 WITHOUT its leading-one bit (which would
       fall just above bit 127).  Its contribution, a * 2^128, is restored
       by adding `a' to the upper 128 bits of the 256-bit product. */
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
    /* Fold the lowest product word into zSig2 as a sticky bit. */
    zSig2 |= ( zSig3 != 0 );
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
        /* Product significand reached 0x0002...: shift right one bit and
           bump the exponent. */
        shift128ExtraRightJamming(
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
        ++zExp;
    }
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'. The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Both operands have the maximum exponent and zero fraction. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite `a' over a maximum-exponent, zero-fraction `b': signed
           zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Division by zero: invalid when `a' is also all-zero,
               otherwise raise divbyzero and return a signed
               maximum-exponent value. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Insert the leading-one bits and left-justify both significands. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        /* Keep the dividend significand below the divisor's. */
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: estimate, then decrement until the remainder
       is non-negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits.  The exact correction is only performed when
       the low 14 bits of the estimate are <= 4. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Non-zero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'. The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* Maximum-exponent `a': propagate a NaN if either operand is one,
           otherwise the operation is invalid. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Maximum-exponent, zero-fraction `b': `a' is returned as is. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Zero divisor. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* `a' is more than one binade below `b': it is returned unchanged. */
    if ( expDiff < -1 ) return a;
    /* Left-justify both significands with their leading-one bits; `a' is
       shifted one bit less when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduce the exponent gap 61 bits per iteration, using a quotient
       estimate lowered by 4 (clamped at 0). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits that belong to
           the remaining exponent gap. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract `b' until the remainder turns negative, remembering the last
       non-negative value in alternateASig. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* Choose between the negative remainder and the previous one using the
       sign of their sum; when the sum is zero, an odd q selects the
       previous value. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Convert a negative remainder to sign + magnitude. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}

/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Maximum exponent: propagate a NaN; a positive zero-fraction value
           is returned as is; a negative one is invalid. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative input: only an all-zero magnitude (negative zero) is
           returned unchanged; anything else is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent (bias 0x3FFF) for the result. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial 32-bit estimate, refined to 64 bits with one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift is one bit shorter when the exponent field is odd. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; decrement zSig0 until the remainder is
       non-negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 result bits.  The exact correction is only performed when
       the low 13 bits of the estimate are <= 5. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Non-zero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
| the corresponding value `b', and 0 otherwise. The invalid exception is
| raised if either operand is a NaN. Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float128_eq(float128 a, float128 b, float_status *status)
{

    /* NaN test: maximum exponent with a non-zero fraction. */
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       ) {
        float_raise(float_flag_invalid, status);
        return 0;
    }
    /* Equal when the bit patterns match, or when both are zeros of either
       sign (the "<<1" discards the sign bit of the high words). */
    return
           ( a.low == b.low )
        && (    ( a.high == b.high )
             || (    ( a.low == 0 )
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
           );

}

/*----------------------------------------------------------------------------
| Returns 1 if the quadruple-precision floating-point value `a' is less than
| or equal to the corresponding value `b', and 0 otherwise. The invalid
| exception is raised if either operand is a NaN. The comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7067 *----------------------------------------------------------------------------*/ 7068 7069 int float128_le(float128 a, float128 b, float_status *status) 7070 { 7071 flag aSign, bSign; 7072 7073 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7074 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7075 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7076 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7077 ) { 7078 float_raise(float_flag_invalid, status); 7079 return 0; 7080 } 7081 aSign = extractFloat128Sign( a ); 7082 bSign = extractFloat128Sign( b ); 7083 if ( aSign != bSign ) { 7084 return 7085 aSign 7086 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7087 == 0 ); 7088 } 7089 return 7090 aSign ? le128( b.high, b.low, a.high, a.low ) 7091 : le128( a.high, a.low, b.high, b.low ); 7092 7093 } 7094 7095 /*---------------------------------------------------------------------------- 7096 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7097 | the corresponding value `b', and 0 otherwise. The invalid exception is 7098 | raised if either operand is a NaN. The comparison is performed according 7099 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7100 *----------------------------------------------------------------------------*/ 7101 7102 int float128_lt(float128 a, float128 b, float_status *status) 7103 { 7104 flag aSign, bSign; 7105 7106 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7107 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7108 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7109 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7110 ) { 7111 float_raise(float_flag_invalid, status); 7112 return 0; 7113 } 7114 aSign = extractFloat128Sign( a ); 7115 bSign = extractFloat128Sign( b ); 7116 if ( aSign != bSign ) { 7117 return 7118 aSign 7119 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7120 != 0 ); 7121 } 7122 return 7123 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7124 : lt128( a.high, a.low, b.high, b.low ); 7125 7126 } 7127 7128 /*---------------------------------------------------------------------------- 7129 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7130 | be compared, and 0 otherwise. The invalid exception is raised if either 7131 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7132 | Standard for Binary Floating-Point Arithmetic. 7133 *----------------------------------------------------------------------------*/ 7134 7135 int float128_unordered(float128 a, float128 b, float_status *status) 7136 { 7137 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7138 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7139 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7140 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7141 ) { 7142 float_raise(float_flag_invalid, status); 7143 return 1; 7144 } 7145 return 0; 7146 } 7147 7148 /*---------------------------------------------------------------------------- 7149 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7150 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7151 | exception. The comparison is performed according to the IEC/IEEE Standard 7152 | for Binary Floating-Point Arithmetic. 
7153 *----------------------------------------------------------------------------*/ 7154 7155 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7156 { 7157 7158 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7159 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7160 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7161 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7162 ) { 7163 if (float128_is_signaling_nan(a, status) 7164 || float128_is_signaling_nan(b, status)) { 7165 float_raise(float_flag_invalid, status); 7166 } 7167 return 0; 7168 } 7169 return 7170 ( a.low == b.low ) 7171 && ( ( a.high == b.high ) 7172 || ( ( a.low == 0 ) 7173 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7174 ); 7175 7176 } 7177 7178 /*---------------------------------------------------------------------------- 7179 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7180 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7181 | cause an exception. Otherwise, the comparison is performed according to the 7182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7183 *----------------------------------------------------------------------------*/ 7184 7185 int float128_le_quiet(float128 a, float128 b, float_status *status) 7186 { 7187 flag aSign, bSign; 7188 7189 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7190 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7191 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7192 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7193 ) { 7194 if (float128_is_signaling_nan(a, status) 7195 || float128_is_signaling_nan(b, status)) { 7196 float_raise(float_flag_invalid, status); 7197 } 7198 return 0; 7199 } 7200 aSign = extractFloat128Sign( a ); 7201 bSign = extractFloat128Sign( b ); 7202 if ( aSign != bSign ) { 7203 return 7204 aSign 7205 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7206 == 0 ); 7207 } 7208 return 7209 aSign ? le128( b.high, b.low, a.high, a.low ) 7210 : le128( a.high, a.low, b.high, b.low ); 7211 7212 } 7213 7214 /*---------------------------------------------------------------------------- 7215 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7216 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7217 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7218 | Standard for Binary Floating-Point Arithmetic. 
7219 *----------------------------------------------------------------------------*/ 7220 7221 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7222 { 7223 flag aSign, bSign; 7224 7225 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7226 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7227 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7228 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7229 ) { 7230 if (float128_is_signaling_nan(a, status) 7231 || float128_is_signaling_nan(b, status)) { 7232 float_raise(float_flag_invalid, status); 7233 } 7234 return 0; 7235 } 7236 aSign = extractFloat128Sign( a ); 7237 bSign = extractFloat128Sign( b ); 7238 if ( aSign != bSign ) { 7239 return 7240 aSign 7241 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7242 != 0 ); 7243 } 7244 return 7245 aSign ? lt128( b.high, b.low, a.high, a.low ) 7246 : lt128( a.high, a.low, b.high, b.low ); 7247 7248 } 7249 7250 /*---------------------------------------------------------------------------- 7251 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7252 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7253 | comparison is performed according to the IEC/IEEE Standard for Binary 7254 | Floating-Point Arithmetic. 
7255 *----------------------------------------------------------------------------*/ 7256 7257 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7258 { 7259 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7260 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7261 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7262 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7263 ) { 7264 if (float128_is_signaling_nan(a, status) 7265 || float128_is_signaling_nan(b, status)) { 7266 float_raise(float_flag_invalid, status); 7267 } 7268 return 1; 7269 } 7270 return 0; 7271 } 7272 7273 /* misc functions */ 7274 float32 uint32_to_float32(uint32_t a, float_status *status) 7275 { 7276 return int64_to_float32(a, status); 7277 } 7278 7279 float64 uint32_to_float64(uint32_t a, float_status *status) 7280 { 7281 return int64_to_float64(a, status); 7282 } 7283 7284 uint32_t float32_to_uint32(float32 a, float_status *status) 7285 { 7286 int64_t v; 7287 uint32_t res; 7288 int old_exc_flags = get_float_exception_flags(status); 7289 7290 v = float32_to_int64(a, status); 7291 if (v < 0) { 7292 res = 0; 7293 } else if (v > 0xffffffff) { 7294 res = 0xffffffff; 7295 } else { 7296 return v; 7297 } 7298 set_float_exception_flags(old_exc_flags, status); 7299 float_raise(float_flag_invalid, status); 7300 return res; 7301 } 7302 7303 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7304 { 7305 int64_t v; 7306 uint32_t res; 7307 int old_exc_flags = get_float_exception_flags(status); 7308 7309 v = float32_to_int64_round_to_zero(a, status); 7310 if (v < 0) { 7311 res = 0; 7312 } else if (v > 0xffffffff) { 7313 res = 0xffffffff; 7314 } else { 7315 return v; 7316 } 7317 set_float_exception_flags(old_exc_flags, status); 7318 float_raise(float_flag_invalid, status); 7319 return res; 7320 } 7321 7322 int16_t float32_to_int16(float32 a, float_status *status) 7323 { 7324 int32_t v; 7325 int16_t res; 7326 int old_exc_flags = 
get_float_exception_flags(status); 7327 7328 v = float32_to_int32(a, status); 7329 if (v < -0x8000) { 7330 res = -0x8000; 7331 } else if (v > 0x7fff) { 7332 res = 0x7fff; 7333 } else { 7334 return v; 7335 } 7336 7337 set_float_exception_flags(old_exc_flags, status); 7338 float_raise(float_flag_invalid, status); 7339 return res; 7340 } 7341 7342 uint16_t float32_to_uint16(float32 a, float_status *status) 7343 { 7344 int32_t v; 7345 uint16_t res; 7346 int old_exc_flags = get_float_exception_flags(status); 7347 7348 v = float32_to_int32(a, status); 7349 if (v < 0) { 7350 res = 0; 7351 } else if (v > 0xffff) { 7352 res = 0xffff; 7353 } else { 7354 return v; 7355 } 7356 7357 set_float_exception_flags(old_exc_flags, status); 7358 float_raise(float_flag_invalid, status); 7359 return res; 7360 } 7361 7362 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7363 { 7364 int64_t v; 7365 uint16_t res; 7366 int old_exc_flags = get_float_exception_flags(status); 7367 7368 v = float32_to_int64_round_to_zero(a, status); 7369 if (v < 0) { 7370 res = 0; 7371 } else if (v > 0xffff) { 7372 res = 0xffff; 7373 } else { 7374 return v; 7375 } 7376 set_float_exception_flags(old_exc_flags, status); 7377 float_raise(float_flag_invalid, status); 7378 return res; 7379 } 7380 7381 uint32_t float64_to_uint32(float64 a, float_status *status) 7382 { 7383 uint64_t v; 7384 uint32_t res; 7385 int old_exc_flags = get_float_exception_flags(status); 7386 7387 v = float64_to_uint64(a, status); 7388 if (v > 0xffffffff) { 7389 res = 0xffffffff; 7390 } else { 7391 return v; 7392 } 7393 set_float_exception_flags(old_exc_flags, status); 7394 float_raise(float_flag_invalid, status); 7395 return res; 7396 } 7397 7398 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7399 { 7400 uint64_t v; 7401 uint32_t res; 7402 int old_exc_flags = get_float_exception_flags(status); 7403 7404 v = float64_to_uint64_round_to_zero(a, status); 7405 if (v > 0xffffffff) { 7406 res = 
0xffffffff; 7407 } else { 7408 return v; 7409 } 7410 set_float_exception_flags(old_exc_flags, status); 7411 float_raise(float_flag_invalid, status); 7412 return res; 7413 } 7414 7415 int16_t float64_to_int16(float64 a, float_status *status) 7416 { 7417 int64_t v; 7418 int16_t res; 7419 int old_exc_flags = get_float_exception_flags(status); 7420 7421 v = float64_to_int32(a, status); 7422 if (v < -0x8000) { 7423 res = -0x8000; 7424 } else if (v > 0x7fff) { 7425 res = 0x7fff; 7426 } else { 7427 return v; 7428 } 7429 7430 set_float_exception_flags(old_exc_flags, status); 7431 float_raise(float_flag_invalid, status); 7432 return res; 7433 } 7434 7435 uint16_t float64_to_uint16(float64 a, float_status *status) 7436 { 7437 int64_t v; 7438 uint16_t res; 7439 int old_exc_flags = get_float_exception_flags(status); 7440 7441 v = float64_to_int32(a, status); 7442 if (v < 0) { 7443 res = 0; 7444 } else if (v > 0xffff) { 7445 res = 0xffff; 7446 } else { 7447 return v; 7448 } 7449 7450 set_float_exception_flags(old_exc_flags, status); 7451 float_raise(float_flag_invalid, status); 7452 return res; 7453 } 7454 7455 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7456 { 7457 int64_t v; 7458 uint16_t res; 7459 int old_exc_flags = get_float_exception_flags(status); 7460 7461 v = float64_to_int64_round_to_zero(a, status); 7462 if (v < 0) { 7463 res = 0; 7464 } else if (v > 0xffff) { 7465 res = 0xffff; 7466 } else { 7467 return v; 7468 } 7469 set_float_exception_flags(old_exc_flags, status); 7470 float_raise(float_flag_invalid, status); 7471 return res; 7472 } 7473 7474 /*---------------------------------------------------------------------------- 7475 | Returns the result of converting the double-precision floating-point value 7476 | `a' to the 64-bit unsigned integer format. 
The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic---which means in particular that the conversion is rounded
| according to the current rounding mode. If `a' is a NaN, the largest
| positive integer is returned. If the conversion overflows, the
| largest unsigned integer is returned. If 'a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| will raise the inexact exception.
*----------------------------------------------------------------------------*/

uint64_t float64_to_uint64(float64 a, float_status *status)
{
    flag aSign;
    int aExp;
    int shiftCount;
    uint64_t aSig, aSigExtra;
    a = float64_squash_input_denormal(a, status);

    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    /* Negative input whose exponent field exceeds 1022: raise invalid and
       return all-ones for a NaN, 0 otherwise. */
    if (aSign && (aExp > 1022)) {
        float_raise(float_flag_invalid, status);
        if (float64_is_any_nan(a)) {
            return LIT64(0xFFFFFFFFFFFFFFFF);
        } else {
            return 0;
        }
    }
    /* Non-zero exponent field: add the leading-one bit above the 52-bit
       fraction. */
    if (aExp) {
        aSig |= LIT64(0x0010000000000000);
    }
    shiftCount = 0x433 - aExp;
    if (shiftCount <= 0) {
        /* The significand must be shifted left; exponent fields above 0x43E
           do not fit in 64 bits and are reported as invalid. */
        if (0x43E < aExp) {
            float_raise(float_flag_invalid, status);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        aSigExtra = 0;
        aSig <<= -shiftCount;
    } else {
        /* Shift right, keeping the shifted-out bits in aSigExtra for
           rounding. */
        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
    }
    return roundAndPackUint64(aSign, aSig, aSigExtra, status);
}

/* Same as float64_to_uint64(), but the conversion runs with the rounding
   mode temporarily set to float_round_to_zero; the previous mode is restored
   before returning. */
uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
{
    signed char current_rounding_mode = status->float_rounding_mode;
    set_float_rounding_mode(float_round_to_zero, status);
    uint64_t v = float64_to_uint64(a, status);
    set_float_rounding_mode(current_rounding_mode, status);
    return v;
}

/*
 * COMPARE(s, nan_exp) generates, for the s-bit format:
 *   float<s>_compare_internal() - returns float_relation_unordered when
 *       either operand has exponent nan_exp and a non-zero fraction,
 *       float_relation_equal for identical values or zeros of either sign,
 *       and otherwise 1 or -1 from the "1 - 2 * ..." expressions.  Invalid
 *       is raised for NaN operands unless is_quiet is set and neither
 *       operand is a signaling NaN.
 *   float<s>_compare()       - the is_quiet == 0 wrapper.
 *   float<s>_compare_quiet() - the is_quiet == 1 wrapper.
 */
#define COMPARE(s, nan_exp)                                                  \
static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
                                      int is_quiet, float_status *status)    \
{                                                                            \
    flag aSign, bSign;                                                       \
    uint ## s ## _t av, bv;                                                  \
    a = float ## s ## _squash_input_denormal(a, status);                     \
    b = float ## s ## _squash_input_denormal(b, status);                     \
                                                                             \
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
         extractFloat ## s ## Frac( a ) ) ||                                 \
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
          extractFloat ## s ## Frac( b ) )) {                                \
        if (!is_quiet ||                                                     \
            float ## s ## _is_signaling_nan(a, status) ||                    \
            float ## s ## _is_signaling_nan(b, status)) {                    \
            float_raise(float_flag_invalid, status);                         \
        }                                                                    \
        return float_relation_unordered;                                     \
    }                                                                        \
    aSign = extractFloat ## s ## Sign( a );                                  \
    bSign = extractFloat ## s ## Sign( b );                                  \
    av = float ## s ## _val(a);                                              \
    bv = float ## s ## _val(b);                                              \
    if ( aSign != bSign ) {                                                  \
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
            /* zero case */                                                  \
            return float_relation_equal;                                     \
        } else {                                                             \
            return 1 - (2 * aSign);                                          \
        }                                                                    \
    } else {                                                                 \
        if (av == bv) {                                                      \
            return float_relation_equal;                                     \
        } else {                                                             \
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
        }                                                                    \
    }                                                                        \
}                                                                            \
                                                                             \
int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
{                                                                            \
    return float ## s ## _compare_internal(a, b, 0, status);                 \
}                                                                            \
                                                                             \
int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
                                 float_status *status)                       \
{                                                                            \
    return float ## s ## _compare_internal(a, b, 1, status);                 \
}

COMPARE(32, 0xff)
COMPARE(64, 0x7ff)

/* Three-way comparison of two floatx80 values.  Invalid encodings always
   raise invalid and compare as unordered; NaN operands compare as unordered
   and raise invalid unless is_quiet is set and neither is signaling. */
static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
                                            int is_quiet, float_status *status)
{
    flag aSign, bSign;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return float_relation_unordered;
    }
    /* NaN test: maximum exponent and a non-zero fraction once the top bit
       of the 64-bit significand has been shifted out. */
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
          ( extractFloatx80Frac( a )<<1 ) ) ||
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
          ( extractFloatx80Frac( b )<<1 ) )) {
        if (!is_quiet ||
            floatx80_is_signaling_nan(a, status) ||
            floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloatx80Sign( a );
    bSign = extractFloatx80Sign( b );
    if ( aSign != bSign ) {

        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
             ( ( a.low | b.low ) == 0 ) ) {
            /* zero case */
            return float_relation_equal;
        } else {
            /* Opposite signs: -1 when `a' is negative, 1 otherwise. */
            return 1 - (2 * aSign);
        }
    } else {
        if (a.low == b.low && a.high == b.high) {
            return float_relation_equal;
        } else {
            /* Same sign: order by bit pattern, inverted for negatives. */
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
        }
    }
}

/* Signaling comparison: any NaN operand raises invalid. */
int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 0, status);
}

/* Quiet comparison: only signaling NaNs (and invalid encodings) raise
   invalid. */
int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
{
    return floatx80_compare_internal(a, b, 1, status);
}

static inline int float128_compare_internal(float128 a, float128 b,
                                            int is_quiet, float_status *status)
{
    flag aSign, bSign;

    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
        if (!is_quiet ||
            float128_is_signaling_nan(a, status) ||
            float128_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return float_relation_unordered;
    }
    aSign = extractFloat128Sign( a );
    bSign = extractFloat128Sign( b );
    if ( aSign != bSign ) {
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
            /* zero case */
7656 return float_relation_equal; 7657 } else { 7658 return 1 - (2 * aSign); 7659 } 7660 } else { 7661 if (a.low == b.low && a.high == b.high) { 7662 return float_relation_equal; 7663 } else { 7664 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7665 } 7666 } 7667 } 7668 7669 int float128_compare(float128 a, float128 b, float_status *status) 7670 { 7671 return float128_compare_internal(a, b, 0, status); 7672 } 7673 7674 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7675 { 7676 return float128_compare_internal(a, b, 1, status); 7677 } 7678 7679 /* min() and max() functions. These can't be implemented as 7680 * 'compare and pick one input' because that would mishandle 7681 * NaNs and +0 vs -0. 7682 * 7683 * minnum() and maxnum() functions. These are similar to the min() 7684 * and max() functions but if one of the arguments is a QNaN and 7685 * the other is numerical then the numerical argument is returned. 7686 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7687 * and maxNum() operations. min() and max() are the typical min/max 7688 * semantics provided by many CPUs which predate that specification. 7689 * 7690 * minnummag() and maxnummag() functions correspond to minNumMag() 7691 * and minNumMag() from the IEEE-754 2008. 
7692 */ 7693 #define MINMAX(s) \ 7694 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7695 int ismin, int isieee, \ 7696 int ismag, \ 7697 float_status *status) \ 7698 { \ 7699 flag aSign, bSign; \ 7700 uint ## s ## _t av, bv, aav, abv; \ 7701 a = float ## s ## _squash_input_denormal(a, status); \ 7702 b = float ## s ## _squash_input_denormal(b, status); \ 7703 if (float ## s ## _is_any_nan(a) || \ 7704 float ## s ## _is_any_nan(b)) { \ 7705 if (isieee) { \ 7706 if (float ## s ## _is_quiet_nan(a, status) && \ 7707 !float ## s ##_is_any_nan(b)) { \ 7708 return b; \ 7709 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7710 !float ## s ## _is_any_nan(a)) { \ 7711 return a; \ 7712 } \ 7713 } \ 7714 return propagateFloat ## s ## NaN(a, b, status); \ 7715 } \ 7716 aSign = extractFloat ## s ## Sign(a); \ 7717 bSign = extractFloat ## s ## Sign(b); \ 7718 av = float ## s ## _val(a); \ 7719 bv = float ## s ## _val(b); \ 7720 if (ismag) { \ 7721 aav = float ## s ## _abs(av); \ 7722 abv = float ## s ## _abs(bv); \ 7723 if (aav != abv) { \ 7724 if (ismin) { \ 7725 return (aav < abv) ? a : b; \ 7726 } else { \ 7727 return (aav < abv) ? b : a; \ 7728 } \ 7729 } \ 7730 } \ 7731 if (aSign != bSign) { \ 7732 if (ismin) { \ 7733 return aSign ? a : b; \ 7734 } else { \ 7735 return aSign ? b : a; \ 7736 } \ 7737 } else { \ 7738 if (ismin) { \ 7739 return (aSign ^ (av < bv)) ? a : b; \ 7740 } else { \ 7741 return (aSign ^ (av < bv)) ? 
b : a; \ 7742 } \ 7743 } \ 7744 } \ 7745 \ 7746 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7747 float_status *status) \ 7748 { \ 7749 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7750 } \ 7751 \ 7752 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7753 float_status *status) \ 7754 { \ 7755 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7756 } \ 7757 \ 7758 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7759 float_status *status) \ 7760 { \ 7761 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7762 } \ 7763 \ 7764 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7765 float_status *status) \ 7766 { \ 7767 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7768 } \ 7769 \ 7770 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7771 float_status *status) \ 7772 { \ 7773 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7774 } \ 7775 \ 7776 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7777 float_status *status) \ 7778 { \ 7779 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7780 } 7781 7782 MINMAX(32) 7783 MINMAX(64) 7784 7785 7786 /* Multiply A by 2 raised to the power N. 
*/ 7787 float32 float32_scalbn(float32 a, int n, float_status *status) 7788 { 7789 flag aSign; 7790 int16_t aExp; 7791 uint32_t aSig; 7792 7793 a = float32_squash_input_denormal(a, status); 7794 aSig = extractFloat32Frac( a ); 7795 aExp = extractFloat32Exp( a ); 7796 aSign = extractFloat32Sign( a ); 7797 7798 if ( aExp == 0xFF ) { 7799 if ( aSig ) { 7800 return propagateFloat32NaN(a, a, status); 7801 } 7802 return a; 7803 } 7804 if (aExp != 0) { 7805 aSig |= 0x00800000; 7806 } else if (aSig == 0) { 7807 return a; 7808 } else { 7809 aExp++; 7810 } 7811 7812 if (n > 0x200) { 7813 n = 0x200; 7814 } else if (n < -0x200) { 7815 n = -0x200; 7816 } 7817 7818 aExp += n - 1; 7819 aSig <<= 7; 7820 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7821 } 7822 7823 float64 float64_scalbn(float64 a, int n, float_status *status) 7824 { 7825 flag aSign; 7826 int16_t aExp; 7827 uint64_t aSig; 7828 7829 a = float64_squash_input_denormal(a, status); 7830 aSig = extractFloat64Frac( a ); 7831 aExp = extractFloat64Exp( a ); 7832 aSign = extractFloat64Sign( a ); 7833 7834 if ( aExp == 0x7FF ) { 7835 if ( aSig ) { 7836 return propagateFloat64NaN(a, a, status); 7837 } 7838 return a; 7839 } 7840 if (aExp != 0) { 7841 aSig |= LIT64( 0x0010000000000000 ); 7842 } else if (aSig == 0) { 7843 return a; 7844 } else { 7845 aExp++; 7846 } 7847 7848 if (n > 0x1000) { 7849 n = 0x1000; 7850 } else if (n < -0x1000) { 7851 n = -0x1000; 7852 } 7853 7854 aExp += n - 1; 7855 aSig <<= 10; 7856 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7857 } 7858 7859 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7860 { 7861 flag aSign; 7862 int32_t aExp; 7863 uint64_t aSig; 7864 7865 if (floatx80_invalid_encoding(a)) { 7866 float_raise(float_flag_invalid, status); 7867 return floatx80_default_nan(status); 7868 } 7869 aSig = extractFloatx80Frac( a ); 7870 aExp = extractFloatx80Exp( a ); 7871 aSign = extractFloatx80Sign( a ); 7872 7873 if ( aExp == 0x7FFF ) { 7874 if ( 
aSig<<1 ) { 7875 return propagateFloatx80NaN(a, a, status); 7876 } 7877 return a; 7878 } 7879 7880 if (aExp == 0) { 7881 if (aSig == 0) { 7882 return a; 7883 } 7884 aExp++; 7885 } 7886 7887 if (n > 0x10000) { 7888 n = 0x10000; 7889 } else if (n < -0x10000) { 7890 n = -0x10000; 7891 } 7892 7893 aExp += n; 7894 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7895 aSign, aExp, aSig, 0, status); 7896 } 7897 7898 float128 float128_scalbn(float128 a, int n, float_status *status) 7899 { 7900 flag aSign; 7901 int32_t aExp; 7902 uint64_t aSig0, aSig1; 7903 7904 aSig1 = extractFloat128Frac1( a ); 7905 aSig0 = extractFloat128Frac0( a ); 7906 aExp = extractFloat128Exp( a ); 7907 aSign = extractFloat128Sign( a ); 7908 if ( aExp == 0x7FFF ) { 7909 if ( aSig0 | aSig1 ) { 7910 return propagateFloat128NaN(a, a, status); 7911 } 7912 return a; 7913 } 7914 if (aExp != 0) { 7915 aSig0 |= LIT64( 0x0001000000000000 ); 7916 } else if (aSig0 == 0 && aSig1 == 0) { 7917 return a; 7918 } else { 7919 aExp++; 7920 } 7921 7922 if (n > 0x10000) { 7923 n = 0x10000; 7924 } else if (n < -0x10000) { 7925 n = -0x10000; 7926 } 7927 7928 aExp += n - 1; 7929 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7930 , status); 7931 7932 } 7933