1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 
105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 
146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /*---------------------------------------------------------------------------- 190 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 191 | and 7, and returns the properly rounded 32-bit integer corresponding to the 192 | input. If `zSign' is 1, the input is negated before being converted to an 193 | integer. 
Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 194 | is simply rounded to an integer, with the inexact exception raised if the 195 | input cannot be represented exactly as an integer. However, if the fixed- 196 | point input is too large, the invalid exception is raised and the largest 197 | positive or negative integer is returned. 198 *----------------------------------------------------------------------------*/ 199 200 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 201 { 202 int8_t roundingMode; 203 flag roundNearestEven; 204 int8_t roundIncrement, roundBits; 205 int32_t z; 206 207 roundingMode = status->float_rounding_mode; 208 roundNearestEven = ( roundingMode == float_round_nearest_even ); 209 switch (roundingMode) { 210 case float_round_nearest_even: 211 case float_round_ties_away: 212 roundIncrement = 0x40; 213 break; 214 case float_round_to_zero: 215 roundIncrement = 0; 216 break; 217 case float_round_up: 218 roundIncrement = zSign ? 0 : 0x7f; 219 break; 220 case float_round_down: 221 roundIncrement = zSign ? 0x7f : 0; 222 break; 223 default: 224 abort(); 225 } 226 roundBits = absZ & 0x7F; 227 absZ = ( absZ + roundIncrement )>>7; 228 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 229 z = absZ; 230 if ( zSign ) z = - z; 231 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 232 float_raise(float_flag_invalid, status); 233 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 234 } 235 if (roundBits) { 236 status->float_exception_flags |= float_flag_inexact; 237 } 238 return z; 239 240 } 241 242 /*---------------------------------------------------------------------------- 243 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 244 | `absZ1', with binary point between bits 63 and 64 (between the input words), 245 | and returns the properly rounded 64-bit integer corresponding to the input. 
246 | If `zSign' is 1, the input is negated before being converted to an integer. 247 | Ordinarily, the fixed-point input is simply rounded to an integer, with 248 | the inexact exception raised if the input cannot be represented exactly as 249 | an integer. However, if the fixed-point input is too large, the invalid 250 | exception is raised and the largest positive or negative integer is 251 | returned. 252 *----------------------------------------------------------------------------*/ 253 254 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 255 float_status *status) 256 { 257 int8_t roundingMode; 258 flag roundNearestEven, increment; 259 int64_t z; 260 261 roundingMode = status->float_rounding_mode; 262 roundNearestEven = ( roundingMode == float_round_nearest_even ); 263 switch (roundingMode) { 264 case float_round_nearest_even: 265 case float_round_ties_away: 266 increment = ((int64_t) absZ1 < 0); 267 break; 268 case float_round_to_zero: 269 increment = 0; 270 break; 271 case float_round_up: 272 increment = !zSign && absZ1; 273 break; 274 case float_round_down: 275 increment = zSign && absZ1; 276 break; 277 default: 278 abort(); 279 } 280 if ( increment ) { 281 ++absZ0; 282 if ( absZ0 == 0 ) goto overflow; 283 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 284 } 285 z = absZ0; 286 if ( zSign ) z = - z; 287 if ( z && ( ( z < 0 ) ^ zSign ) ) { 288 overflow: 289 float_raise(float_flag_invalid, status); 290 return 291 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 292 : LIT64( 0x7FFFFFFFFFFFFFFF ); 293 } 294 if (absZ1) { 295 status->float_exception_flags |= float_flag_inexact; 296 } 297 return z; 298 299 } 300 301 /*---------------------------------------------------------------------------- 302 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 303 | `absZ1', with binary point between bits 63 and 64 (between the input words), 304 | and returns the properly rounded 64-bit unsigned integer corresponding to the 305 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 306 | with the inexact exception raised if the input cannot be represented exactly 307 | as an integer. However, if the fixed-point input is too large, the invalid 308 | exception is raised and the largest unsigned integer is returned. 309 *----------------------------------------------------------------------------*/ 310 311 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 312 uint64_t absZ1, float_status *status) 313 { 314 int8_t roundingMode; 315 flag roundNearestEven, increment; 316 317 roundingMode = status->float_rounding_mode; 318 roundNearestEven = (roundingMode == float_round_nearest_even); 319 switch (roundingMode) { 320 case float_round_nearest_even: 321 case float_round_ties_away: 322 increment = ((int64_t)absZ1 < 0); 323 break; 324 case float_round_to_zero: 325 increment = 0; 326 break; 327 case float_round_up: 328 increment = !zSign && absZ1; 329 break; 330 case float_round_down: 331 increment = zSign && absZ1; 332 break; 333 default: 334 abort(); 335 } 336 if (increment) { 337 ++absZ0; 338 if (absZ0 == 0) { 339 float_raise(float_flag_invalid, status); 340 return LIT64(0xFFFFFFFFFFFFFFFF); 341 } 342 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 343 } 344 345 if (zSign && absZ0) { 346 float_raise(float_flag_invalid, status); 347 return 0; 348 } 349 350 if (absZ1) { 351 status->float_exception_flags |= float_flag_inexact; 352 } 353 return 
absZ0; 354 } 355 356 /*---------------------------------------------------------------------------- 357 | If `a' is denormal and we are in flush-to-zero mode then set the 358 | input-denormal exception and return zero. Otherwise just return the value. 359 *----------------------------------------------------------------------------*/ 360 float32 float32_squash_input_denormal(float32 a, float_status *status) 361 { 362 if (status->flush_inputs_to_zero) { 363 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 364 float_raise(float_flag_input_denormal, status); 365 return make_float32(float32_val(a) & 0x80000000); 366 } 367 } 368 return a; 369 } 370 371 /*---------------------------------------------------------------------------- 372 | Normalizes the subnormal single-precision floating-point value represented 373 | by the denormalized significand `aSig'. The normalized exponent and 374 | significand are stored at the locations pointed to by `zExpPtr' and 375 | `zSigPtr', respectively. 376 *----------------------------------------------------------------------------*/ 377 378 static void 379 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 380 { 381 int8_t shiftCount; 382 383 shiftCount = countLeadingZeros32( aSig ) - 8; 384 *zSigPtr = aSig<<shiftCount; 385 *zExpPtr = 1 - shiftCount; 386 387 } 388 389 /*---------------------------------------------------------------------------- 390 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 391 | single-precision floating-point value, returning the result. After being 392 | shifted into the proper positions, the three fields are simply added 393 | together to form the result. This means that any integer portion of `zSig' 394 | will be added into the exponent. 
Since a properly normalized significand 395 | will have an integer portion equal to 1, the `zExp' input should be 1 less 396 | than the desired result exponent whenever `zSig' is a complete, normalized 397 | significand. 398 *----------------------------------------------------------------------------*/ 399 400 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 401 { 402 403 return make_float32( 404 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 405 406 } 407 408 /*---------------------------------------------------------------------------- 409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 410 | and significand `zSig', and returns the proper single-precision floating- 411 | point value corresponding to the abstract input. Ordinarily, the abstract 412 | value is simply rounded and packed into the single-precision format, with 413 | the inexact exception raised if the abstract input cannot be represented 414 | exactly. However, if the abstract value is too large, the overflow and 415 | inexact exceptions are raised and an infinity or maximal finite value is 416 | returned. If the abstract value is too small, the input value is rounded to 417 | a subnormal number, and the underflow and inexact exceptions are raised if 418 | the abstract input cannot be represented exactly as a subnormal single- 419 | precision floating-point number. 420 | The input significand `zSig' has its binary point between bits 30 421 | and 29, which is 7 bits to the left of the usual location. This shifted 422 | significand must be normalized or smaller. If `zSig' is not normalized, 423 | `zExp' must be 0; in that case, the result returned is a subnormal number, 424 | and it must not require rounding. In the usual case that `zSig' is 425 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
426 | The handling of underflow and overflow follows the IEC/IEEE Standard for 427 | Binary Floating-Point Arithmetic. 428 *----------------------------------------------------------------------------*/ 429 430 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 431 float_status *status) 432 { 433 int8_t roundingMode; 434 flag roundNearestEven; 435 int8_t roundIncrement, roundBits; 436 flag isTiny; 437 438 roundingMode = status->float_rounding_mode; 439 roundNearestEven = ( roundingMode == float_round_nearest_even ); 440 switch (roundingMode) { 441 case float_round_nearest_even: 442 case float_round_ties_away: 443 roundIncrement = 0x40; 444 break; 445 case float_round_to_zero: 446 roundIncrement = 0; 447 break; 448 case float_round_up: 449 roundIncrement = zSign ? 0 : 0x7f; 450 break; 451 case float_round_down: 452 roundIncrement = zSign ? 0x7f : 0; 453 break; 454 default: 455 abort(); 456 break; 457 } 458 roundBits = zSig & 0x7F; 459 if ( 0xFD <= (uint16_t) zExp ) { 460 if ( ( 0xFD < zExp ) 461 || ( ( zExp == 0xFD ) 462 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 463 ) { 464 float_raise(float_flag_overflow | float_flag_inexact, status); 465 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 466 } 467 if ( zExp < 0 ) { 468 if (status->flush_to_zero) { 469 float_raise(float_flag_output_denormal, status); 470 return packFloat32(zSign, 0, 0); 471 } 472 isTiny = 473 (status->float_detect_tininess 474 == float_tininess_before_rounding) 475 || ( zExp < -1 ) 476 || ( zSig + roundIncrement < 0x80000000 ); 477 shift32RightJamming( zSig, - zExp, &zSig ); 478 zExp = 0; 479 roundBits = zSig & 0x7F; 480 if (isTiny && roundBits) { 481 float_raise(float_flag_underflow, status); 482 } 483 } 484 } 485 if (roundBits) { 486 status->float_exception_flags |= float_flag_inexact; 487 } 488 zSig = ( zSig + roundIncrement )>>7; 489 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 490 if ( zSig == 0 ) zExp = 0; 491 return packFloat32( zSign, 
zExp, zSig ); 492 493 } 494 495 /*---------------------------------------------------------------------------- 496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 497 | and significand `zSig', and returns the proper single-precision floating- 498 | point value corresponding to the abstract input. This routine is just like 499 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 500 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 501 | floating-point exponent. 502 *----------------------------------------------------------------------------*/ 503 504 static float32 505 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 506 float_status *status) 507 { 508 int8_t shiftCount; 509 510 shiftCount = countLeadingZeros32( zSig ) - 1; 511 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 512 status); 513 514 } 515 516 /*---------------------------------------------------------------------------- 517 | If `a' is denormal and we are in flush-to-zero mode then set the 518 | input-denormal exception and return zero. Otherwise just return the value. 519 *----------------------------------------------------------------------------*/ 520 float64 float64_squash_input_denormal(float64 a, float_status *status) 521 { 522 if (status->flush_inputs_to_zero) { 523 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 524 float_raise(float_flag_input_denormal, status); 525 return make_float64(float64_val(a) & (1ULL << 63)); 526 } 527 } 528 return a; 529 } 530 531 /*---------------------------------------------------------------------------- 532 | Normalizes the subnormal double-precision floating-point value represented 533 | by the denormalized significand `aSig'. The normalized exponent and 534 | significand are stored at the locations pointed to by `zExpPtr' and 535 | `zSigPtr', respectively. 
536 *----------------------------------------------------------------------------*/ 537 538 static void 539 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 540 { 541 int8_t shiftCount; 542 543 shiftCount = countLeadingZeros64( aSig ) - 11; 544 *zSigPtr = aSig<<shiftCount; 545 *zExpPtr = 1 - shiftCount; 546 547 } 548 549 /*---------------------------------------------------------------------------- 550 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 551 | double-precision floating-point value, returning the result. After being 552 | shifted into the proper positions, the three fields are simply added 553 | together to form the result. This means that any integer portion of `zSig' 554 | will be added into the exponent. Since a properly normalized significand 555 | will have an integer portion equal to 1, the `zExp' input should be 1 less 556 | than the desired result exponent whenever `zSig' is a complete, normalized 557 | significand. 558 *----------------------------------------------------------------------------*/ 559 560 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 561 { 562 563 return make_float64( 564 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 565 566 } 567 568 /*---------------------------------------------------------------------------- 569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 570 | and significand `zSig', and returns the proper double-precision floating- 571 | point value corresponding to the abstract input. Ordinarily, the abstract 572 | value is simply rounded and packed into the double-precision format, with 573 | the inexact exception raised if the abstract input cannot be represented 574 | exactly. However, if the abstract value is too large, the overflow and 575 | inexact exceptions are raised and an infinity or maximal finite value is 576 | returned. 
If the abstract value is too small, the input value is rounded to 577 | a subnormal number, and the underflow and inexact exceptions are raised if 578 | the abstract input cannot be represented exactly as a subnormal double- 579 | precision floating-point number. 580 | The input significand `zSig' has its binary point between bits 62 581 | and 61, which is 10 bits to the left of the usual location. This shifted 582 | significand must be normalized or smaller. If `zSig' is not normalized, 583 | `zExp' must be 0; in that case, the result returned is a subnormal number, 584 | and it must not require rounding. In the usual case that `zSig' is 585 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 586 | The handling of underflow and overflow follows the IEC/IEEE Standard for 587 | Binary Floating-Point Arithmetic. 588 *----------------------------------------------------------------------------*/ 589 590 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 591 float_status *status) 592 { 593 int8_t roundingMode; 594 flag roundNearestEven; 595 int roundIncrement, roundBits; 596 flag isTiny; 597 598 roundingMode = status->float_rounding_mode; 599 roundNearestEven = ( roundingMode == float_round_nearest_even ); 600 switch (roundingMode) { 601 case float_round_nearest_even: 602 case float_round_ties_away: 603 roundIncrement = 0x200; 604 break; 605 case float_round_to_zero: 606 roundIncrement = 0; 607 break; 608 case float_round_up: 609 roundIncrement = zSign ? 0 : 0x3ff; 610 break; 611 case float_round_down: 612 roundIncrement = zSign ? 0x3ff : 0; 613 break; 614 case float_round_to_odd: 615 roundIncrement = (zSig & 0x400) ? 
0 : 0x3ff; 616 break; 617 default: 618 abort(); 619 } 620 roundBits = zSig & 0x3FF; 621 if ( 0x7FD <= (uint16_t) zExp ) { 622 if ( ( 0x7FD < zExp ) 623 || ( ( zExp == 0x7FD ) 624 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 625 ) { 626 bool overflow_to_inf = roundingMode != float_round_to_odd && 627 roundIncrement != 0; 628 float_raise(float_flag_overflow | float_flag_inexact, status); 629 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 630 } 631 if ( zExp < 0 ) { 632 if (status->flush_to_zero) { 633 float_raise(float_flag_output_denormal, status); 634 return packFloat64(zSign, 0, 0); 635 } 636 isTiny = 637 (status->float_detect_tininess 638 == float_tininess_before_rounding) 639 || ( zExp < -1 ) 640 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 641 shift64RightJamming( zSig, - zExp, &zSig ); 642 zExp = 0; 643 roundBits = zSig & 0x3FF; 644 if (isTiny && roundBits) { 645 float_raise(float_flag_underflow, status); 646 } 647 if (roundingMode == float_round_to_odd) { 648 /* 649 * For round-to-odd case, the roundIncrement depends on 650 * zSig which just changed. 651 */ 652 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 653 } 654 } 655 } 656 if (roundBits) { 657 status->float_exception_flags |= float_flag_inexact; 658 } 659 zSig = ( zSig + roundIncrement )>>10; 660 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 661 if ( zSig == 0 ) zExp = 0; 662 return packFloat64( zSign, zExp, zSig ); 663 664 } 665 666 /*---------------------------------------------------------------------------- 667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 668 | and significand `zSig', and returns the proper double-precision floating- 669 | point value corresponding to the abstract input. This routine is just like 670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 672 | floating-point exponent. 
673 *----------------------------------------------------------------------------*/ 674 675 static float64 676 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 677 float_status *status) 678 { 679 int8_t shiftCount; 680 681 shiftCount = countLeadingZeros64( zSig ) - 1; 682 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 683 status); 684 685 } 686 687 /*---------------------------------------------------------------------------- 688 | Returns the fraction bits of the extended double-precision floating-point 689 | value `a'. 690 *----------------------------------------------------------------------------*/ 691 692 static inline uint64_t extractFloatx80Frac( floatx80 a ) 693 { 694 695 return a.low; 696 697 } 698 699 /*---------------------------------------------------------------------------- 700 | Returns the exponent bits of the extended double-precision floating-point 701 | value `a'. 702 *----------------------------------------------------------------------------*/ 703 704 static inline int32_t extractFloatx80Exp( floatx80 a ) 705 { 706 707 return a.high & 0x7FFF; 708 709 } 710 711 /*---------------------------------------------------------------------------- 712 | Returns the sign bit of the extended double-precision floating-point value 713 | `a'. 714 *----------------------------------------------------------------------------*/ 715 716 static inline flag extractFloatx80Sign( floatx80 a ) 717 { 718 719 return a.high>>15; 720 721 } 722 723 /*---------------------------------------------------------------------------- 724 | Normalizes the subnormal extended double-precision floating-point value 725 | represented by the denormalized significand `aSig'. The normalized exponent 726 | and significand are stored at the locations pointed to by `zExpPtr' and 727 | `zSigPtr', respectively. 
728 *----------------------------------------------------------------------------*/ 729 730 static void 731 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 732 { 733 int8_t shiftCount; 734 735 shiftCount = countLeadingZeros64( aSig ); 736 *zSigPtr = aSig<<shiftCount; 737 *zExpPtr = 1 - shiftCount; 738 739 } 740 741 /*---------------------------------------------------------------------------- 742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 743 | extended double-precision floating-point value, returning the result. 744 *----------------------------------------------------------------------------*/ 745 746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 747 { 748 floatx80 z; 749 750 z.low = zSig; 751 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 752 return z; 753 754 } 755 756 /*---------------------------------------------------------------------------- 757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 758 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 759 | and returns the proper extended double-precision floating-point value 760 | corresponding to the abstract input. Ordinarily, the abstract value is 761 | rounded and packed into the extended double-precision format, with the 762 | inexact exception raised if the abstract input cannot be represented 763 | exactly. However, if the abstract value is too large, the overflow and 764 | inexact exceptions are raised and an infinity or maximal finite value is 765 | returned. If the abstract value is too small, the input value is rounded to 766 | a subnormal number, and the underflow and inexact exceptions are raised if 767 | the abstract input cannot be represented exactly as a subnormal extended 768 | double-precision floating-point number. 
769 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 770 | number of bits as single or double precision, respectively. Otherwise, the 771 | result is rounded to the full precision of the extended double-precision 772 | format. 773 | The input significand must be normalized or smaller. If the input 774 | significand is not normalized, `zExp' must be 0; in that case, the result 775 | returned is a subnormal number, and it must not require rounding. The 776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 777 | Floating-Point Arithmetic. 778 *----------------------------------------------------------------------------*/ 779 780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 781 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 782 float_status *status) 783 { 784 int8_t roundingMode; 785 flag roundNearestEven, increment, isTiny; 786 int64_t roundIncrement, roundMask, roundBits; 787 788 roundingMode = status->float_rounding_mode; 789 roundNearestEven = ( roundingMode == float_round_nearest_even ); 790 if ( roundingPrecision == 80 ) goto precision80; 791 if ( roundingPrecision == 64 ) { 792 roundIncrement = LIT64( 0x0000000000000400 ); 793 roundMask = LIT64( 0x00000000000007FF ); 794 } 795 else if ( roundingPrecision == 32 ) { 796 roundIncrement = LIT64( 0x0000008000000000 ); 797 roundMask = LIT64( 0x000000FFFFFFFFFF ); 798 } 799 else { 800 goto precision80; 801 } 802 zSig0 |= ( zSig1 != 0 ); 803 switch (roundingMode) { 804 case float_round_nearest_even: 805 case float_round_ties_away: 806 break; 807 case float_round_to_zero: 808 roundIncrement = 0; 809 break; 810 case float_round_up: 811 roundIncrement = zSign ? 0 : roundMask; 812 break; 813 case float_round_down: 814 roundIncrement = zSign ? 
roundMask : 0; 815 break; 816 default: 817 abort(); 818 } 819 roundBits = zSig0 & roundMask; 820 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 821 if ( ( 0x7FFE < zExp ) 822 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 823 ) { 824 goto overflow; 825 } 826 if ( zExp <= 0 ) { 827 if (status->flush_to_zero) { 828 float_raise(float_flag_output_denormal, status); 829 return packFloatx80(zSign, 0, 0); 830 } 831 isTiny = 832 (status->float_detect_tininess 833 == float_tininess_before_rounding) 834 || ( zExp < 0 ) 835 || ( zSig0 <= zSig0 + roundIncrement ); 836 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 837 zExp = 0; 838 roundBits = zSig0 & roundMask; 839 if (isTiny && roundBits) { 840 float_raise(float_flag_underflow, status); 841 } 842 if (roundBits) { 843 status->float_exception_flags |= float_flag_inexact; 844 } 845 zSig0 += roundIncrement; 846 if ( (int64_t) zSig0 < 0 ) zExp = 1; 847 roundIncrement = roundMask + 1; 848 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 849 roundMask |= roundIncrement; 850 } 851 zSig0 &= ~ roundMask; 852 return packFloatx80( zSign, zExp, zSig0 ); 853 } 854 } 855 if (roundBits) { 856 status->float_exception_flags |= float_flag_inexact; 857 } 858 zSig0 += roundIncrement; 859 if ( zSig0 < roundIncrement ) { 860 ++zExp; 861 zSig0 = LIT64( 0x8000000000000000 ); 862 } 863 roundIncrement = roundMask + 1; 864 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 865 roundMask |= roundIncrement; 866 } 867 zSig0 &= ~ roundMask; 868 if ( zSig0 == 0 ) zExp = 0; 869 return packFloatx80( zSign, zExp, zSig0 ); 870 precision80: 871 switch (roundingMode) { 872 case float_round_nearest_even: 873 case float_round_ties_away: 874 increment = ((int64_t)zSig1 < 0); 875 break; 876 case float_round_to_zero: 877 increment = 0; 878 break; 879 case float_round_up: 880 increment = !zSign && zSig1; 881 break; 882 case float_round_down: 883 increment = zSign && zSig1; 884 break; 885 default: 886 abort(); 887 } 888 if ( 
0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 889 if ( ( 0x7FFE < zExp ) 890 || ( ( zExp == 0x7FFE ) 891 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 892 && increment 893 ) 894 ) { 895 roundMask = 0; 896 overflow: 897 float_raise(float_flag_overflow | float_flag_inexact, status); 898 if ( ( roundingMode == float_round_to_zero ) 899 || ( zSign && ( roundingMode == float_round_up ) ) 900 || ( ! zSign && ( roundingMode == float_round_down ) ) 901 ) { 902 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 903 } 904 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 905 } 906 if ( zExp <= 0 ) { 907 isTiny = 908 (status->float_detect_tininess 909 == float_tininess_before_rounding) 910 || ( zExp < 0 ) 911 || ! increment 912 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 913 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 914 zExp = 0; 915 if (isTiny && zSig1) { 916 float_raise(float_flag_underflow, status); 917 } 918 if (zSig1) { 919 status->float_exception_flags |= float_flag_inexact; 920 } 921 switch (roundingMode) { 922 case float_round_nearest_even: 923 case float_round_ties_away: 924 increment = ((int64_t)zSig1 < 0); 925 break; 926 case float_round_to_zero: 927 increment = 0; 928 break; 929 case float_round_up: 930 increment = !zSign && zSig1; 931 break; 932 case float_round_down: 933 increment = zSign && zSig1; 934 break; 935 default: 936 abort(); 937 } 938 if ( increment ) { 939 ++zSig0; 940 zSig0 &= 941 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 942 if ( (int64_t) zSig0 < 0 ) zExp = 1; 943 } 944 return packFloatx80( zSign, zExp, zSig0 ); 945 } 946 } 947 if (zSig1) { 948 status->float_exception_flags |= float_flag_inexact; 949 } 950 if ( increment ) { 951 ++zSig0; 952 if ( zSig0 == 0 ) { 953 ++zExp; 954 zSig0 = LIT64( 0x8000000000000000 ); 955 } 956 else { 957 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 958 } 959 } 960 else { 961 if ( zSig0 == 0 ) zExp = 0; 962 } 963 return packFloatx80( zSign, zExp, 
zSig0 ); 964 965 } 966 967 /*---------------------------------------------------------------------------- 968 | Takes an abstract floating-point value having sign `zSign', exponent 969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 970 | and returns the proper extended double-precision floating-point value 971 | corresponding to the abstract input. This routine is just like 972 | `roundAndPackFloatx80' except that the input significand does not have to be 973 | normalized. 974 *----------------------------------------------------------------------------*/ 975 976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 977 flag zSign, int32_t zExp, 978 uint64_t zSig0, uint64_t zSig1, 979 float_status *status) 980 { 981 int8_t shiftCount; 982 983 if ( zSig0 == 0 ) { 984 zSig0 = zSig1; 985 zSig1 = 0; 986 zExp -= 64; 987 } 988 shiftCount = countLeadingZeros64( zSig0 ); 989 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 990 zExp -= shiftCount; 991 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 992 zSig0, zSig1, status); 993 994 } 995 996 /*---------------------------------------------------------------------------- 997 | Returns the least-significant 64 fraction bits of the quadruple-precision 998 | floating-point value `a'. 999 *----------------------------------------------------------------------------*/ 1000 1001 static inline uint64_t extractFloat128Frac1( float128 a ) 1002 { 1003 1004 return a.low; 1005 1006 } 1007 1008 /*---------------------------------------------------------------------------- 1009 | Returns the most-significant 48 fraction bits of the quadruple-precision 1010 | floating-point value `a'. 
1011 *----------------------------------------------------------------------------*/ 1012 1013 static inline uint64_t extractFloat128Frac0( float128 a ) 1014 { 1015 1016 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 1017 1018 } 1019 1020 /*---------------------------------------------------------------------------- 1021 | Returns the exponent bits of the quadruple-precision floating-point value 1022 | `a'. 1023 *----------------------------------------------------------------------------*/ 1024 1025 static inline int32_t extractFloat128Exp( float128 a ) 1026 { 1027 1028 return ( a.high>>48 ) & 0x7FFF; 1029 1030 } 1031 1032 /*---------------------------------------------------------------------------- 1033 | Returns the sign bit of the quadruple-precision floating-point value `a'. 1034 *----------------------------------------------------------------------------*/ 1035 1036 static inline flag extractFloat128Sign( float128 a ) 1037 { 1038 1039 return a.high>>63; 1040 1041 } 1042 1043 /*---------------------------------------------------------------------------- 1044 | Normalizes the subnormal quadruple-precision floating-point value 1045 | represented by the denormalized significand formed by the concatenation of 1046 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 1047 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 1048 | significand are stored at the location pointed to by `zSig0Ptr', and the 1049 | least significant 64 bits of the normalized significand are stored at the 1050 | location pointed to by `zSig1Ptr'. 
1051 *----------------------------------------------------------------------------*/ 1052 1053 static void 1054 normalizeFloat128Subnormal( 1055 uint64_t aSig0, 1056 uint64_t aSig1, 1057 int32_t *zExpPtr, 1058 uint64_t *zSig0Ptr, 1059 uint64_t *zSig1Ptr 1060 ) 1061 { 1062 int8_t shiftCount; 1063 1064 if ( aSig0 == 0 ) { 1065 shiftCount = countLeadingZeros64( aSig1 ) - 15; 1066 if ( shiftCount < 0 ) { 1067 *zSig0Ptr = aSig1>>( - shiftCount ); 1068 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 1069 } 1070 else { 1071 *zSig0Ptr = aSig1<<shiftCount; 1072 *zSig1Ptr = 0; 1073 } 1074 *zExpPtr = - shiftCount - 63; 1075 } 1076 else { 1077 shiftCount = countLeadingZeros64( aSig0 ) - 15; 1078 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 1079 *zExpPtr = 1 - shiftCount; 1080 } 1081 1082 } 1083 1084 /*---------------------------------------------------------------------------- 1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed 1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 1087 | floating-point value, returning the result. After being shifted into the 1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 1089 | added together to form the most significant 32 bits of the result. This 1090 | means that any integer portion of `zSig0' will be added into the exponent. 1091 | Since a properly normalized significand will have an integer portion equal 1092 | to 1, the `zExp' input should be 1 less than the desired result exponent 1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 1094 | significand. 
1095 *----------------------------------------------------------------------------*/ 1096 1097 static inline float128 1098 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 1099 { 1100 float128 z; 1101 1102 z.low = zSig1; 1103 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 1104 return z; 1105 1106 } 1107 1108 /*---------------------------------------------------------------------------- 1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1110 | and extended significand formed by the concatenation of `zSig0', `zSig1', 1111 | and `zSig2', and returns the proper quadruple-precision floating-point value 1112 | corresponding to the abstract input. Ordinarily, the abstract value is 1113 | simply rounded and packed into the quadruple-precision format, with the 1114 | inexact exception raised if the abstract input cannot be represented 1115 | exactly. However, if the abstract value is too large, the overflow and 1116 | inexact exceptions are raised and an infinity or maximal finite value is 1117 | returned. If the abstract value is too small, the input value is rounded to 1118 | a subnormal number, and the underflow and inexact exceptions are raised if 1119 | the abstract input cannot be represented exactly as a subnormal quadruple- 1120 | precision floating-point number. 1121 | The input significand must be normalized or smaller. If the input 1122 | significand is not normalized, `zExp' must be 0; in that case, the result 1123 | returned is a subnormal number, and it must not require rounding. In the 1124 | usual case that the input significand is normalized, `zExp' must be 1 less 1125 | than the ``true'' floating-point exponent. The handling of underflow and 1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1127 *----------------------------------------------------------------------------*/ 1128 1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 1130 uint64_t zSig0, uint64_t zSig1, 1131 uint64_t zSig2, float_status *status) 1132 { 1133 int8_t roundingMode; 1134 flag roundNearestEven, increment, isTiny; 1135 1136 roundingMode = status->float_rounding_mode; 1137 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1138 switch (roundingMode) { 1139 case float_round_nearest_even: 1140 case float_round_ties_away: 1141 increment = ((int64_t)zSig2 < 0); 1142 break; 1143 case float_round_to_zero: 1144 increment = 0; 1145 break; 1146 case float_round_up: 1147 increment = !zSign && zSig2; 1148 break; 1149 case float_round_down: 1150 increment = zSign && zSig2; 1151 break; 1152 case float_round_to_odd: 1153 increment = !(zSig1 & 0x1) && zSig2; 1154 break; 1155 default: 1156 abort(); 1157 } 1158 if ( 0x7FFD <= (uint32_t) zExp ) { 1159 if ( ( 0x7FFD < zExp ) 1160 || ( ( zExp == 0x7FFD ) 1161 && eq128( 1162 LIT64( 0x0001FFFFFFFFFFFF ), 1163 LIT64( 0xFFFFFFFFFFFFFFFF ), 1164 zSig0, 1165 zSig1 1166 ) 1167 && increment 1168 ) 1169 ) { 1170 float_raise(float_flag_overflow | float_flag_inexact, status); 1171 if ( ( roundingMode == float_round_to_zero ) 1172 || ( zSign && ( roundingMode == float_round_up ) ) 1173 || ( ! zSign && ( roundingMode == float_round_down ) ) 1174 || (roundingMode == float_round_to_odd) 1175 ) { 1176 return 1177 packFloat128( 1178 zSign, 1179 0x7FFE, 1180 LIT64( 0x0000FFFFFFFFFFFF ), 1181 LIT64( 0xFFFFFFFFFFFFFFFF ) 1182 ); 1183 } 1184 return packFloat128( zSign, 0x7FFF, 0, 0 ); 1185 } 1186 if ( zExp < 0 ) { 1187 if (status->flush_to_zero) { 1188 float_raise(float_flag_output_denormal, status); 1189 return packFloat128(zSign, 0, 0, 0); 1190 } 1191 isTiny = 1192 (status->float_detect_tininess 1193 == float_tininess_before_rounding) 1194 || ( zExp < -1 ) 1195 || ! 
increment 1196 || lt128( 1197 zSig0, 1198 zSig1, 1199 LIT64( 0x0001FFFFFFFFFFFF ), 1200 LIT64( 0xFFFFFFFFFFFFFFFF ) 1201 ); 1202 shift128ExtraRightJamming( 1203 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 1204 zExp = 0; 1205 if (isTiny && zSig2) { 1206 float_raise(float_flag_underflow, status); 1207 } 1208 switch (roundingMode) { 1209 case float_round_nearest_even: 1210 case float_round_ties_away: 1211 increment = ((int64_t)zSig2 < 0); 1212 break; 1213 case float_round_to_zero: 1214 increment = 0; 1215 break; 1216 case float_round_up: 1217 increment = !zSign && zSig2; 1218 break; 1219 case float_round_down: 1220 increment = zSign && zSig2; 1221 break; 1222 case float_round_to_odd: 1223 increment = !(zSig1 & 0x1) && zSig2; 1224 break; 1225 default: 1226 abort(); 1227 } 1228 } 1229 } 1230 if (zSig2) { 1231 status->float_exception_flags |= float_flag_inexact; 1232 } 1233 if ( increment ) { 1234 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 1235 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 1236 } 1237 else { 1238 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 1239 } 1240 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1241 1242 } 1243 1244 /*---------------------------------------------------------------------------- 1245 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 1246 | and significand formed by the concatenation of `zSig0' and `zSig1', and 1247 | returns the proper quadruple-precision floating-point value corresponding 1248 | to the abstract input. This routine is just like `roundAndPackFloat128' 1249 | except that the input significand has fewer bits and does not have to be 1250 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 1251 | point exponent. 
1252 *----------------------------------------------------------------------------*/ 1253 1254 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 1255 uint64_t zSig0, uint64_t zSig1, 1256 float_status *status) 1257 { 1258 int8_t shiftCount; 1259 uint64_t zSig2; 1260 1261 if ( zSig0 == 0 ) { 1262 zSig0 = zSig1; 1263 zSig1 = 0; 1264 zExp -= 64; 1265 } 1266 shiftCount = countLeadingZeros64( zSig0 ) - 15; 1267 if ( 0 <= shiftCount ) { 1268 zSig2 = 0; 1269 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1270 } 1271 else { 1272 shift128ExtraRightJamming( 1273 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 1274 } 1275 zExp -= shiftCount; 1276 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 1277 1278 } 1279 1280 /*---------------------------------------------------------------------------- 1281 | Returns the result of converting the 32-bit two's complement integer `a' 1282 | to the single-precision floating-point format. The conversion is performed 1283 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1284 *----------------------------------------------------------------------------*/ 1285 1286 float32 int32_to_float32(int32_t a, float_status *status) 1287 { 1288 flag zSign; 1289 1290 if ( a == 0 ) return float32_zero; 1291 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); 1292 zSign = ( a < 0 ); 1293 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status); 1294 } 1295 1296 /*---------------------------------------------------------------------------- 1297 | Returns the result of converting the 32-bit two's complement integer `a' 1298 | to the double-precision floating-point format. The conversion is performed 1299 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1300 *----------------------------------------------------------------------------*/ 1301 1302 float64 int32_to_float64(int32_t a, float_status *status) 1303 { 1304 flag zSign; 1305 uint32_t absA; 1306 int8_t shiftCount; 1307 uint64_t zSig; 1308 1309 if ( a == 0 ) return float64_zero; 1310 zSign = ( a < 0 ); 1311 absA = zSign ? - a : a; 1312 shiftCount = countLeadingZeros32( absA ) + 21; 1313 zSig = absA; 1314 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); 1315 1316 } 1317 1318 /*---------------------------------------------------------------------------- 1319 | Returns the result of converting the 32-bit two's complement integer `a' 1320 | to the extended double-precision floating-point format. The conversion 1321 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1322 | Arithmetic. 1323 *----------------------------------------------------------------------------*/ 1324 1325 floatx80 int32_to_floatx80(int32_t a, float_status *status) 1326 { 1327 flag zSign; 1328 uint32_t absA; 1329 int8_t shiftCount; 1330 uint64_t zSig; 1331 1332 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1333 zSign = ( a < 0 ); 1334 absA = zSign ? - a : a; 1335 shiftCount = countLeadingZeros32( absA ) + 32; 1336 zSig = absA; 1337 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 1338 1339 } 1340 1341 /*---------------------------------------------------------------------------- 1342 | Returns the result of converting the 32-bit two's complement integer `a' to 1343 | the quadruple-precision floating-point format. The conversion is performed 1344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1345 *----------------------------------------------------------------------------*/ 1346 1347 float128 int32_to_float128(int32_t a, float_status *status) 1348 { 1349 flag zSign; 1350 uint32_t absA; 1351 int8_t shiftCount; 1352 uint64_t zSig0; 1353 1354 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1355 zSign = ( a < 0 ); 1356 absA = zSign ? - a : a; 1357 shiftCount = countLeadingZeros32( absA ) + 17; 1358 zSig0 = absA; 1359 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 1360 1361 } 1362 1363 /*---------------------------------------------------------------------------- 1364 | Returns the result of converting the 64-bit two's complement integer `a' 1365 | to the single-precision floating-point format. The conversion is performed 1366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1367 *----------------------------------------------------------------------------*/ 1368 1369 float32 int64_to_float32(int64_t a, float_status *status) 1370 { 1371 flag zSign; 1372 uint64_t absA; 1373 int8_t shiftCount; 1374 1375 if ( a == 0 ) return float32_zero; 1376 zSign = ( a < 0 ); 1377 absA = zSign ? - a : a; 1378 shiftCount = countLeadingZeros64( absA ) - 40; 1379 if ( 0 <= shiftCount ) { 1380 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); 1381 } 1382 else { 1383 shiftCount += 7; 1384 if ( shiftCount < 0 ) { 1385 shift64RightJamming( absA, - shiftCount, &absA ); 1386 } 1387 else { 1388 absA <<= shiftCount; 1389 } 1390 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status); 1391 } 1392 1393 } 1394 1395 /*---------------------------------------------------------------------------- 1396 | Returns the result of converting the 64-bit two's complement integer `a' 1397 | to the double-precision floating-point format. The conversion is performed 1398 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1399 *----------------------------------------------------------------------------*/ 1400 1401 float64 int64_to_float64(int64_t a, float_status *status) 1402 { 1403 flag zSign; 1404 1405 if ( a == 0 ) return float64_zero; 1406 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) { 1407 return packFloat64( 1, 0x43E, 0 ); 1408 } 1409 zSign = ( a < 0 ); 1410 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status); 1411 } 1412 1413 /*---------------------------------------------------------------------------- 1414 | Returns the result of converting the 64-bit two's complement integer `a' 1415 | to the extended double-precision floating-point format. The conversion 1416 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1417 | Arithmetic. 1418 *----------------------------------------------------------------------------*/ 1419 1420 floatx80 int64_to_floatx80(int64_t a, float_status *status) 1421 { 1422 flag zSign; 1423 uint64_t absA; 1424 int8_t shiftCount; 1425 1426 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 1427 zSign = ( a < 0 ); 1428 absA = zSign ? - a : a; 1429 shiftCount = countLeadingZeros64( absA ); 1430 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 1431 1432 } 1433 1434 /*---------------------------------------------------------------------------- 1435 | Returns the result of converting the 64-bit two's complement integer `a' to 1436 | the quadruple-precision floating-point format. The conversion is performed 1437 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1438 *----------------------------------------------------------------------------*/ 1439 1440 float128 int64_to_float128(int64_t a, float_status *status) 1441 { 1442 flag zSign; 1443 uint64_t absA; 1444 int8_t shiftCount; 1445 int32_t zExp; 1446 uint64_t zSig0, zSig1; 1447 1448 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 1449 zSign = ( a < 0 ); 1450 absA = zSign ? 
- a : a; 1451 shiftCount = countLeadingZeros64( absA ) + 49; 1452 zExp = 0x406E - shiftCount; 1453 if ( 64 <= shiftCount ) { 1454 zSig1 = 0; 1455 zSig0 = absA; 1456 shiftCount -= 64; 1457 } 1458 else { 1459 zSig1 = absA; 1460 zSig0 = 0; 1461 } 1462 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 1463 return packFloat128( zSign, zExp, zSig0, zSig1 ); 1464 1465 } 1466 1467 /*---------------------------------------------------------------------------- 1468 | Returns the result of converting the 64-bit unsigned integer `a' 1469 | to the single-precision floating-point format. The conversion is performed 1470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1471 *----------------------------------------------------------------------------*/ 1472 1473 float32 uint64_to_float32(uint64_t a, float_status *status) 1474 { 1475 int shiftcount; 1476 1477 if (a == 0) { 1478 return float32_zero; 1479 } 1480 1481 /* Determine (left) shift needed to put first set bit into bit posn 23 1482 * (since packFloat32() expects the binary point between bits 23 and 22); 1483 * this is the fast case for smallish numbers. 1484 */ 1485 shiftcount = countLeadingZeros64(a) - 40; 1486 if (shiftcount >= 0) { 1487 return packFloat32(0, 0x95 - shiftcount, a << shiftcount); 1488 } 1489 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32() 1490 * expects the binary point between bits 30 and 29, hence the + 7. 1491 */ 1492 shiftcount += 7; 1493 if (shiftcount < 0) { 1494 shift64RightJamming(a, -shiftcount, &a); 1495 } else { 1496 a <<= shiftcount; 1497 } 1498 1499 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status); 1500 } 1501 1502 /*---------------------------------------------------------------------------- 1503 | Returns the result of converting the 64-bit unsigned integer `a' 1504 | to the double-precision floating-point format. The conversion is performed 1505 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1506 *----------------------------------------------------------------------------*/ 1507 1508 float64 uint64_to_float64(uint64_t a, float_status *status) 1509 { 1510 int exp = 0x43C; 1511 int shiftcount; 1512 1513 if (a == 0) { 1514 return float64_zero; 1515 } 1516 1517 shiftcount = countLeadingZeros64(a) - 1; 1518 if (shiftcount < 0) { 1519 shift64RightJamming(a, -shiftcount, &a); 1520 } else { 1521 a <<= shiftcount; 1522 } 1523 return roundAndPackFloat64(0, exp - shiftcount, a, status); 1524 } 1525 1526 /*---------------------------------------------------------------------------- 1527 | Returns the result of converting the 64-bit unsigned integer `a' 1528 | to the quadruple-precision floating-point format. The conversion is performed 1529 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1530 *----------------------------------------------------------------------------*/ 1531 1532 float128 uint64_to_float128(uint64_t a, float_status *status) 1533 { 1534 if (a == 0) { 1535 return float128_zero; 1536 } 1537 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 1538 } 1539 1540 /*---------------------------------------------------------------------------- 1541 | Returns the result of converting the single-precision floating-point value 1542 | `a' to the 32-bit two's complement integer format. The conversion is 1543 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1544 | Arithmetic---which means in particular that the conversion is rounded 1545 | according to the current rounding mode. If `a' is a NaN, the largest 1546 | positive integer is returned. Otherwise, if the conversion overflows, the 1547 | largest integer with the same sign as `a' is returned. 
1548 *----------------------------------------------------------------------------*/ 1549 1550 int32_t float32_to_int32(float32 a, float_status *status) 1551 { 1552 flag aSign; 1553 int aExp; 1554 int shiftCount; 1555 uint32_t aSig; 1556 uint64_t aSig64; 1557 1558 a = float32_squash_input_denormal(a, status); 1559 aSig = extractFloat32Frac( a ); 1560 aExp = extractFloat32Exp( a ); 1561 aSign = extractFloat32Sign( a ); 1562 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; 1563 if ( aExp ) aSig |= 0x00800000; 1564 shiftCount = 0xAF - aExp; 1565 aSig64 = aSig; 1566 aSig64 <<= 32; 1567 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); 1568 return roundAndPackInt32(aSign, aSig64, status); 1569 1570 } 1571 1572 /*---------------------------------------------------------------------------- 1573 | Returns the result of converting the single-precision floating-point value 1574 | `a' to the 32-bit two's complement integer format. The conversion is 1575 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1576 | Arithmetic, except that the conversion is always rounded toward zero. 1577 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1578 | the conversion overflows, the largest integer with the same sign as `a' is 1579 | returned. 1580 *----------------------------------------------------------------------------*/ 1581 1582 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status) 1583 { 1584 flag aSign; 1585 int aExp; 1586 int shiftCount; 1587 uint32_t aSig; 1588 int32_t z; 1589 a = float32_squash_input_denormal(a, status); 1590 1591 aSig = extractFloat32Frac( a ); 1592 aExp = extractFloat32Exp( a ); 1593 aSign = extractFloat32Sign( a ); 1594 shiftCount = aExp - 0x9E; 1595 if ( 0 <= shiftCount ) { 1596 if ( float32_val(a) != 0xCF000000 ) { 1597 float_raise(float_flag_invalid, status); 1598 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; 1599 } 1600 return (int32_t) 0x80000000; 1601 } 1602 else if ( aExp <= 0x7E ) { 1603 if (aExp | aSig) { 1604 status->float_exception_flags |= float_flag_inexact; 1605 } 1606 return 0; 1607 } 1608 aSig = ( aSig | 0x00800000 )<<8; 1609 z = aSig>>( - shiftCount ); 1610 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1611 status->float_exception_flags |= float_flag_inexact; 1612 } 1613 if ( aSign ) z = - z; 1614 return z; 1615 1616 } 1617 1618 /*---------------------------------------------------------------------------- 1619 | Returns the result of converting the single-precision floating-point value 1620 | `a' to the 16-bit two's complement integer format. The conversion is 1621 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1622 | Arithmetic, except that the conversion is always rounded toward zero. 1623 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 1624 | the conversion overflows, the largest integer with the same sign as `a' is 1625 | returned. 1626 *----------------------------------------------------------------------------*/ 1627 1628 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status) 1629 { 1630 flag aSign; 1631 int aExp; 1632 int shiftCount; 1633 uint32_t aSig; 1634 int32_t z; 1635 1636 aSig = extractFloat32Frac( a ); 1637 aExp = extractFloat32Exp( a ); 1638 aSign = extractFloat32Sign( a ); 1639 shiftCount = aExp - 0x8E; 1640 if ( 0 <= shiftCount ) { 1641 if ( float32_val(a) != 0xC7000000 ) { 1642 float_raise(float_flag_invalid, status); 1643 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1644 return 0x7FFF; 1645 } 1646 } 1647 return (int32_t) 0xffff8000; 1648 } 1649 else if ( aExp <= 0x7E ) { 1650 if ( aExp | aSig ) { 1651 status->float_exception_flags |= float_flag_inexact; 1652 } 1653 return 0; 1654 } 1655 shiftCount -= 0x10; 1656 aSig = ( aSig | 0x00800000 )<<8; 1657 z = aSig>>( - shiftCount ); 1658 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) { 1659 status->float_exception_flags |= float_flag_inexact; 1660 } 1661 if ( aSign ) { 1662 z = - z; 1663 } 1664 return z; 1665 1666 } 1667 1668 /*---------------------------------------------------------------------------- 1669 | Returns the result of converting the single-precision floating-point value 1670 | `a' to the 64-bit two's complement integer format. The conversion is 1671 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1672 | Arithmetic---which means in particular that the conversion is rounded 1673 | according to the current rounding mode. If `a' is a NaN, the largest 1674 | positive integer is returned. Otherwise, if the conversion overflows, the 1675 | largest integer with the same sign as `a' is returned. 1676 *----------------------------------------------------------------------------*/ 1677 1678 int64_t float32_to_int64(float32 a, float_status *status) 1679 { 1680 flag aSign; 1681 int aExp; 1682 int shiftCount; 1683 uint32_t aSig; 1684 uint64_t aSig64, aSigExtra; 1685 a = float32_squash_input_denormal(a, status); 1686 1687 aSig = extractFloat32Frac( a ); 1688 aExp = extractFloat32Exp( a ); 1689 aSign = extractFloat32Sign( a ); 1690 shiftCount = 0xBE - aExp; 1691 if ( shiftCount < 0 ) { 1692 float_raise(float_flag_invalid, status); 1693 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1694 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1695 } 1696 return (int64_t) LIT64( 0x8000000000000000 ); 1697 } 1698 if ( aExp ) aSig |= 0x00800000; 1699 aSig64 = aSig; 1700 aSig64 <<= 40; 1701 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); 1702 return roundAndPackInt64(aSign, aSig64, aSigExtra, status); 1703 1704 } 1705 1706 /*---------------------------------------------------------------------------- 1707 | Returns the result of converting the single-precision floating-point value 1708 | `a' to the 64-bit unsigned integer format. The conversion is 1709 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1710 | Arithmetic---which means in particular that the conversion is rounded 1711 | according to the current rounding mode. If `a' is a NaN, the largest 1712 | unsigned integer is returned. Otherwise, if the conversion overflows, the 1713 | largest unsigned integer is returned. If the 'a' is negative, the result 1714 | is rounded and zero is returned; values that do not round to zero will 1715 | raise the inexact exception flag. 
1716 *----------------------------------------------------------------------------*/ 1717 1718 uint64_t float32_to_uint64(float32 a, float_status *status) 1719 { 1720 flag aSign; 1721 int aExp; 1722 int shiftCount; 1723 uint32_t aSig; 1724 uint64_t aSig64, aSigExtra; 1725 a = float32_squash_input_denormal(a, status); 1726 1727 aSig = extractFloat32Frac(a); 1728 aExp = extractFloat32Exp(a); 1729 aSign = extractFloat32Sign(a); 1730 if ((aSign) && (aExp > 126)) { 1731 float_raise(float_flag_invalid, status); 1732 if (float32_is_any_nan(a)) { 1733 return LIT64(0xFFFFFFFFFFFFFFFF); 1734 } else { 1735 return 0; 1736 } 1737 } 1738 shiftCount = 0xBE - aExp; 1739 if (aExp) { 1740 aSig |= 0x00800000; 1741 } 1742 if (shiftCount < 0) { 1743 float_raise(float_flag_invalid, status); 1744 return LIT64(0xFFFFFFFFFFFFFFFF); 1745 } 1746 1747 aSig64 = aSig; 1748 aSig64 <<= 40; 1749 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra); 1750 return roundAndPackUint64(aSign, aSig64, aSigExtra, status); 1751 } 1752 1753 /*---------------------------------------------------------------------------- 1754 | Returns the result of converting the single-precision floating-point value 1755 | `a' to the 64-bit unsigned integer format. The conversion is 1756 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1757 | Arithmetic, except that the conversion is always rounded toward zero. If 1758 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the 1759 | conversion overflows, the largest unsigned integer is returned. If the 1760 | 'a' is negative, the result is rounded and zero is returned; values that do 1761 | not round to zero will raise the inexact flag. 
1762 *----------------------------------------------------------------------------*/ 1763 1764 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status) 1765 { 1766 signed char current_rounding_mode = status->float_rounding_mode; 1767 set_float_rounding_mode(float_round_to_zero, status); 1768 int64_t v = float32_to_uint64(a, status); 1769 set_float_rounding_mode(current_rounding_mode, status); 1770 return v; 1771 } 1772 1773 /*---------------------------------------------------------------------------- 1774 | Returns the result of converting the single-precision floating-point value 1775 | `a' to the 64-bit two's complement integer format. The conversion is 1776 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1777 | Arithmetic, except that the conversion is always rounded toward zero. If 1778 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 1779 | conversion overflows, the largest integer with the same sign as `a' is 1780 | returned. 1781 *----------------------------------------------------------------------------*/ 1782 1783 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status) 1784 { 1785 flag aSign; 1786 int aExp; 1787 int shiftCount; 1788 uint32_t aSig; 1789 uint64_t aSig64; 1790 int64_t z; 1791 a = float32_squash_input_denormal(a, status); 1792 1793 aSig = extractFloat32Frac( a ); 1794 aExp = extractFloat32Exp( a ); 1795 aSign = extractFloat32Sign( a ); 1796 shiftCount = aExp - 0xBE; 1797 if ( 0 <= shiftCount ) { 1798 if ( float32_val(a) != 0xDF000000 ) { 1799 float_raise(float_flag_invalid, status); 1800 if ( ! 
aSign || ( ( aExp == 0xFF ) && aSig ) ) { 1801 return LIT64( 0x7FFFFFFFFFFFFFFF ); 1802 } 1803 } 1804 return (int64_t) LIT64( 0x8000000000000000 ); 1805 } 1806 else if ( aExp <= 0x7E ) { 1807 if (aExp | aSig) { 1808 status->float_exception_flags |= float_flag_inexact; 1809 } 1810 return 0; 1811 } 1812 aSig64 = aSig | 0x00800000; 1813 aSig64 <<= 40; 1814 z = aSig64>>( - shiftCount ); 1815 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) { 1816 status->float_exception_flags |= float_flag_inexact; 1817 } 1818 if ( aSign ) z = - z; 1819 return z; 1820 1821 } 1822 1823 /*---------------------------------------------------------------------------- 1824 | Returns the result of converting the single-precision floating-point value 1825 | `a' to the double-precision floating-point format. The conversion is 1826 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1827 | Arithmetic. 1828 *----------------------------------------------------------------------------*/ 1829 1830 float64 float32_to_float64(float32 a, float_status *status) 1831 { 1832 flag aSign; 1833 int aExp; 1834 uint32_t aSig; 1835 a = float32_squash_input_denormal(a, status); 1836 1837 aSig = extractFloat32Frac( a ); 1838 aExp = extractFloat32Exp( a ); 1839 aSign = extractFloat32Sign( a ); 1840 if ( aExp == 0xFF ) { 1841 if (aSig) { 1842 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 1843 } 1844 return packFloat64( aSign, 0x7FF, 0 ); 1845 } 1846 if ( aExp == 0 ) { 1847 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 1848 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1849 --aExp; 1850 } 1851 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 1852 1853 } 1854 1855 /*---------------------------------------------------------------------------- 1856 | Returns the result of converting the single-precision floating-point value 1857 | `a' to the extended double-precision floating-point format. 
The conversion 1858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 1859 | Arithmetic. 1860 *----------------------------------------------------------------------------*/ 1861 1862 floatx80 float32_to_floatx80(float32 a, float_status *status) 1863 { 1864 flag aSign; 1865 int aExp; 1866 uint32_t aSig; 1867 1868 a = float32_squash_input_denormal(a, status); 1869 aSig = extractFloat32Frac( a ); 1870 aExp = extractFloat32Exp( a ); 1871 aSign = extractFloat32Sign( a ); 1872 if ( aExp == 0xFF ) { 1873 if (aSig) { 1874 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 1875 } 1876 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 1877 } 1878 if ( aExp == 0 ) { 1879 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 1880 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1881 } 1882 aSig |= 0x00800000; 1883 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 1884 1885 } 1886 1887 /*---------------------------------------------------------------------------- 1888 | Returns the result of converting the single-precision floating-point value 1889 | `a' to the double-precision floating-point format. The conversion is 1890 | performed according to the IEC/IEEE Standard for Binary Floating-Point 1891 | Arithmetic. 
1892 *----------------------------------------------------------------------------*/ 1893 1894 float128 float32_to_float128(float32 a, float_status *status) 1895 { 1896 flag aSign; 1897 int aExp; 1898 uint32_t aSig; 1899 1900 a = float32_squash_input_denormal(a, status); 1901 aSig = extractFloat32Frac( a ); 1902 aExp = extractFloat32Exp( a ); 1903 aSign = extractFloat32Sign( a ); 1904 if ( aExp == 0xFF ) { 1905 if (aSig) { 1906 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 1907 } 1908 return packFloat128( aSign, 0x7FFF, 0, 0 ); 1909 } 1910 if ( aExp == 0 ) { 1911 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 1912 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 1913 --aExp; 1914 } 1915 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 1916 1917 } 1918 1919 /*---------------------------------------------------------------------------- 1920 | Rounds the single-precision floating-point value `a' to an integer, and 1921 | returns the result as a single-precision floating-point value. The 1922 | operation is performed according to the IEC/IEEE Standard for Binary 1923 | Floating-Point Arithmetic. 
1924 *----------------------------------------------------------------------------*/ 1925 1926 float32 float32_round_to_int(float32 a, float_status *status) 1927 { 1928 flag aSign; 1929 int aExp; 1930 uint32_t lastBitMask, roundBitsMask; 1931 uint32_t z; 1932 a = float32_squash_input_denormal(a, status); 1933 1934 aExp = extractFloat32Exp( a ); 1935 if ( 0x96 <= aExp ) { 1936 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { 1937 return propagateFloat32NaN(a, a, status); 1938 } 1939 return a; 1940 } 1941 if ( aExp <= 0x7E ) { 1942 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a; 1943 status->float_exception_flags |= float_flag_inexact; 1944 aSign = extractFloat32Sign( a ); 1945 switch (status->float_rounding_mode) { 1946 case float_round_nearest_even: 1947 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { 1948 return packFloat32( aSign, 0x7F, 0 ); 1949 } 1950 break; 1951 case float_round_ties_away: 1952 if (aExp == 0x7E) { 1953 return packFloat32(aSign, 0x7F, 0); 1954 } 1955 break; 1956 case float_round_down: 1957 return make_float32(aSign ? 0xBF800000 : 0); 1958 case float_round_up: 1959 return make_float32(aSign ? 
0x80000000 : 0x3F800000); 1960 } 1961 return packFloat32( aSign, 0, 0 ); 1962 } 1963 lastBitMask = 1; 1964 lastBitMask <<= 0x96 - aExp; 1965 roundBitsMask = lastBitMask - 1; 1966 z = float32_val(a); 1967 switch (status->float_rounding_mode) { 1968 case float_round_nearest_even: 1969 z += lastBitMask>>1; 1970 if ((z & roundBitsMask) == 0) { 1971 z &= ~lastBitMask; 1972 } 1973 break; 1974 case float_round_ties_away: 1975 z += lastBitMask >> 1; 1976 break; 1977 case float_round_to_zero: 1978 break; 1979 case float_round_up: 1980 if (!extractFloat32Sign(make_float32(z))) { 1981 z += roundBitsMask; 1982 } 1983 break; 1984 case float_round_down: 1985 if (extractFloat32Sign(make_float32(z))) { 1986 z += roundBitsMask; 1987 } 1988 break; 1989 default: 1990 abort(); 1991 } 1992 z &= ~ roundBitsMask; 1993 if (z != float32_val(a)) { 1994 status->float_exception_flags |= float_flag_inexact; 1995 } 1996 return make_float32(z); 1997 1998 } 1999 2000 /*---------------------------------------------------------------------------- 2001 | Returns the result of adding the absolute values of the single-precision 2002 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 2003 | before being returned. `zSign' is ignored if the result is a NaN. 2004 | The addition is performed according to the IEC/IEEE Standard for Binary 2005 | Floating-Point Arithmetic. 
2006 *----------------------------------------------------------------------------*/ 2007 2008 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign, 2009 float_status *status) 2010 { 2011 int aExp, bExp, zExp; 2012 uint32_t aSig, bSig, zSig; 2013 int expDiff; 2014 2015 aSig = extractFloat32Frac( a ); 2016 aExp = extractFloat32Exp( a ); 2017 bSig = extractFloat32Frac( b ); 2018 bExp = extractFloat32Exp( b ); 2019 expDiff = aExp - bExp; 2020 aSig <<= 6; 2021 bSig <<= 6; 2022 if ( 0 < expDiff ) { 2023 if ( aExp == 0xFF ) { 2024 if (aSig) { 2025 return propagateFloat32NaN(a, b, status); 2026 } 2027 return a; 2028 } 2029 if ( bExp == 0 ) { 2030 --expDiff; 2031 } 2032 else { 2033 bSig |= 0x20000000; 2034 } 2035 shift32RightJamming( bSig, expDiff, &bSig ); 2036 zExp = aExp; 2037 } 2038 else if ( expDiff < 0 ) { 2039 if ( bExp == 0xFF ) { 2040 if (bSig) { 2041 return propagateFloat32NaN(a, b, status); 2042 } 2043 return packFloat32( zSign, 0xFF, 0 ); 2044 } 2045 if ( aExp == 0 ) { 2046 ++expDiff; 2047 } 2048 else { 2049 aSig |= 0x20000000; 2050 } 2051 shift32RightJamming( aSig, - expDiff, &aSig ); 2052 zExp = bExp; 2053 } 2054 else { 2055 if ( aExp == 0xFF ) { 2056 if (aSig | bSig) { 2057 return propagateFloat32NaN(a, b, status); 2058 } 2059 return a; 2060 } 2061 if ( aExp == 0 ) { 2062 if (status->flush_to_zero) { 2063 if (aSig | bSig) { 2064 float_raise(float_flag_output_denormal, status); 2065 } 2066 return packFloat32(zSign, 0, 0); 2067 } 2068 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); 2069 } 2070 zSig = 0x40000000 + aSig + bSig; 2071 zExp = aExp; 2072 goto roundAndPack; 2073 } 2074 aSig |= 0x20000000; 2075 zSig = ( aSig + bSig )<<1; 2076 --zExp; 2077 if ( (int32_t) zSig < 0 ) { 2078 zSig = aSig + bSig; 2079 ++zExp; 2080 } 2081 roundAndPack: 2082 return roundAndPackFloat32(zSign, zExp, zSig, status); 2083 2084 } 2085 2086 /*---------------------------------------------------------------------------- 2087 | Returns the result of subtracting the 
absolute values of the single- 2088 | precision floating-point values `a' and `b'. If `zSign' is 1, the 2089 | difference is negated before being returned. `zSign' is ignored if the 2090 | result is a NaN. The subtraction is performed according to the IEC/IEEE 2091 | Standard for Binary Floating-Point Arithmetic. 2092 *----------------------------------------------------------------------------*/ 2093 2094 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign, 2095 float_status *status) 2096 { 2097 int aExp, bExp, zExp; 2098 uint32_t aSig, bSig, zSig; 2099 int expDiff; 2100 2101 aSig = extractFloat32Frac( a ); 2102 aExp = extractFloat32Exp( a ); 2103 bSig = extractFloat32Frac( b ); 2104 bExp = extractFloat32Exp( b ); 2105 expDiff = aExp - bExp; 2106 aSig <<= 7; 2107 bSig <<= 7; 2108 if ( 0 < expDiff ) goto aExpBigger; 2109 if ( expDiff < 0 ) goto bExpBigger; 2110 if ( aExp == 0xFF ) { 2111 if (aSig | bSig) { 2112 return propagateFloat32NaN(a, b, status); 2113 } 2114 float_raise(float_flag_invalid, status); 2115 return float32_default_nan(status); 2116 } 2117 if ( aExp == 0 ) { 2118 aExp = 1; 2119 bExp = 1; 2120 } 2121 if ( bSig < aSig ) goto aBigger; 2122 if ( aSig < bSig ) goto bBigger; 2123 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0); 2124 bExpBigger: 2125 if ( bExp == 0xFF ) { 2126 if (bSig) { 2127 return propagateFloat32NaN(a, b, status); 2128 } 2129 return packFloat32( zSign ^ 1, 0xFF, 0 ); 2130 } 2131 if ( aExp == 0 ) { 2132 ++expDiff; 2133 } 2134 else { 2135 aSig |= 0x40000000; 2136 } 2137 shift32RightJamming( aSig, - expDiff, &aSig ); 2138 bSig |= 0x40000000; 2139 bBigger: 2140 zSig = bSig - aSig; 2141 zExp = bExp; 2142 zSign ^= 1; 2143 goto normalizeRoundAndPack; 2144 aExpBigger: 2145 if ( aExp == 0xFF ) { 2146 if (aSig) { 2147 return propagateFloat32NaN(a, b, status); 2148 } 2149 return a; 2150 } 2151 if ( bExp == 0 ) { 2152 --expDiff; 2153 } 2154 else { 2155 bSig |= 0x40000000; 2156 } 2157 shift32RightJamming( bSig, 
expDiff, &bSig ); 2158 aSig |= 0x40000000; 2159 aBigger: 2160 zSig = aSig - bSig; 2161 zExp = aExp; 2162 normalizeRoundAndPack: 2163 --zExp; 2164 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status); 2165 2166 } 2167 2168 /*---------------------------------------------------------------------------- 2169 | Returns the result of adding the single-precision floating-point values `a' 2170 | and `b'. The operation is performed according to the IEC/IEEE Standard for 2171 | Binary Floating-Point Arithmetic. 2172 *----------------------------------------------------------------------------*/ 2173 2174 float32 float32_add(float32 a, float32 b, float_status *status) 2175 { 2176 flag aSign, bSign; 2177 a = float32_squash_input_denormal(a, status); 2178 b = float32_squash_input_denormal(b, status); 2179 2180 aSign = extractFloat32Sign( a ); 2181 bSign = extractFloat32Sign( b ); 2182 if ( aSign == bSign ) { 2183 return addFloat32Sigs(a, b, aSign, status); 2184 } 2185 else { 2186 return subFloat32Sigs(a, b, aSign, status); 2187 } 2188 2189 } 2190 2191 /*---------------------------------------------------------------------------- 2192 | Returns the result of subtracting the single-precision floating-point values 2193 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2194 | for Binary Floating-Point Arithmetic. 
2195 *----------------------------------------------------------------------------*/ 2196 2197 float32 float32_sub(float32 a, float32 b, float_status *status) 2198 { 2199 flag aSign, bSign; 2200 a = float32_squash_input_denormal(a, status); 2201 b = float32_squash_input_denormal(b, status); 2202 2203 aSign = extractFloat32Sign( a ); 2204 bSign = extractFloat32Sign( b ); 2205 if ( aSign == bSign ) { 2206 return subFloat32Sigs(a, b, aSign, status); 2207 } 2208 else { 2209 return addFloat32Sigs(a, b, aSign, status); 2210 } 2211 2212 } 2213 2214 /*---------------------------------------------------------------------------- 2215 | Returns the result of multiplying the single-precision floating-point values 2216 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 2217 | for Binary Floating-Point Arithmetic. 2218 *----------------------------------------------------------------------------*/ 2219 2220 float32 float32_mul(float32 a, float32 b, float_status *status) 2221 { 2222 flag aSign, bSign, zSign; 2223 int aExp, bExp, zExp; 2224 uint32_t aSig, bSig; 2225 uint64_t zSig64; 2226 uint32_t zSig; 2227 2228 a = float32_squash_input_denormal(a, status); 2229 b = float32_squash_input_denormal(b, status); 2230 2231 aSig = extractFloat32Frac( a ); 2232 aExp = extractFloat32Exp( a ); 2233 aSign = extractFloat32Sign( a ); 2234 bSig = extractFloat32Frac( b ); 2235 bExp = extractFloat32Exp( b ); 2236 bSign = extractFloat32Sign( b ); 2237 zSign = aSign ^ bSign; 2238 if ( aExp == 0xFF ) { 2239 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2240 return propagateFloat32NaN(a, b, status); 2241 } 2242 if ( ( bExp | bSig ) == 0 ) { 2243 float_raise(float_flag_invalid, status); 2244 return float32_default_nan(status); 2245 } 2246 return packFloat32( zSign, 0xFF, 0 ); 2247 } 2248 if ( bExp == 0xFF ) { 2249 if (bSig) { 2250 return propagateFloat32NaN(a, b, status); 2251 } 2252 if ( ( aExp | aSig ) == 0 ) { 2253 float_raise(float_flag_invalid, status); 2254 return 
float32_default_nan(status); 2255 } 2256 return packFloat32( zSign, 0xFF, 0 ); 2257 } 2258 if ( aExp == 0 ) { 2259 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2260 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2261 } 2262 if ( bExp == 0 ) { 2263 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); 2264 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2265 } 2266 zExp = aExp + bExp - 0x7F; 2267 aSig = ( aSig | 0x00800000 )<<7; 2268 bSig = ( bSig | 0x00800000 )<<8; 2269 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 ); 2270 zSig = zSig64; 2271 if ( 0 <= (int32_t) ( zSig<<1 ) ) { 2272 zSig <<= 1; 2273 --zExp; 2274 } 2275 return roundAndPackFloat32(zSign, zExp, zSig, status); 2276 2277 } 2278 2279 /*---------------------------------------------------------------------------- 2280 | Returns the result of dividing the single-precision floating-point value `a' 2281 | by the corresponding value `b'. The operation is performed according to the 2282 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2283 *----------------------------------------------------------------------------*/ 2284 2285 float32 float32_div(float32 a, float32 b, float_status *status) 2286 { 2287 flag aSign, bSign, zSign; 2288 int aExp, bExp, zExp; 2289 uint32_t aSig, bSig, zSig; 2290 a = float32_squash_input_denormal(a, status); 2291 b = float32_squash_input_denormal(b, status); 2292 2293 aSig = extractFloat32Frac( a ); 2294 aExp = extractFloat32Exp( a ); 2295 aSign = extractFloat32Sign( a ); 2296 bSig = extractFloat32Frac( b ); 2297 bExp = extractFloat32Exp( b ); 2298 bSign = extractFloat32Sign( b ); 2299 zSign = aSign ^ bSign; 2300 if ( aExp == 0xFF ) { 2301 if (aSig) { 2302 return propagateFloat32NaN(a, b, status); 2303 } 2304 if ( bExp == 0xFF ) { 2305 if (bSig) { 2306 return propagateFloat32NaN(a, b, status); 2307 } 2308 float_raise(float_flag_invalid, status); 2309 return float32_default_nan(status); 2310 } 2311 return packFloat32( zSign, 0xFF, 0 ); 2312 } 2313 if ( bExp == 0xFF ) { 2314 if (bSig) { 2315 return propagateFloat32NaN(a, b, status); 2316 } 2317 return packFloat32( zSign, 0, 0 ); 2318 } 2319 if ( bExp == 0 ) { 2320 if ( bSig == 0 ) { 2321 if ( ( aExp | aSig ) == 0 ) { 2322 float_raise(float_flag_invalid, status); 2323 return float32_default_nan(status); 2324 } 2325 float_raise(float_flag_divbyzero, status); 2326 return packFloat32( zSign, 0xFF, 0 ); 2327 } 2328 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2329 } 2330 if ( aExp == 0 ) { 2331 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); 2332 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2333 } 2334 zExp = aExp - bExp + 0x7D; 2335 aSig = ( aSig | 0x00800000 )<<7; 2336 bSig = ( bSig | 0x00800000 )<<8; 2337 if ( bSig <= ( aSig + aSig ) ) { 2338 aSig >>= 1; 2339 ++zExp; 2340 } 2341 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig; 2342 if ( ( zSig & 0x3F ) == 0 ) { 2343 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 ); 2344 } 2345 return roundAndPackFloat32(zSign, zExp, zSig, status); 2346 2347 } 2348 2349 
/*---------------------------------------------------------------------------- 2350 | Returns the remainder of the single-precision floating-point value `a' 2351 | with respect to the corresponding value `b'. The operation is performed 2352 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2353 *----------------------------------------------------------------------------*/ 2354 2355 float32 float32_rem(float32 a, float32 b, float_status *status) 2356 { 2357 flag aSign, zSign; 2358 int aExp, bExp, expDiff; 2359 uint32_t aSig, bSig; 2360 uint32_t q; 2361 uint64_t aSig64, bSig64, q64; 2362 uint32_t alternateASig; 2363 int32_t sigMean; 2364 a = float32_squash_input_denormal(a, status); 2365 b = float32_squash_input_denormal(b, status); 2366 2367 aSig = extractFloat32Frac( a ); 2368 aExp = extractFloat32Exp( a ); 2369 aSign = extractFloat32Sign( a ); 2370 bSig = extractFloat32Frac( b ); 2371 bExp = extractFloat32Exp( b ); 2372 if ( aExp == 0xFF ) { 2373 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 2374 return propagateFloat32NaN(a, b, status); 2375 } 2376 float_raise(float_flag_invalid, status); 2377 return float32_default_nan(status); 2378 } 2379 if ( bExp == 0xFF ) { 2380 if (bSig) { 2381 return propagateFloat32NaN(a, b, status); 2382 } 2383 return a; 2384 } 2385 if ( bExp == 0 ) { 2386 if ( bSig == 0 ) { 2387 float_raise(float_flag_invalid, status); 2388 return float32_default_nan(status); 2389 } 2390 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 2391 } 2392 if ( aExp == 0 ) { 2393 if ( aSig == 0 ) return a; 2394 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2395 } 2396 expDiff = aExp - bExp; 2397 aSig |= 0x00800000; 2398 bSig |= 0x00800000; 2399 if ( expDiff < 32 ) { 2400 aSig <<= 8; 2401 bSig <<= 8; 2402 if ( expDiff < 0 ) { 2403 if ( expDiff < -1 ) return a; 2404 aSig >>= 1; 2405 } 2406 q = ( bSig <= aSig ); 2407 if ( q ) aSig -= bSig; 2408 if ( 0 < expDiff ) { 2409 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 2410 q >>= 32 - expDiff; 
2411 bSig >>= 2; 2412 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 2413 } 2414 else { 2415 aSig >>= 2; 2416 bSig >>= 2; 2417 } 2418 } 2419 else { 2420 if ( bSig <= aSig ) aSig -= bSig; 2421 aSig64 = ( (uint64_t) aSig )<<40; 2422 bSig64 = ( (uint64_t) bSig )<<40; 2423 expDiff -= 64; 2424 while ( 0 < expDiff ) { 2425 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2426 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2427 aSig64 = - ( ( bSig * q64 )<<38 ); 2428 expDiff -= 62; 2429 } 2430 expDiff += 64; 2431 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 2432 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 2433 q = q64>>( 64 - expDiff ); 2434 bSig <<= 6; 2435 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 2436 } 2437 do { 2438 alternateASig = aSig; 2439 ++q; 2440 aSig -= bSig; 2441 } while ( 0 <= (int32_t) aSig ); 2442 sigMean = aSig + alternateASig; 2443 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 2444 aSig = alternateASig; 2445 } 2446 zSign = ( (int32_t) aSig < 0 ); 2447 if ( zSign ) aSig = - aSig; 2448 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 2449 } 2450 2451 /*---------------------------------------------------------------------------- 2452 | Returns the result of multiplying the single-precision floating-point values 2453 | `a' and `b' then adding 'c', with no intermediate rounding step after the 2454 | multiplication. The operation is performed according to the IEC/IEEE 2455 | Standard for Binary Floating-Point Arithmetic 754-2008. 2456 | The flags argument allows the caller to select negation of the 2457 | addend, the intermediate product, or the final result. (The difference 2458 | between this and having the caller do a separate negation is that negating 2459 | externally will flip the sign bit on NaNs.) 
2460 *----------------------------------------------------------------------------*/ 2461 2462 float32 float32_muladd(float32 a, float32 b, float32 c, int flags, 2463 float_status *status) 2464 { 2465 flag aSign, bSign, cSign, zSign; 2466 int aExp, bExp, cExp, pExp, zExp, expDiff; 2467 uint32_t aSig, bSig, cSig; 2468 flag pInf, pZero, pSign; 2469 uint64_t pSig64, cSig64, zSig64; 2470 uint32_t pSig; 2471 int shiftcount; 2472 flag signflip, infzero; 2473 2474 a = float32_squash_input_denormal(a, status); 2475 b = float32_squash_input_denormal(b, status); 2476 c = float32_squash_input_denormal(c, status); 2477 aSig = extractFloat32Frac(a); 2478 aExp = extractFloat32Exp(a); 2479 aSign = extractFloat32Sign(a); 2480 bSig = extractFloat32Frac(b); 2481 bExp = extractFloat32Exp(b); 2482 bSign = extractFloat32Sign(b); 2483 cSig = extractFloat32Frac(c); 2484 cExp = extractFloat32Exp(c); 2485 cSign = extractFloat32Sign(c); 2486 2487 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) || 2488 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0)); 2489 2490 /* It is implementation-defined whether the cases of (0,inf,qnan) 2491 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 2492 * they return if they do), so we have to hand this information 2493 * off to the target-specific pick-a-NaN routine. 2494 */ 2495 if (((aExp == 0xff) && aSig) || 2496 ((bExp == 0xff) && bSig) || 2497 ((cExp == 0xff) && cSig)) { 2498 return propagateFloat32MulAddNaN(a, b, c, infzero, status); 2499 } 2500 2501 if (infzero) { 2502 float_raise(float_flag_invalid, status); 2503 return float32_default_nan(status); 2504 } 2505 2506 if (flags & float_muladd_negate_c) { 2507 cSign ^= 1; 2508 } 2509 2510 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 2511 2512 /* Work out the sign and type of the product */ 2513 pSign = aSign ^ bSign; 2514 if (flags & float_muladd_negate_product) { 2515 pSign ^= 1; 2516 } 2517 pInf = (aExp == 0xff) || (bExp == 0xff); 2518 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 2519 2520 if (cExp == 0xff) { 2521 if (pInf && (pSign ^ cSign)) { 2522 /* addition of opposite-signed infinities => InvalidOperation */ 2523 float_raise(float_flag_invalid, status); 2524 return float32_default_nan(status); 2525 } 2526 /* Otherwise generate an infinity of the same sign */ 2527 return packFloat32(cSign ^ signflip, 0xff, 0); 2528 } 2529 2530 if (pInf) { 2531 return packFloat32(pSign ^ signflip, 0xff, 0); 2532 } 2533 2534 if (pZero) { 2535 if (cExp == 0) { 2536 if (cSig == 0) { 2537 /* Adding two exact zeroes */ 2538 if (pSign == cSign) { 2539 zSign = pSign; 2540 } else if (status->float_rounding_mode == float_round_down) { 2541 zSign = 1; 2542 } else { 2543 zSign = 0; 2544 } 2545 return packFloat32(zSign ^ signflip, 0, 0); 2546 } 2547 /* Exact zero plus a denorm */ 2548 if (status->flush_to_zero) { 2549 float_raise(float_flag_output_denormal, status); 2550 return packFloat32(cSign ^ signflip, 0, 0); 2551 } 2552 } 2553 /* Zero plus something non-zero : just return the something */ 2554 if (flags & float_muladd_halve_result) { 2555 if (cExp == 0) { 2556 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2557 } 2558 /* Subtract one to halve, and one again because roundAndPackFloat32 2559 * wants one less than the true exponent. 2560 */ 2561 cExp -= 2; 2562 cSig = (cSig | 0x00800000) << 7; 2563 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status); 2564 } 2565 return packFloat32(cSign ^ signflip, cExp, cSig); 2566 } 2567 2568 if (aExp == 0) { 2569 normalizeFloat32Subnormal(aSig, &aExp, &aSig); 2570 } 2571 if (bExp == 0) { 2572 normalizeFloat32Subnormal(bSig, &bExp, &bSig); 2573 } 2574 2575 /* Calculate the actual result a * b + c */ 2576 2577 /* Multiply first; this is easy. 
*/ 2578 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f 2579 * because we want the true exponent, not the "one-less-than" 2580 * flavour that roundAndPackFloat32() takes. 2581 */ 2582 pExp = aExp + bExp - 0x7e; 2583 aSig = (aSig | 0x00800000) << 7; 2584 bSig = (bSig | 0x00800000) << 8; 2585 pSig64 = (uint64_t)aSig * bSig; 2586 if ((int64_t)(pSig64 << 1) >= 0) { 2587 pSig64 <<= 1; 2588 pExp--; 2589 } 2590 2591 zSign = pSign ^ signflip; 2592 2593 /* Now pSig64 is the significand of the multiply, with the explicit bit in 2594 * position 62. 2595 */ 2596 if (cExp == 0) { 2597 if (!cSig) { 2598 /* Throw out the special case of c being an exact zero now */ 2599 shift64RightJamming(pSig64, 32, &pSig64); 2600 pSig = pSig64; 2601 if (flags & float_muladd_halve_result) { 2602 pExp--; 2603 } 2604 return roundAndPackFloat32(zSign, pExp - 1, 2605 pSig, status); 2606 } 2607 normalizeFloat32Subnormal(cSig, &cExp, &cSig); 2608 } 2609 2610 cSig64 = (uint64_t)cSig << (62 - 23); 2611 cSig64 |= LIT64(0x4000000000000000); 2612 expDiff = pExp - cExp; 2613 2614 if (pSign == cSign) { 2615 /* Addition */ 2616 if (expDiff > 0) { 2617 /* scale c to match p */ 2618 shift64RightJamming(cSig64, expDiff, &cSig64); 2619 zExp = pExp; 2620 } else if (expDiff < 0) { 2621 /* scale p to match c */ 2622 shift64RightJamming(pSig64, -expDiff, &pSig64); 2623 zExp = cExp; 2624 } else { 2625 /* no scaling needed */ 2626 zExp = cExp; 2627 } 2628 /* Add significands and make sure explicit bit ends up in posn 62 */ 2629 zSig64 = pSig64 + cSig64; 2630 if ((int64_t)zSig64 < 0) { 2631 shift64RightJamming(zSig64, 1, &zSig64); 2632 } else { 2633 zExp--; 2634 } 2635 } else { 2636 /* Subtraction */ 2637 if (expDiff > 0) { 2638 shift64RightJamming(cSig64, expDiff, &cSig64); 2639 zSig64 = pSig64 - cSig64; 2640 zExp = pExp; 2641 } else if (expDiff < 0) { 2642 shift64RightJamming(pSig64, -expDiff, &pSig64); 2643 zSig64 = cSig64 - pSig64; 2644 zExp = cExp; 2645 zSign ^= 1; 2646 } else { 2647 zExp = pExp; 2648 
if (cSig64 < pSig64) { 2649 zSig64 = pSig64 - cSig64; 2650 } else if (pSig64 < cSig64) { 2651 zSig64 = cSig64 - pSig64; 2652 zSign ^= 1; 2653 } else { 2654 /* Exact zero */ 2655 zSign = signflip; 2656 if (status->float_rounding_mode == float_round_down) { 2657 zSign ^= 1; 2658 } 2659 return packFloat32(zSign, 0, 0); 2660 } 2661 } 2662 --zExp; 2663 /* Normalize to put the explicit bit back into bit 62. */ 2664 shiftcount = countLeadingZeros64(zSig64) - 1; 2665 zSig64 <<= shiftcount; 2666 zExp -= shiftcount; 2667 } 2668 if (flags & float_muladd_halve_result) { 2669 zExp--; 2670 } 2671 2672 shift64RightJamming(zSig64, 32, &zSig64); 2673 return roundAndPackFloat32(zSign, zExp, zSig64, status); 2674 } 2675 2676 2677 /*---------------------------------------------------------------------------- 2678 | Returns the square root of the single-precision floating-point value `a'. 2679 | The operation is performed according to the IEC/IEEE Standard for Binary 2680 | Floating-Point Arithmetic. 2681 *----------------------------------------------------------------------------*/ 2682 2683 float32 float32_sqrt(float32 a, float_status *status) 2684 { 2685 flag aSign; 2686 int aExp, zExp; 2687 uint32_t aSig, zSig; 2688 uint64_t rem, term; 2689 a = float32_squash_input_denormal(a, status); 2690 2691 aSig = extractFloat32Frac( a ); 2692 aExp = extractFloat32Exp( a ); 2693 aSign = extractFloat32Sign( a ); 2694 if ( aExp == 0xFF ) { 2695 if (aSig) { 2696 return propagateFloat32NaN(a, float32_zero, status); 2697 } 2698 if ( ! 
aSign ) return a; 2699 float_raise(float_flag_invalid, status); 2700 return float32_default_nan(status); 2701 } 2702 if ( aSign ) { 2703 if ( ( aExp | aSig ) == 0 ) return a; 2704 float_raise(float_flag_invalid, status); 2705 return float32_default_nan(status); 2706 } 2707 if ( aExp == 0 ) { 2708 if ( aSig == 0 ) return float32_zero; 2709 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2710 } 2711 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 2712 aSig = ( aSig | 0x00800000 )<<8; 2713 zSig = estimateSqrt32( aExp, aSig ) + 2; 2714 if ( ( zSig & 0x7F ) <= 5 ) { 2715 if ( zSig < 2 ) { 2716 zSig = 0x7FFFFFFF; 2717 goto roundAndPack; 2718 } 2719 aSig >>= aExp & 1; 2720 term = ( (uint64_t) zSig ) * zSig; 2721 rem = ( ( (uint64_t) aSig )<<32 ) - term; 2722 while ( (int64_t) rem < 0 ) { 2723 --zSig; 2724 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 2725 } 2726 zSig |= ( rem != 0 ); 2727 } 2728 shift32RightJamming( zSig, 1, &zSig ); 2729 roundAndPack: 2730 return roundAndPackFloat32(0, zExp, zSig, status); 2731 2732 } 2733 2734 /*---------------------------------------------------------------------------- 2735 | Returns the binary exponential of the single-precision floating-point value 2736 | `a'. The operation is performed according to the IEC/IEEE Standard for 2737 | Binary Floating-Point Arithmetic. 2738 | 2739 | Uses the following identities: 2740 | 2741 | 1. ------------------------------------------------------------------------- 2742 | x x*ln(2) 2743 | 2 = e 2744 | 2745 | 2. ------------------------------------------------------------------------- 2746 | 2 3 4 5 n 2747 | x x x x x x x 2748 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 2749 | 1! 2! 3! 4! 5! n! 
2750 *----------------------------------------------------------------------------*/ 2751 2752 static const float64 float32_exp2_coefficients[15] = 2753 { 2754 const_float64( 0x3ff0000000000000ll ), /* 1 */ 2755 const_float64( 0x3fe0000000000000ll ), /* 2 */ 2756 const_float64( 0x3fc5555555555555ll ), /* 3 */ 2757 const_float64( 0x3fa5555555555555ll ), /* 4 */ 2758 const_float64( 0x3f81111111111111ll ), /* 5 */ 2759 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 2760 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 2761 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 2762 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 2763 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 2764 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 2765 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 2766 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 2767 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 2768 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 2769 }; 2770 2771 float32 float32_exp2(float32 a, float_status *status) 2772 { 2773 flag aSign; 2774 int aExp; 2775 uint32_t aSig; 2776 float64 r, x, xn; 2777 int i; 2778 a = float32_squash_input_denormal(a, status); 2779 2780 aSig = extractFloat32Frac( a ); 2781 aExp = extractFloat32Exp( a ); 2782 aSign = extractFloat32Sign( a ); 2783 2784 if ( aExp == 0xFF) { 2785 if (aSig) { 2786 return propagateFloat32NaN(a, float32_zero, status); 2787 } 2788 return (aSign) ? 
float32_zero : a; 2789 } 2790 if (aExp == 0) { 2791 if (aSig == 0) return float32_one; 2792 } 2793 2794 float_raise(float_flag_inexact, status); 2795 2796 /* ******************************* */ 2797 /* using float64 for approximation */ 2798 /* ******************************* */ 2799 x = float32_to_float64(a, status); 2800 x = float64_mul(x, float64_ln2, status); 2801 2802 xn = x; 2803 r = float64_one; 2804 for (i = 0 ; i < 15 ; i++) { 2805 float64 f; 2806 2807 f = float64_mul(xn, float32_exp2_coefficients[i], status); 2808 r = float64_add(r, f, status); 2809 2810 xn = float64_mul(xn, x, status); 2811 } 2812 2813 return float64_to_float32(r, status); 2814 } 2815 2816 /*---------------------------------------------------------------------------- 2817 | Returns the binary log of the single-precision floating-point value `a'. 2818 | The operation is performed according to the IEC/IEEE Standard for Binary 2819 | Floating-Point Arithmetic. 2820 *----------------------------------------------------------------------------*/ 2821 float32 float32_log2(float32 a, float_status *status) 2822 { 2823 flag aSign, zSign; 2824 int aExp; 2825 uint32_t aSig, zSig, i; 2826 2827 a = float32_squash_input_denormal(a, status); 2828 aSig = extractFloat32Frac( a ); 2829 aExp = extractFloat32Exp( a ); 2830 aSign = extractFloat32Sign( a ); 2831 2832 if ( aExp == 0 ) { 2833 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 2834 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 2835 } 2836 if ( aSign ) { 2837 float_raise(float_flag_invalid, status); 2838 return float32_default_nan(status); 2839 } 2840 if ( aExp == 0xFF ) { 2841 if (aSig) { 2842 return propagateFloat32NaN(a, float32_zero, status); 2843 } 2844 return a; 2845 } 2846 2847 aExp -= 0x7F; 2848 aSig |= 0x00800000; 2849 zSign = aExp < 0; 2850 zSig = aExp << 23; 2851 2852 for (i = 1 << 22; i > 0; i >>= 1) { 2853 aSig = ( (uint64_t)aSig * aSig ) >> 23; 2854 if ( aSig & 0x01000000 ) { 2855 aSig >>= 1; 2856 zSig |= i; 2857 } 2858 } 2859 
2860 if ( zSign ) 2861 zSig = -zSig; 2862 2863 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 2864 } 2865 2866 /*---------------------------------------------------------------------------- 2867 | Returns 1 if the single-precision floating-point value `a' is equal to 2868 | the corresponding value `b', and 0 otherwise. The invalid exception is 2869 | raised if either operand is a NaN. Otherwise, the comparison is performed 2870 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2871 *----------------------------------------------------------------------------*/ 2872 2873 int float32_eq(float32 a, float32 b, float_status *status) 2874 { 2875 uint32_t av, bv; 2876 a = float32_squash_input_denormal(a, status); 2877 b = float32_squash_input_denormal(b, status); 2878 2879 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2880 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2881 ) { 2882 float_raise(float_flag_invalid, status); 2883 return 0; 2884 } 2885 av = float32_val(a); 2886 bv = float32_val(b); 2887 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2888 } 2889 2890 /*---------------------------------------------------------------------------- 2891 | Returns 1 if the single-precision floating-point value `a' is less than 2892 | or equal to the corresponding value `b', and 0 otherwise. The invalid 2893 | exception is raised if either operand is a NaN. The comparison is performed 2894 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2895 *----------------------------------------------------------------------------*/ 2896 2897 int float32_le(float32 a, float32 b, float_status *status) 2898 { 2899 flag aSign, bSign; 2900 uint32_t av, bv; 2901 a = float32_squash_input_denormal(a, status); 2902 b = float32_squash_input_denormal(b, status); 2903 2904 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2905 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2906 ) { 2907 float_raise(float_flag_invalid, status); 2908 return 0; 2909 } 2910 aSign = extractFloat32Sign( a ); 2911 bSign = extractFloat32Sign( b ); 2912 av = float32_val(a); 2913 bv = float32_val(b); 2914 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 2915 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 2916 2917 } 2918 2919 /*---------------------------------------------------------------------------- 2920 | Returns 1 if the single-precision floating-point value `a' is less than 2921 | the corresponding value `b', and 0 otherwise. The invalid exception is 2922 | raised if either operand is a NaN. The comparison is performed according 2923 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
2924 *----------------------------------------------------------------------------*/ 2925 2926 int float32_lt(float32 a, float32 b, float_status *status) 2927 { 2928 flag aSign, bSign; 2929 uint32_t av, bv; 2930 a = float32_squash_input_denormal(a, status); 2931 b = float32_squash_input_denormal(b, status); 2932 2933 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2934 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2935 ) { 2936 float_raise(float_flag_invalid, status); 2937 return 0; 2938 } 2939 aSign = extractFloat32Sign( a ); 2940 bSign = extractFloat32Sign( b ); 2941 av = float32_val(a); 2942 bv = float32_val(b); 2943 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 2944 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 2945 2946 } 2947 2948 /*---------------------------------------------------------------------------- 2949 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 2950 | be compared, and 0 otherwise. The invalid exception is raised if either 2951 | operand is a NaN. The comparison is performed according to the IEC/IEEE 2952 | Standard for Binary Floating-Point Arithmetic. 2953 *----------------------------------------------------------------------------*/ 2954 2955 int float32_unordered(float32 a, float32 b, float_status *status) 2956 { 2957 a = float32_squash_input_denormal(a, status); 2958 b = float32_squash_input_denormal(b, status); 2959 2960 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 2961 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 2962 ) { 2963 float_raise(float_flag_invalid, status); 2964 return 1; 2965 } 2966 return 0; 2967 } 2968 2969 /*---------------------------------------------------------------------------- 2970 | Returns 1 if the single-precision floating-point value `a' is equal to 2971 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 2972 | exception. 
The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int float32_eq_quiet(float32 a, float32 b, float_status *status)
{
    uint32_t a_bits, b_bits;

    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    /* A NaN has an all-ones exponent and a non-zero fraction. */
    if ((extractFloat32Exp(a) == 0xFF && extractFloat32Frac(a))
        || (extractFloat32Exp(b) == 0xFF && extractFloat32Frac(b))) {
        /* Only signaling NaNs raise the invalid exception here. */
        if (float32_is_signaling_nan(a, status)
            || float32_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    a_bits = float32_val(a);
    b_bits = float32_val(b);
    /* Identical encodings, or both operands are zeros of either sign. */
    return (a_bits == b_bits) || ((uint32_t)((a_bits | b_bits) << 1) == 0);
}

/*----------------------------------------------------------------------------
| Returns 1 if the single-precision floating-point value `a' is less than or
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
| cause an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2999 *----------------------------------------------------------------------------*/ 3000 3001 int float32_le_quiet(float32 a, float32 b, float_status *status) 3002 { 3003 flag aSign, bSign; 3004 uint32_t av, bv; 3005 a = float32_squash_input_denormal(a, status); 3006 b = float32_squash_input_denormal(b, status); 3007 3008 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3009 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3010 ) { 3011 if (float32_is_signaling_nan(a, status) 3012 || float32_is_signaling_nan(b, status)) { 3013 float_raise(float_flag_invalid, status); 3014 } 3015 return 0; 3016 } 3017 aSign = extractFloat32Sign( a ); 3018 bSign = extractFloat32Sign( b ); 3019 av = float32_val(a); 3020 bv = float32_val(b); 3021 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3022 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3023 3024 } 3025 3026 /*---------------------------------------------------------------------------- 3027 | Returns 1 if the single-precision floating-point value `a' is less than 3028 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3029 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3030 | Standard for Binary Floating-Point Arithmetic. 
3031 *----------------------------------------------------------------------------*/ 3032 3033 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3034 { 3035 flag aSign, bSign; 3036 uint32_t av, bv; 3037 a = float32_squash_input_denormal(a, status); 3038 b = float32_squash_input_denormal(b, status); 3039 3040 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3041 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3042 ) { 3043 if (float32_is_signaling_nan(a, status) 3044 || float32_is_signaling_nan(b, status)) { 3045 float_raise(float_flag_invalid, status); 3046 } 3047 return 0; 3048 } 3049 aSign = extractFloat32Sign( a ); 3050 bSign = extractFloat32Sign( b ); 3051 av = float32_val(a); 3052 bv = float32_val(b); 3053 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3054 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3055 3056 } 3057 3058 /*---------------------------------------------------------------------------- 3059 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3060 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3061 | comparison is performed according to the IEC/IEEE Standard for Binary 3062 | Floating-Point Arithmetic. 
3063 *----------------------------------------------------------------------------*/ 3064 3065 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3066 { 3067 a = float32_squash_input_denormal(a, status); 3068 b = float32_squash_input_denormal(b, status); 3069 3070 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3071 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3072 ) { 3073 if (float32_is_signaling_nan(a, status) 3074 || float32_is_signaling_nan(b, status)) { 3075 float_raise(float_flag_invalid, status); 3076 } 3077 return 1; 3078 } 3079 return 0; 3080 } 3081 3082 /*---------------------------------------------------------------------------- 3083 | Returns the result of converting the double-precision floating-point value 3084 | `a' to the 32-bit two's complement integer format. The conversion is 3085 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3086 | Arithmetic---which means in particular that the conversion is rounded 3087 | according to the current rounding mode. If `a' is a NaN, the largest 3088 | positive integer is returned. Otherwise, if the conversion overflows, the 3089 | largest integer with the same sign as `a' is returned. 
3090 *----------------------------------------------------------------------------*/ 3091 3092 int32_t float64_to_int32(float64 a, float_status *status) 3093 { 3094 flag aSign; 3095 int aExp; 3096 int shiftCount; 3097 uint64_t aSig; 3098 a = float64_squash_input_denormal(a, status); 3099 3100 aSig = extractFloat64Frac( a ); 3101 aExp = extractFloat64Exp( a ); 3102 aSign = extractFloat64Sign( a ); 3103 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3104 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3105 shiftCount = 0x42C - aExp; 3106 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); 3107 return roundAndPackInt32(aSign, aSig, status); 3108 3109 } 3110 3111 /*---------------------------------------------------------------------------- 3112 | Returns the result of converting the double-precision floating-point value 3113 | `a' to the 32-bit two's complement integer format. The conversion is 3114 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3115 | Arithmetic, except that the conversion is always rounded toward zero. 3116 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3117 | the conversion overflows, the largest integer with the same sign as `a' is 3118 | returned. 
3119 *----------------------------------------------------------------------------*/ 3120 3121 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status) 3122 { 3123 flag aSign; 3124 int aExp; 3125 int shiftCount; 3126 uint64_t aSig, savedASig; 3127 int32_t z; 3128 a = float64_squash_input_denormal(a, status); 3129 3130 aSig = extractFloat64Frac( a ); 3131 aExp = extractFloat64Exp( a ); 3132 aSign = extractFloat64Sign( a ); 3133 if ( 0x41E < aExp ) { 3134 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; 3135 goto invalid; 3136 } 3137 else if ( aExp < 0x3FF ) { 3138 if (aExp || aSig) { 3139 status->float_exception_flags |= float_flag_inexact; 3140 } 3141 return 0; 3142 } 3143 aSig |= LIT64( 0x0010000000000000 ); 3144 shiftCount = 0x433 - aExp; 3145 savedASig = aSig; 3146 aSig >>= shiftCount; 3147 z = aSig; 3148 if ( aSign ) z = - z; 3149 if ( ( z < 0 ) ^ aSign ) { 3150 invalid: 3151 float_raise(float_flag_invalid, status); 3152 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 3153 } 3154 if ( ( aSig<<shiftCount ) != savedASig ) { 3155 status->float_exception_flags |= float_flag_inexact; 3156 } 3157 return z; 3158 3159 } 3160 3161 /*---------------------------------------------------------------------------- 3162 | Returns the result of converting the double-precision floating-point value 3163 | `a' to the 16-bit two's complement integer format. The conversion is 3164 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3165 | Arithmetic, except that the conversion is always rounded toward zero. 3166 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3167 | the conversion overflows, the largest integer with the same sign as `a' is 3168 | returned. 
3169 *----------------------------------------------------------------------------*/ 3170 3171 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status) 3172 { 3173 flag aSign; 3174 int aExp; 3175 int shiftCount; 3176 uint64_t aSig, savedASig; 3177 int32_t z; 3178 3179 aSig = extractFloat64Frac( a ); 3180 aExp = extractFloat64Exp( a ); 3181 aSign = extractFloat64Sign( a ); 3182 if ( 0x40E < aExp ) { 3183 if ( ( aExp == 0x7FF ) && aSig ) { 3184 aSign = 0; 3185 } 3186 goto invalid; 3187 } 3188 else if ( aExp < 0x3FF ) { 3189 if ( aExp || aSig ) { 3190 status->float_exception_flags |= float_flag_inexact; 3191 } 3192 return 0; 3193 } 3194 aSig |= LIT64( 0x0010000000000000 ); 3195 shiftCount = 0x433 - aExp; 3196 savedASig = aSig; 3197 aSig >>= shiftCount; 3198 z = aSig; 3199 if ( aSign ) { 3200 z = - z; 3201 } 3202 if ( ( (int16_t)z < 0 ) ^ aSign ) { 3203 invalid: 3204 float_raise(float_flag_invalid, status); 3205 return aSign ? (int32_t) 0xffff8000 : 0x7FFF; 3206 } 3207 if ( ( aSig<<shiftCount ) != savedASig ) { 3208 status->float_exception_flags |= float_flag_inexact; 3209 } 3210 return z; 3211 } 3212 3213 /*---------------------------------------------------------------------------- 3214 | Returns the result of converting the double-precision floating-point value 3215 | `a' to the 64-bit two's complement integer format. The conversion is 3216 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3217 | Arithmetic---which means in particular that the conversion is rounded 3218 | according to the current rounding mode. If `a' is a NaN, the largest 3219 | positive integer is returned. Otherwise, if the conversion overflows, the 3220 | largest integer with the same sign as `a' is returned. 
3221 *----------------------------------------------------------------------------*/ 3222 3223 int64_t float64_to_int64(float64 a, float_status *status) 3224 { 3225 flag aSign; 3226 int aExp; 3227 int shiftCount; 3228 uint64_t aSig, aSigExtra; 3229 a = float64_squash_input_denormal(a, status); 3230 3231 aSig = extractFloat64Frac( a ); 3232 aExp = extractFloat64Exp( a ); 3233 aSign = extractFloat64Sign( a ); 3234 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3235 shiftCount = 0x433 - aExp; 3236 if ( shiftCount <= 0 ) { 3237 if ( 0x43E < aExp ) { 3238 float_raise(float_flag_invalid, status); 3239 if ( ! aSign 3240 || ( ( aExp == 0x7FF ) 3241 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3242 ) { 3243 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3244 } 3245 return (int64_t) LIT64( 0x8000000000000000 ); 3246 } 3247 aSigExtra = 0; 3248 aSig <<= - shiftCount; 3249 } 3250 else { 3251 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 3252 } 3253 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 3254 3255 } 3256 3257 /*---------------------------------------------------------------------------- 3258 | Returns the result of converting the double-precision floating-point value 3259 | `a' to the 64-bit two's complement integer format. The conversion is 3260 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3261 | Arithmetic, except that the conversion is always rounded toward zero. 3262 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 3263 | the conversion overflows, the largest integer with the same sign as `a' is 3264 | returned. 
3265 *----------------------------------------------------------------------------*/ 3266 3267 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status) 3268 { 3269 flag aSign; 3270 int aExp; 3271 int shiftCount; 3272 uint64_t aSig; 3273 int64_t z; 3274 a = float64_squash_input_denormal(a, status); 3275 3276 aSig = extractFloat64Frac( a ); 3277 aExp = extractFloat64Exp( a ); 3278 aSign = extractFloat64Sign( a ); 3279 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); 3280 shiftCount = aExp - 0x433; 3281 if ( 0 <= shiftCount ) { 3282 if ( 0x43E <= aExp ) { 3283 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { 3284 float_raise(float_flag_invalid, status); 3285 if ( ! aSign 3286 || ( ( aExp == 0x7FF ) 3287 && ( aSig != LIT64( 0x0010000000000000 ) ) ) 3288 ) { 3289 return LIT64( 0x7FFFFFFFFFFFFFFF ); 3290 } 3291 } 3292 return (int64_t) LIT64( 0x8000000000000000 ); 3293 } 3294 z = aSig<<shiftCount; 3295 } 3296 else { 3297 if ( aExp < 0x3FE ) { 3298 if (aExp | aSig) { 3299 status->float_exception_flags |= float_flag_inexact; 3300 } 3301 return 0; 3302 } 3303 z = aSig>>( - shiftCount ); 3304 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 3305 status->float_exception_flags |= float_flag_inexact; 3306 } 3307 } 3308 if ( aSign ) z = - z; 3309 return z; 3310 3311 } 3312 3313 /*---------------------------------------------------------------------------- 3314 | Returns the result of converting the double-precision floating-point value 3315 | `a' to the single-precision floating-point format. The conversion is 3316 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3317 | Arithmetic. 
3318 *----------------------------------------------------------------------------*/ 3319 3320 float32 float64_to_float32(float64 a, float_status *status) 3321 { 3322 flag aSign; 3323 int aExp; 3324 uint64_t aSig; 3325 uint32_t zSig; 3326 a = float64_squash_input_denormal(a, status); 3327 3328 aSig = extractFloat64Frac( a ); 3329 aExp = extractFloat64Exp( a ); 3330 aSign = extractFloat64Sign( a ); 3331 if ( aExp == 0x7FF ) { 3332 if (aSig) { 3333 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3334 } 3335 return packFloat32( aSign, 0xFF, 0 ); 3336 } 3337 shift64RightJamming( aSig, 22, &aSig ); 3338 zSig = aSig; 3339 if ( aExp || zSig ) { 3340 zSig |= 0x40000000; 3341 aExp -= 0x381; 3342 } 3343 return roundAndPackFloat32(aSign, aExp, zSig, status); 3344 3345 } 3346 3347 3348 /*---------------------------------------------------------------------------- 3349 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3350 | half-precision floating-point value, returning the result. After being 3351 | shifted into the proper positions, the three fields are simply added 3352 | together to form the result. This means that any integer portion of `zSig' 3353 | will be added into the exponent. Since a properly normalized significand 3354 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3355 | than the desired result exponent whenever `zSig' is a complete, normalized 3356 | significand. 
3357 *----------------------------------------------------------------------------*/ 3358 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3359 { 3360 return make_float16( 3361 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3362 } 3363 3364 /*---------------------------------------------------------------------------- 3365 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3366 | and significand `zSig', and returns the proper half-precision floating- 3367 | point value corresponding to the abstract input. Ordinarily, the abstract 3368 | value is simply rounded and packed into the half-precision format, with 3369 | the inexact exception raised if the abstract input cannot be represented 3370 | exactly. However, if the abstract value is too large, the overflow and 3371 | inexact exceptions are raised and an infinity or maximal finite value is 3372 | returned. If the abstract value is too small, the input value is rounded to 3373 | a subnormal number, and the underflow and inexact exceptions are raised if 3374 | the abstract input cannot be represented exactly as a subnormal half- 3375 | precision floating-point number. 3376 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3377 | ARM-style "alternative representation", which omits the NaN and Inf 3378 | encodings in order to raise the maximum representable exponent by one. 3379 | The input significand `zSig' has its binary point between bits 22 3380 | and 23, which is 13 bits to the left of the usual location. This shifted 3381 | significand must be normalized or smaller. If `zSig' is not normalized, 3382 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3383 | and it must not require rounding. In the usual case that `zSig' is 3384 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 
3385 | Note the slightly odd position of the binary point in zSig compared with the 3386 | other roundAndPackFloat functions. This should probably be fixed if we 3387 | need to implement more float16 routines than just conversion. 3388 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3389 | Binary Floating-Point Arithmetic. 3390 *----------------------------------------------------------------------------*/ 3391 3392 static float16 roundAndPackFloat16(flag zSign, int zExp, 3393 uint32_t zSig, flag ieee, 3394 float_status *status) 3395 { 3396 int maxexp = ieee ? 29 : 30; 3397 uint32_t mask; 3398 uint32_t increment; 3399 bool rounding_bumps_exp; 3400 bool is_tiny = false; 3401 3402 /* Calculate the mask of bits of the mantissa which are not 3403 * representable in half-precision and will be lost. 3404 */ 3405 if (zExp < 1) { 3406 /* Will be denormal in halfprec */ 3407 mask = 0x00ffffff; 3408 if (zExp >= -11) { 3409 mask >>= 11 + zExp; 3410 } 3411 } else { 3412 /* Normal number in halfprec */ 3413 mask = 0x00001fff; 3414 } 3415 3416 switch (status->float_rounding_mode) { 3417 case float_round_nearest_even: 3418 increment = (mask + 1) >> 1; 3419 if ((zSig & mask) == increment) { 3420 increment = zSig & (increment << 1); 3421 } 3422 break; 3423 case float_round_ties_away: 3424 increment = (mask + 1) >> 1; 3425 break; 3426 case float_round_up: 3427 increment = zSign ? 0 : mask; 3428 break; 3429 case float_round_down: 3430 increment = zSign ? 
mask : 0; 3431 break; 3432 default: /* round_to_zero */ 3433 increment = 0; 3434 break; 3435 } 3436 3437 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3438 3439 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3440 if (ieee) { 3441 float_raise(float_flag_overflow | float_flag_inexact, status); 3442 return packFloat16(zSign, 0x1f, 0); 3443 } else { 3444 float_raise(float_flag_invalid, status); 3445 return packFloat16(zSign, 0x1f, 0x3ff); 3446 } 3447 } 3448 3449 if (zExp < 0) { 3450 /* Note that flush-to-zero does not affect half-precision results */ 3451 is_tiny = 3452 (status->float_detect_tininess == float_tininess_before_rounding) 3453 || (zExp < -1) 3454 || (!rounding_bumps_exp); 3455 } 3456 if (zSig & mask) { 3457 float_raise(float_flag_inexact, status); 3458 if (is_tiny) { 3459 float_raise(float_flag_underflow, status); 3460 } 3461 } 3462 3463 zSig += increment; 3464 if (rounding_bumps_exp) { 3465 zSig >>= 1; 3466 zExp++; 3467 } 3468 3469 if (zExp < -10) { 3470 return packFloat16(zSign, 0, 0); 3471 } 3472 if (zExp < 0) { 3473 zSig >>= -zExp; 3474 zExp = 0; 3475 } 3476 return packFloat16(zSign, zExp, zSig >> 13); 3477 } 3478 3479 /*---------------------------------------------------------------------------- 3480 | If `a' is denormal and we are in flush-to-zero mode then set the 3481 | input-denormal exception and return zero. Otherwise just return the value. 
3482 *----------------------------------------------------------------------------*/ 3483 float16 float16_squash_input_denormal(float16 a, float_status *status) 3484 { 3485 if (status->flush_inputs_to_zero) { 3486 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3487 float_raise(float_flag_input_denormal, status); 3488 return make_float16(float16_val(a) & 0x8000); 3489 } 3490 } 3491 return a; 3492 } 3493 3494 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3495 uint32_t *zSigPtr) 3496 { 3497 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3498 *zSigPtr = aSig << shiftCount; 3499 *zExpPtr = 1 - shiftCount; 3500 } 3501 3502 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3503 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3504 3505 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3506 { 3507 flag aSign; 3508 int aExp; 3509 uint32_t aSig; 3510 3511 aSign = extractFloat16Sign(a); 3512 aExp = extractFloat16Exp(a); 3513 aSig = extractFloat16Frac(a); 3514 3515 if (aExp == 0x1f && ieee) { 3516 if (aSig) { 3517 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3518 } 3519 return packFloat32(aSign, 0xff, 0); 3520 } 3521 if (aExp == 0) { 3522 if (aSig == 0) { 3523 return packFloat32(aSign, 0, 0); 3524 } 3525 3526 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3527 aExp--; 3528 } 3529 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3530 } 3531 3532 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3533 { 3534 flag aSign; 3535 int aExp; 3536 uint32_t aSig; 3537 3538 a = float32_squash_input_denormal(a, status); 3539 3540 aSig = extractFloat32Frac( a ); 3541 aExp = extractFloat32Exp( a ); 3542 aSign = extractFloat32Sign( a ); 3543 if ( aExp == 0xFF ) { 3544 if (aSig) { 3545 /* Input is a NaN */ 3546 if (!ieee) { 3547 float_raise(float_flag_invalid, status); 3548 return packFloat16(aSign, 0, 0); 3549 } 3550 return 
commonNaNToFloat16( 3551 float32ToCommonNaN(a, status), status); 3552 } 3553 /* Infinity */ 3554 if (!ieee) { 3555 float_raise(float_flag_invalid, status); 3556 return packFloat16(aSign, 0x1f, 0x3ff); 3557 } 3558 return packFloat16(aSign, 0x1f, 0); 3559 } 3560 if (aExp == 0 && aSig == 0) { 3561 return packFloat16(aSign, 0, 0); 3562 } 3563 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3564 * even if the input is denormal; however this is harmless because 3565 * the largest possible single-precision denormal is still smaller 3566 * than the smallest representable half-precision denormal, and so we 3567 * will end up ignoring aSig and returning via the "always return zero" 3568 * codepath. 3569 */ 3570 aSig |= 0x00800000; 3571 aExp -= 0x71; 3572 3573 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3574 } 3575 3576 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3577 { 3578 flag aSign; 3579 int aExp; 3580 uint32_t aSig; 3581 3582 aSign = extractFloat16Sign(a); 3583 aExp = extractFloat16Exp(a); 3584 aSig = extractFloat16Frac(a); 3585 3586 if (aExp == 0x1f && ieee) { 3587 if (aSig) { 3588 return commonNaNToFloat64( 3589 float16ToCommonNaN(a, status), status); 3590 } 3591 return packFloat64(aSign, 0x7ff, 0); 3592 } 3593 if (aExp == 0) { 3594 if (aSig == 0) { 3595 return packFloat64(aSign, 0, 0); 3596 } 3597 3598 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3599 aExp--; 3600 } 3601 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3602 } 3603 3604 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3605 { 3606 flag aSign; 3607 int aExp; 3608 uint64_t aSig; 3609 uint32_t zSig; 3610 3611 a = float64_squash_input_denormal(a, status); 3612 3613 aSig = extractFloat64Frac(a); 3614 aExp = extractFloat64Exp(a); 3615 aSign = extractFloat64Sign(a); 3616 if (aExp == 0x7FF) { 3617 if (aSig) { 3618 /* Input is a NaN */ 3619 if (!ieee) { 3620 float_raise(float_flag_invalid, status); 3621 return 
packFloat16(aSign, 0, 0); 3622 } 3623 return commonNaNToFloat16( 3624 float64ToCommonNaN(a, status), status); 3625 } 3626 /* Infinity */ 3627 if (!ieee) { 3628 float_raise(float_flag_invalid, status); 3629 return packFloat16(aSign, 0x1f, 0x3ff); 3630 } 3631 return packFloat16(aSign, 0x1f, 0); 3632 } 3633 shift64RightJamming(aSig, 29, &aSig); 3634 zSig = aSig; 3635 if (aExp == 0 && zSig == 0) { 3636 return packFloat16(aSign, 0, 0); 3637 } 3638 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3639 * even if the input is denormal; however this is harmless because 3640 * the largest possible single-precision denormal is still smaller 3641 * than the smallest representable half-precision denormal, and so we 3642 * will end up ignoring aSig and returning via the "always return zero" 3643 * codepath. 3644 */ 3645 zSig |= 0x00800000; 3646 aExp -= 0x3F1; 3647 3648 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 3649 } 3650 3651 /*---------------------------------------------------------------------------- 3652 | Returns the result of converting the double-precision floating-point value 3653 | `a' to the extended double-precision floating-point format. The conversion 3654 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3655 | Arithmetic. 
3656 *----------------------------------------------------------------------------*/ 3657 3658 floatx80 float64_to_floatx80(float64 a, float_status *status) 3659 { 3660 flag aSign; 3661 int aExp; 3662 uint64_t aSig; 3663 3664 a = float64_squash_input_denormal(a, status); 3665 aSig = extractFloat64Frac( a ); 3666 aExp = extractFloat64Exp( a ); 3667 aSign = extractFloat64Sign( a ); 3668 if ( aExp == 0x7FF ) { 3669 if (aSig) { 3670 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 3671 } 3672 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3673 } 3674 if ( aExp == 0 ) { 3675 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3676 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3677 } 3678 return 3679 packFloatx80( 3680 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 3681 3682 } 3683 3684 /*---------------------------------------------------------------------------- 3685 | Returns the result of converting the double-precision floating-point value 3686 | `a' to the quadruple-precision floating-point format. The conversion is 3687 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3688 | Arithmetic. 
3689 *----------------------------------------------------------------------------*/ 3690 3691 float128 float64_to_float128(float64 a, float_status *status) 3692 { 3693 flag aSign; 3694 int aExp; 3695 uint64_t aSig, zSig0, zSig1; 3696 3697 a = float64_squash_input_denormal(a, status); 3698 aSig = extractFloat64Frac( a ); 3699 aExp = extractFloat64Exp( a ); 3700 aSign = extractFloat64Sign( a ); 3701 if ( aExp == 0x7FF ) { 3702 if (aSig) { 3703 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 3704 } 3705 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3706 } 3707 if ( aExp == 0 ) { 3708 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3709 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 3710 --aExp; 3711 } 3712 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 3713 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 3714 3715 } 3716 3717 /*---------------------------------------------------------------------------- 3718 | Rounds the double-precision floating-point value `a' to an integer, and 3719 | returns the result as a double-precision floating-point value. The 3720 | operation is performed according to the IEC/IEEE Standard for Binary 3721 | Floating-Point Arithmetic. 
3722 *----------------------------------------------------------------------------*/ 3723 3724 float64 float64_round_to_int(float64 a, float_status *status) 3725 { 3726 flag aSign; 3727 int aExp; 3728 uint64_t lastBitMask, roundBitsMask; 3729 uint64_t z; 3730 a = float64_squash_input_denormal(a, status); 3731 3732 aExp = extractFloat64Exp( a ); 3733 if ( 0x433 <= aExp ) { 3734 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { 3735 return propagateFloat64NaN(a, a, status); 3736 } 3737 return a; 3738 } 3739 if ( aExp < 0x3FF ) { 3740 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a; 3741 status->float_exception_flags |= float_flag_inexact; 3742 aSign = extractFloat64Sign( a ); 3743 switch (status->float_rounding_mode) { 3744 case float_round_nearest_even: 3745 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { 3746 return packFloat64( aSign, 0x3FF, 0 ); 3747 } 3748 break; 3749 case float_round_ties_away: 3750 if (aExp == 0x3FE) { 3751 return packFloat64(aSign, 0x3ff, 0); 3752 } 3753 break; 3754 case float_round_down: 3755 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); 3756 case float_round_up: 3757 return make_float64( 3758 aSign ? 
LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); 3759 } 3760 return packFloat64( aSign, 0, 0 ); 3761 } 3762 lastBitMask = 1; 3763 lastBitMask <<= 0x433 - aExp; 3764 roundBitsMask = lastBitMask - 1; 3765 z = float64_val(a); 3766 switch (status->float_rounding_mode) { 3767 case float_round_nearest_even: 3768 z += lastBitMask >> 1; 3769 if ((z & roundBitsMask) == 0) { 3770 z &= ~lastBitMask; 3771 } 3772 break; 3773 case float_round_ties_away: 3774 z += lastBitMask >> 1; 3775 break; 3776 case float_round_to_zero: 3777 break; 3778 case float_round_up: 3779 if (!extractFloat64Sign(make_float64(z))) { 3780 z += roundBitsMask; 3781 } 3782 break; 3783 case float_round_down: 3784 if (extractFloat64Sign(make_float64(z))) { 3785 z += roundBitsMask; 3786 } 3787 break; 3788 default: 3789 abort(); 3790 } 3791 z &= ~ roundBitsMask; 3792 if (z != float64_val(a)) { 3793 status->float_exception_flags |= float_flag_inexact; 3794 } 3795 return make_float64(z); 3796 3797 } 3798 3799 float64 float64_trunc_to_int(float64 a, float_status *status) 3800 { 3801 int oldmode; 3802 float64 res; 3803 oldmode = status->float_rounding_mode; 3804 status->float_rounding_mode = float_round_to_zero; 3805 res = float64_round_to_int(a, status); 3806 status->float_rounding_mode = oldmode; 3807 return res; 3808 } 3809 3810 /*---------------------------------------------------------------------------- 3811 | Returns the result of adding the absolute values of the double-precision 3812 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 3813 | before being returned. `zSign' is ignored if the result is a NaN. 3814 | The addition is performed according to the IEC/IEEE Standard for Binary 3815 | Floating-Point Arithmetic. 
3816 *----------------------------------------------------------------------------*/ 3817 3818 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign, 3819 float_status *status) 3820 { 3821 int aExp, bExp, zExp; 3822 uint64_t aSig, bSig, zSig; 3823 int expDiff; 3824 3825 aSig = extractFloat64Frac( a ); 3826 aExp = extractFloat64Exp( a ); 3827 bSig = extractFloat64Frac( b ); 3828 bExp = extractFloat64Exp( b ); 3829 expDiff = aExp - bExp; 3830 aSig <<= 9; 3831 bSig <<= 9; 3832 if ( 0 < expDiff ) { 3833 if ( aExp == 0x7FF ) { 3834 if (aSig) { 3835 return propagateFloat64NaN(a, b, status); 3836 } 3837 return a; 3838 } 3839 if ( bExp == 0 ) { 3840 --expDiff; 3841 } 3842 else { 3843 bSig |= LIT64( 0x2000000000000000 ); 3844 } 3845 shift64RightJamming( bSig, expDiff, &bSig ); 3846 zExp = aExp; 3847 } 3848 else if ( expDiff < 0 ) { 3849 if ( bExp == 0x7FF ) { 3850 if (bSig) { 3851 return propagateFloat64NaN(a, b, status); 3852 } 3853 return packFloat64( zSign, 0x7FF, 0 ); 3854 } 3855 if ( aExp == 0 ) { 3856 ++expDiff; 3857 } 3858 else { 3859 aSig |= LIT64( 0x2000000000000000 ); 3860 } 3861 shift64RightJamming( aSig, - expDiff, &aSig ); 3862 zExp = bExp; 3863 } 3864 else { 3865 if ( aExp == 0x7FF ) { 3866 if (aSig | bSig) { 3867 return propagateFloat64NaN(a, b, status); 3868 } 3869 return a; 3870 } 3871 if ( aExp == 0 ) { 3872 if (status->flush_to_zero) { 3873 if (aSig | bSig) { 3874 float_raise(float_flag_output_denormal, status); 3875 } 3876 return packFloat64(zSign, 0, 0); 3877 } 3878 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); 3879 } 3880 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; 3881 zExp = aExp; 3882 goto roundAndPack; 3883 } 3884 aSig |= LIT64( 0x2000000000000000 ); 3885 zSig = ( aSig + bSig )<<1; 3886 --zExp; 3887 if ( (int64_t) zSig < 0 ) { 3888 zSig = aSig + bSig; 3889 ++zExp; 3890 } 3891 roundAndPack: 3892 return roundAndPackFloat64(zSign, zExp, zSig, status); 3893 3894 } 3895 3896 
/*---------------------------------------------------------------------------- 3897 | Returns the result of subtracting the absolute values of the double- 3898 | precision floating-point values `a' and `b'. If `zSign' is 1, the 3899 | difference is negated before being returned. `zSign' is ignored if the 3900 | result is a NaN. The subtraction is performed according to the IEC/IEEE 3901 | Standard for Binary Floating-Point Arithmetic. 3902 *----------------------------------------------------------------------------*/ 3903 3904 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign, 3905 float_status *status) 3906 { 3907 int aExp, bExp, zExp; 3908 uint64_t aSig, bSig, zSig; 3909 int expDiff; 3910 3911 aSig = extractFloat64Frac( a ); 3912 aExp = extractFloat64Exp( a ); 3913 bSig = extractFloat64Frac( b ); 3914 bExp = extractFloat64Exp( b ); 3915 expDiff = aExp - bExp; 3916 aSig <<= 10; 3917 bSig <<= 10; 3918 if ( 0 < expDiff ) goto aExpBigger; 3919 if ( expDiff < 0 ) goto bExpBigger; 3920 if ( aExp == 0x7FF ) { 3921 if (aSig | bSig) { 3922 return propagateFloat64NaN(a, b, status); 3923 } 3924 float_raise(float_flag_invalid, status); 3925 return float64_default_nan(status); 3926 } 3927 if ( aExp == 0 ) { 3928 aExp = 1; 3929 bExp = 1; 3930 } 3931 if ( bSig < aSig ) goto aBigger; 3932 if ( aSig < bSig ) goto bBigger; 3933 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0); 3934 bExpBigger: 3935 if ( bExp == 0x7FF ) { 3936 if (bSig) { 3937 return propagateFloat64NaN(a, b, status); 3938 } 3939 return packFloat64( zSign ^ 1, 0x7FF, 0 ); 3940 } 3941 if ( aExp == 0 ) { 3942 ++expDiff; 3943 } 3944 else { 3945 aSig |= LIT64( 0x4000000000000000 ); 3946 } 3947 shift64RightJamming( aSig, - expDiff, &aSig ); 3948 bSig |= LIT64( 0x4000000000000000 ); 3949 bBigger: 3950 zSig = bSig - aSig; 3951 zExp = bExp; 3952 zSign ^= 1; 3953 goto normalizeRoundAndPack; 3954 aExpBigger: 3955 if ( aExp == 0x7FF ) { 3956 if (aSig) { 3957 return propagateFloat64NaN(a, 
b, status); 3958 } 3959 return a; 3960 } 3961 if ( bExp == 0 ) { 3962 --expDiff; 3963 } 3964 else { 3965 bSig |= LIT64( 0x4000000000000000 ); 3966 } 3967 shift64RightJamming( bSig, expDiff, &bSig ); 3968 aSig |= LIT64( 0x4000000000000000 ); 3969 aBigger: 3970 zSig = aSig - bSig; 3971 zExp = aExp; 3972 normalizeRoundAndPack: 3973 --zExp; 3974 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status); 3975 3976 } 3977 3978 /*---------------------------------------------------------------------------- 3979 | Returns the result of adding the double-precision floating-point values `a' 3980 | and `b'. The operation is performed according to the IEC/IEEE Standard for 3981 | Binary Floating-Point Arithmetic. 3982 *----------------------------------------------------------------------------*/ 3983 3984 float64 float64_add(float64 a, float64 b, float_status *status) 3985 { 3986 flag aSign, bSign; 3987 a = float64_squash_input_denormal(a, status); 3988 b = float64_squash_input_denormal(b, status); 3989 3990 aSign = extractFloat64Sign( a ); 3991 bSign = extractFloat64Sign( b ); 3992 if ( aSign == bSign ) { 3993 return addFloat64Sigs(a, b, aSign, status); 3994 } 3995 else { 3996 return subFloat64Sigs(a, b, aSign, status); 3997 } 3998 3999 } 4000 4001 /*---------------------------------------------------------------------------- 4002 | Returns the result of subtracting the double-precision floating-point values 4003 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4004 | for Binary Floating-Point Arithmetic. 
4005 *----------------------------------------------------------------------------*/ 4006 4007 float64 float64_sub(float64 a, float64 b, float_status *status) 4008 { 4009 flag aSign, bSign; 4010 a = float64_squash_input_denormal(a, status); 4011 b = float64_squash_input_denormal(b, status); 4012 4013 aSign = extractFloat64Sign( a ); 4014 bSign = extractFloat64Sign( b ); 4015 if ( aSign == bSign ) { 4016 return subFloat64Sigs(a, b, aSign, status); 4017 } 4018 else { 4019 return addFloat64Sigs(a, b, aSign, status); 4020 } 4021 4022 } 4023 4024 /*---------------------------------------------------------------------------- 4025 | Returns the result of multiplying the double-precision floating-point values 4026 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 4027 | for Binary Floating-Point Arithmetic. 4028 *----------------------------------------------------------------------------*/ 4029 4030 float64 float64_mul(float64 a, float64 b, float_status *status) 4031 { 4032 flag aSign, bSign, zSign; 4033 int aExp, bExp, zExp; 4034 uint64_t aSig, bSig, zSig0, zSig1; 4035 4036 a = float64_squash_input_denormal(a, status); 4037 b = float64_squash_input_denormal(b, status); 4038 4039 aSig = extractFloat64Frac( a ); 4040 aExp = extractFloat64Exp( a ); 4041 aSign = extractFloat64Sign( a ); 4042 bSig = extractFloat64Frac( b ); 4043 bExp = extractFloat64Exp( b ); 4044 bSign = extractFloat64Sign( b ); 4045 zSign = aSign ^ bSign; 4046 if ( aExp == 0x7FF ) { 4047 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4048 return propagateFloat64NaN(a, b, status); 4049 } 4050 if ( ( bExp | bSig ) == 0 ) { 4051 float_raise(float_flag_invalid, status); 4052 return float64_default_nan(status); 4053 } 4054 return packFloat64( zSign, 0x7FF, 0 ); 4055 } 4056 if ( bExp == 0x7FF ) { 4057 if (bSig) { 4058 return propagateFloat64NaN(a, b, status); 4059 } 4060 if ( ( aExp | aSig ) == 0 ) { 4061 float_raise(float_flag_invalid, status); 4062 return float64_default_nan(status); 
4063 } 4064 return packFloat64( zSign, 0x7FF, 0 ); 4065 } 4066 if ( aExp == 0 ) { 4067 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4068 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4069 } 4070 if ( bExp == 0 ) { 4071 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); 4072 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4073 } 4074 zExp = aExp + bExp - 0x3FF; 4075 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4076 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4077 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 4078 zSig0 |= ( zSig1 != 0 ); 4079 if ( 0 <= (int64_t) ( zSig0<<1 ) ) { 4080 zSig0 <<= 1; 4081 --zExp; 4082 } 4083 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4084 4085 } 4086 4087 /*---------------------------------------------------------------------------- 4088 | Returns the result of dividing the double-precision floating-point value `a' 4089 | by the corresponding value `b'. The operation is performed according to 4090 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4091 *----------------------------------------------------------------------------*/ 4092 4093 float64 float64_div(float64 a, float64 b, float_status *status) 4094 { 4095 flag aSign, bSign, zSign; 4096 int aExp, bExp, zExp; 4097 uint64_t aSig, bSig, zSig; 4098 uint64_t rem0, rem1; 4099 uint64_t term0, term1; 4100 a = float64_squash_input_denormal(a, status); 4101 b = float64_squash_input_denormal(b, status); 4102 4103 aSig = extractFloat64Frac( a ); 4104 aExp = extractFloat64Exp( a ); 4105 aSign = extractFloat64Sign( a ); 4106 bSig = extractFloat64Frac( b ); 4107 bExp = extractFloat64Exp( b ); 4108 bSign = extractFloat64Sign( b ); 4109 zSign = aSign ^ bSign; 4110 if ( aExp == 0x7FF ) { 4111 if (aSig) { 4112 return propagateFloat64NaN(a, b, status); 4113 } 4114 if ( bExp == 0x7FF ) { 4115 if (bSig) { 4116 return propagateFloat64NaN(a, b, status); 4117 } 4118 float_raise(float_flag_invalid, status); 4119 return float64_default_nan(status); 4120 } 4121 return packFloat64( zSign, 0x7FF, 0 ); 4122 } 4123 if ( bExp == 0x7FF ) { 4124 if (bSig) { 4125 return propagateFloat64NaN(a, b, status); 4126 } 4127 return packFloat64( zSign, 0, 0 ); 4128 } 4129 if ( bExp == 0 ) { 4130 if ( bSig == 0 ) { 4131 if ( ( aExp | aSig ) == 0 ) { 4132 float_raise(float_flag_invalid, status); 4133 return float64_default_nan(status); 4134 } 4135 float_raise(float_flag_divbyzero, status); 4136 return packFloat64( zSign, 0x7FF, 0 ); 4137 } 4138 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4139 } 4140 if ( aExp == 0 ) { 4141 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); 4142 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4143 } 4144 zExp = aExp - bExp + 0x3FD; 4145 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; 4146 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4147 if ( bSig <= ( aSig + aSig ) ) { 4148 aSig >>= 1; 4149 ++zExp; 4150 } 4151 zSig = estimateDiv128To64( aSig, 0, bSig ); 4152 if ( ( zSig & 0x1FF ) <= 2 ) { 4153 mul64To128( bSig, zSig, &term0, &term1 ); 4154 sub128( 
aSig, 0, term0, term1, &rem0, &rem1 ); 4155 while ( (int64_t) rem0 < 0 ) { 4156 --zSig; 4157 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 4158 } 4159 zSig |= ( rem1 != 0 ); 4160 } 4161 return roundAndPackFloat64(zSign, zExp, zSig, status); 4162 4163 } 4164 4165 /*---------------------------------------------------------------------------- 4166 | Returns the remainder of the double-precision floating-point value `a' 4167 | with respect to the corresponding value `b'. The operation is performed 4168 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4169 *----------------------------------------------------------------------------*/ 4170 4171 float64 float64_rem(float64 a, float64 b, float_status *status) 4172 { 4173 flag aSign, zSign; 4174 int aExp, bExp, expDiff; 4175 uint64_t aSig, bSig; 4176 uint64_t q, alternateASig; 4177 int64_t sigMean; 4178 4179 a = float64_squash_input_denormal(a, status); 4180 b = float64_squash_input_denormal(b, status); 4181 aSig = extractFloat64Frac( a ); 4182 aExp = extractFloat64Exp( a ); 4183 aSign = extractFloat64Sign( a ); 4184 bSig = extractFloat64Frac( b ); 4185 bExp = extractFloat64Exp( b ); 4186 if ( aExp == 0x7FF ) { 4187 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4188 return propagateFloat64NaN(a, b, status); 4189 } 4190 float_raise(float_flag_invalid, status); 4191 return float64_default_nan(status); 4192 } 4193 if ( bExp == 0x7FF ) { 4194 if (bSig) { 4195 return propagateFloat64NaN(a, b, status); 4196 } 4197 return a; 4198 } 4199 if ( bExp == 0 ) { 4200 if ( bSig == 0 ) { 4201 float_raise(float_flag_invalid, status); 4202 return float64_default_nan(status); 4203 } 4204 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4205 } 4206 if ( aExp == 0 ) { 4207 if ( aSig == 0 ) return a; 4208 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4209 } 4210 expDiff = aExp - bExp; 4211 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4212 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4213 if ( expDiff < 0 
) { 4214 if ( expDiff < -1 ) return a; 4215 aSig >>= 1; 4216 } 4217 q = ( bSig <= aSig ); 4218 if ( q ) aSig -= bSig; 4219 expDiff -= 64; 4220 while ( 0 < expDiff ) { 4221 q = estimateDiv128To64( aSig, 0, bSig ); 4222 q = ( 2 < q ) ? q - 2 : 0; 4223 aSig = - ( ( bSig>>2 ) * q ); 4224 expDiff -= 62; 4225 } 4226 expDiff += 64; 4227 if ( 0 < expDiff ) { 4228 q = estimateDiv128To64( aSig, 0, bSig ); 4229 q = ( 2 < q ) ? q - 2 : 0; 4230 q >>= 64 - expDiff; 4231 bSig >>= 2; 4232 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4233 } 4234 else { 4235 aSig >>= 2; 4236 bSig >>= 2; 4237 } 4238 do { 4239 alternateASig = aSig; 4240 ++q; 4241 aSig -= bSig; 4242 } while ( 0 <= (int64_t) aSig ); 4243 sigMean = aSig + alternateASig; 4244 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4245 aSig = alternateASig; 4246 } 4247 zSign = ( (int64_t) aSig < 0 ); 4248 if ( zSign ) aSig = - aSig; 4249 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4250 4251 } 4252 4253 /*---------------------------------------------------------------------------- 4254 | Returns the result of multiplying the double-precision floating-point values 4255 | `a' and `b' then adding 'c', with no intermediate rounding step after the 4256 | multiplication. The operation is performed according to the IEC/IEEE 4257 | Standard for Binary Floating-Point Arithmetic 754-2008. 4258 | The flags argument allows the caller to select negation of the 4259 | addend, the intermediate product, or the final result. (The difference 4260 | between this and having the caller do a separate negation is that negating 4261 | externally will flip the sign bit on NaNs.) 
4262 *----------------------------------------------------------------------------*/ 4263 4264 float64 float64_muladd(float64 a, float64 b, float64 c, int flags, 4265 float_status *status) 4266 { 4267 flag aSign, bSign, cSign, zSign; 4268 int aExp, bExp, cExp, pExp, zExp, expDiff; 4269 uint64_t aSig, bSig, cSig; 4270 flag pInf, pZero, pSign; 4271 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1; 4272 int shiftcount; 4273 flag signflip, infzero; 4274 4275 a = float64_squash_input_denormal(a, status); 4276 b = float64_squash_input_denormal(b, status); 4277 c = float64_squash_input_denormal(c, status); 4278 aSig = extractFloat64Frac(a); 4279 aExp = extractFloat64Exp(a); 4280 aSign = extractFloat64Sign(a); 4281 bSig = extractFloat64Frac(b); 4282 bExp = extractFloat64Exp(b); 4283 bSign = extractFloat64Sign(b); 4284 cSig = extractFloat64Frac(c); 4285 cExp = extractFloat64Exp(c); 4286 cSign = extractFloat64Sign(c); 4287 4288 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) || 4289 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0)); 4290 4291 /* It is implementation-defined whether the cases of (0,inf,qnan) 4292 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 4293 * they return if they do), so we have to hand this information 4294 * off to the target-specific pick-a-NaN routine. 4295 */ 4296 if (((aExp == 0x7ff) && aSig) || 4297 ((bExp == 0x7ff) && bSig) || 4298 ((cExp == 0x7ff) && cSig)) { 4299 return propagateFloat64MulAddNaN(a, b, c, infzero, status); 4300 } 4301 4302 if (infzero) { 4303 float_raise(float_flag_invalid, status); 4304 return float64_default_nan(status); 4305 } 4306 4307 if (flags & float_muladd_negate_c) { 4308 cSign ^= 1; 4309 } 4310 4311 signflip = (flags & float_muladd_negate_result) ? 
1 : 0; 4312 4313 /* Work out the sign and type of the product */ 4314 pSign = aSign ^ bSign; 4315 if (flags & float_muladd_negate_product) { 4316 pSign ^= 1; 4317 } 4318 pInf = (aExp == 0x7ff) || (bExp == 0x7ff); 4319 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0); 4320 4321 if (cExp == 0x7ff) { 4322 if (pInf && (pSign ^ cSign)) { 4323 /* addition of opposite-signed infinities => InvalidOperation */ 4324 float_raise(float_flag_invalid, status); 4325 return float64_default_nan(status); 4326 } 4327 /* Otherwise generate an infinity of the same sign */ 4328 return packFloat64(cSign ^ signflip, 0x7ff, 0); 4329 } 4330 4331 if (pInf) { 4332 return packFloat64(pSign ^ signflip, 0x7ff, 0); 4333 } 4334 4335 if (pZero) { 4336 if (cExp == 0) { 4337 if (cSig == 0) { 4338 /* Adding two exact zeroes */ 4339 if (pSign == cSign) { 4340 zSign = pSign; 4341 } else if (status->float_rounding_mode == float_round_down) { 4342 zSign = 1; 4343 } else { 4344 zSign = 0; 4345 } 4346 return packFloat64(zSign ^ signflip, 0, 0); 4347 } 4348 /* Exact zero plus a denorm */ 4349 if (status->flush_to_zero) { 4350 float_raise(float_flag_output_denormal, status); 4351 return packFloat64(cSign ^ signflip, 0, 0); 4352 } 4353 } 4354 /* Zero plus something non-zero : just return the something */ 4355 if (flags & float_muladd_halve_result) { 4356 if (cExp == 0) { 4357 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4358 } 4359 /* Subtract one to halve, and one again because roundAndPackFloat64 4360 * wants one less than the true exponent. 
4361 */ 4362 cExp -= 2; 4363 cSig = (cSig | 0x0010000000000000ULL) << 10; 4364 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status); 4365 } 4366 return packFloat64(cSign ^ signflip, cExp, cSig); 4367 } 4368 4369 if (aExp == 0) { 4370 normalizeFloat64Subnormal(aSig, &aExp, &aSig); 4371 } 4372 if (bExp == 0) { 4373 normalizeFloat64Subnormal(bSig, &bExp, &bSig); 4374 } 4375 4376 /* Calculate the actual result a * b + c */ 4377 4378 /* Multiply first; this is easy. */ 4379 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff 4380 * because we want the true exponent, not the "one-less-than" 4381 * flavour that roundAndPackFloat64() takes. 4382 */ 4383 pExp = aExp + bExp - 0x3fe; 4384 aSig = (aSig | LIT64(0x0010000000000000))<<10; 4385 bSig = (bSig | LIT64(0x0010000000000000))<<11; 4386 mul64To128(aSig, bSig, &pSig0, &pSig1); 4387 if ((int64_t)(pSig0 << 1) >= 0) { 4388 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1); 4389 pExp--; 4390 } 4391 4392 zSign = pSign ^ signflip; 4393 4394 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit 4395 * bit in position 126. 4396 */ 4397 if (cExp == 0) { 4398 if (!cSig) { 4399 /* Throw out the special case of c being an exact zero now */ 4400 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1); 4401 if (flags & float_muladd_halve_result) { 4402 pExp--; 4403 } 4404 return roundAndPackFloat64(zSign, pExp - 1, 4405 pSig1, status); 4406 } 4407 normalizeFloat64Subnormal(cSig, &cExp, &cSig); 4408 } 4409 4410 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the 4411 * significand of the addend, with the explicit bit in position 126. 
4412 */ 4413 cSig0 = cSig << (126 - 64 - 52); 4414 cSig1 = 0; 4415 cSig0 |= LIT64(0x4000000000000000); 4416 expDiff = pExp - cExp; 4417 4418 if (pSign == cSign) { 4419 /* Addition */ 4420 if (expDiff > 0) { 4421 /* scale c to match p */ 4422 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4423 zExp = pExp; 4424 } else if (expDiff < 0) { 4425 /* scale p to match c */ 4426 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4427 zExp = cExp; 4428 } else { 4429 /* no scaling needed */ 4430 zExp = cExp; 4431 } 4432 /* Add significands and make sure explicit bit ends up in posn 126 */ 4433 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4434 if ((int64_t)zSig0 < 0) { 4435 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1); 4436 } else { 4437 zExp--; 4438 } 4439 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1); 4440 if (flags & float_muladd_halve_result) { 4441 zExp--; 4442 } 4443 return roundAndPackFloat64(zSign, zExp, zSig1, status); 4444 } else { 4445 /* Subtraction */ 4446 if (expDiff > 0) { 4447 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1); 4448 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4449 zExp = pExp; 4450 } else if (expDiff < 0) { 4451 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1); 4452 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4453 zExp = cExp; 4454 zSign ^= 1; 4455 } else { 4456 zExp = pExp; 4457 if (lt128(cSig0, cSig1, pSig0, pSig1)) { 4458 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1); 4459 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) { 4460 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1); 4461 zSign ^= 1; 4462 } else { 4463 /* Exact zero */ 4464 zSign = signflip; 4465 if (status->float_rounding_mode == float_round_down) { 4466 zSign ^= 1; 4467 } 4468 return packFloat64(zSign, 0, 0); 4469 } 4470 } 4471 --zExp; 4472 /* Do the equivalent of normalizeRoundAndPackFloat64() but 4473 * starting with the significand in a pair of uint64_t. 
4474 */ 4475 if (zSig0) { 4476 shiftcount = countLeadingZeros64(zSig0) - 1; 4477 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1); 4478 if (zSig1) { 4479 zSig0 |= 1; 4480 } 4481 zExp -= shiftcount; 4482 } else { 4483 shiftcount = countLeadingZeros64(zSig1); 4484 if (shiftcount == 0) { 4485 zSig0 = (zSig1 >> 1) | (zSig1 & 1); 4486 zExp -= 63; 4487 } else { 4488 shiftcount--; 4489 zSig0 = zSig1 << shiftcount; 4490 zExp -= (shiftcount + 64); 4491 } 4492 } 4493 if (flags & float_muladd_halve_result) { 4494 zExp--; 4495 } 4496 return roundAndPackFloat64(zSign, zExp, zSig0, status); 4497 } 4498 } 4499 4500 /*---------------------------------------------------------------------------- 4501 | Returns the square root of the double-precision floating-point value `a'. 4502 | The operation is performed according to the IEC/IEEE Standard for Binary 4503 | Floating-Point Arithmetic. 4504 *----------------------------------------------------------------------------*/ 4505 4506 float64 float64_sqrt(float64 a, float_status *status) 4507 { 4508 flag aSign; 4509 int aExp, zExp; 4510 uint64_t aSig, zSig, doubleZSig; 4511 uint64_t rem0, rem1, term0, term1; 4512 a = float64_squash_input_denormal(a, status); 4513 4514 aSig = extractFloat64Frac( a ); 4515 aExp = extractFloat64Exp( a ); 4516 aSign = extractFloat64Sign( a ); 4517 if ( aExp == 0x7FF ) { 4518 if (aSig) { 4519 return propagateFloat64NaN(a, a, status); 4520 } 4521 if ( ! 
aSign ) return a; 4522 float_raise(float_flag_invalid, status); 4523 return float64_default_nan(status); 4524 } 4525 if ( aSign ) { 4526 if ( ( aExp | aSig ) == 0 ) return a; 4527 float_raise(float_flag_invalid, status); 4528 return float64_default_nan(status); 4529 } 4530 if ( aExp == 0 ) { 4531 if ( aSig == 0 ) return float64_zero; 4532 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4533 } 4534 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4535 aSig |= LIT64( 0x0010000000000000 ); 4536 zSig = estimateSqrt32( aExp, aSig>>21 ); 4537 aSig <<= 9 - ( aExp & 1 ); 4538 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4539 if ( ( zSig & 0x1FF ) <= 5 ) { 4540 doubleZSig = zSig<<1; 4541 mul64To128( zSig, zSig, &term0, &term1 ); 4542 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4543 while ( (int64_t) rem0 < 0 ) { 4544 --zSig; 4545 doubleZSig -= 2; 4546 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4547 } 4548 zSig |= ( ( rem0 | rem1 ) != 0 ); 4549 } 4550 return roundAndPackFloat64(0, zExp, zSig, status); 4551 4552 } 4553 4554 /*---------------------------------------------------------------------------- 4555 | Returns the binary log of the double-precision floating-point value `a'. 4556 | The operation is performed according to the IEC/IEEE Standard for Binary 4557 | Floating-Point Arithmetic. 
4558 *----------------------------------------------------------------------------*/ 4559 float64 float64_log2(float64 a, float_status *status) 4560 { 4561 flag aSign, zSign; 4562 int aExp; 4563 uint64_t aSig, aSig0, aSig1, zSig, i; 4564 a = float64_squash_input_denormal(a, status); 4565 4566 aSig = extractFloat64Frac( a ); 4567 aExp = extractFloat64Exp( a ); 4568 aSign = extractFloat64Sign( a ); 4569 4570 if ( aExp == 0 ) { 4571 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4572 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4573 } 4574 if ( aSign ) { 4575 float_raise(float_flag_invalid, status); 4576 return float64_default_nan(status); 4577 } 4578 if ( aExp == 0x7FF ) { 4579 if (aSig) { 4580 return propagateFloat64NaN(a, float64_zero, status); 4581 } 4582 return a; 4583 } 4584 4585 aExp -= 0x3FF; 4586 aSig |= LIT64( 0x0010000000000000 ); 4587 zSign = aExp < 0; 4588 zSig = (uint64_t)aExp << 52; 4589 for (i = 1LL << 51; i > 0; i >>= 1) { 4590 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4591 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4592 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4593 aSig >>= 1; 4594 zSig |= i; 4595 } 4596 } 4597 4598 if ( zSign ) 4599 zSig = -zSig; 4600 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4601 } 4602 4603 /*---------------------------------------------------------------------------- 4604 | Returns 1 if the double-precision floating-point value `a' is equal to the 4605 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4606 | if either operand is a NaN. Otherwise, the comparison is performed 4607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4608 *----------------------------------------------------------------------------*/ 4609 4610 int float64_eq(float64 a, float64 b, float_status *status) 4611 { 4612 uint64_t av, bv; 4613 a = float64_squash_input_denormal(a, status); 4614 b = float64_squash_input_denormal(b, status); 4615 4616 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4617 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4618 ) { 4619 float_raise(float_flag_invalid, status); 4620 return 0; 4621 } 4622 av = float64_val(a); 4623 bv = float64_val(b); 4624 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4625 4626 } 4627 4628 /*---------------------------------------------------------------------------- 4629 | Returns 1 if the double-precision floating-point value `a' is less than or 4630 | equal to the corresponding value `b', and 0 otherwise. The invalid 4631 | exception is raised if either operand is a NaN. The comparison is performed 4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4633 *----------------------------------------------------------------------------*/ 4634 4635 int float64_le(float64 a, float64 b, float_status *status) 4636 { 4637 flag aSign, bSign; 4638 uint64_t av, bv; 4639 a = float64_squash_input_denormal(a, status); 4640 b = float64_squash_input_denormal(b, status); 4641 4642 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4643 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4644 ) { 4645 float_raise(float_flag_invalid, status); 4646 return 0; 4647 } 4648 aSign = extractFloat64Sign( a ); 4649 bSign = extractFloat64Sign( b ); 4650 av = float64_val(a); 4651 bv = float64_val(b); 4652 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4653 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4654 4655 } 4656 4657 /*---------------------------------------------------------------------------- 4658 | Returns 1 if the double-precision floating-point value `a' is less than 4659 | the corresponding value `b', and 0 otherwise. The invalid exception is 4660 | raised if either operand is a NaN. The comparison is performed according 4661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4662 *----------------------------------------------------------------------------*/ 4663 4664 int float64_lt(float64 a, float64 b, float_status *status) 4665 { 4666 flag aSign, bSign; 4667 uint64_t av, bv; 4668 4669 a = float64_squash_input_denormal(a, status); 4670 b = float64_squash_input_denormal(b, status); 4671 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4672 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4673 ) { 4674 float_raise(float_flag_invalid, status); 4675 return 0; 4676 } 4677 aSign = extractFloat64Sign( a ); 4678 bSign = extractFloat64Sign( b ); 4679 av = float64_val(a); 4680 bv = float64_val(b); 4681 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4682 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4683 4684 } 4685 4686 /*---------------------------------------------------------------------------- 4687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4688 | be compared, and 0 otherwise. The invalid exception is raised if either 4689 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4690 | Standard for Binary Floating-Point Arithmetic. 4691 *----------------------------------------------------------------------------*/ 4692 4693 int float64_unordered(float64 a, float64 b, float_status *status) 4694 { 4695 a = float64_squash_input_denormal(a, status); 4696 b = float64_squash_input_denormal(b, status); 4697 4698 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4699 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4700 ) { 4701 float_raise(float_flag_invalid, status); 4702 return 1; 4703 } 4704 return 0; 4705 } 4706 4707 /*---------------------------------------------------------------------------- 4708 | Returns 1 if the double-precision floating-point value `a' is equal to the 4709 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4710 | exception.The comparison is performed according to the IEC/IEEE Standard 4711 | for Binary Floating-Point Arithmetic. 4712 *----------------------------------------------------------------------------*/ 4713 4714 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4715 { 4716 uint64_t av, bv; 4717 a = float64_squash_input_denormal(a, status); 4718 b = float64_squash_input_denormal(b, status); 4719 4720 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4721 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4722 ) { 4723 if (float64_is_signaling_nan(a, status) 4724 || float64_is_signaling_nan(b, status)) { 4725 float_raise(float_flag_invalid, status); 4726 } 4727 return 0; 4728 } 4729 av = float64_val(a); 4730 bv = float64_val(b); 4731 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4732 4733 } 4734 4735 /*---------------------------------------------------------------------------- 4736 | Returns 1 if the double-precision floating-point value `a' is less than or 4737 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4738 | cause an exception. Otherwise, the comparison is performed according to the 4739 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4740 *----------------------------------------------------------------------------*/ 4741 4742 int float64_le_quiet(float64 a, float64 b, float_status *status) 4743 { 4744 flag aSign, bSign; 4745 uint64_t av, bv; 4746 a = float64_squash_input_denormal(a, status); 4747 b = float64_squash_input_denormal(b, status); 4748 4749 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4750 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4751 ) { 4752 if (float64_is_signaling_nan(a, status) 4753 || float64_is_signaling_nan(b, status)) { 4754 float_raise(float_flag_invalid, status); 4755 } 4756 return 0; 4757 } 4758 aSign = extractFloat64Sign( a ); 4759 bSign = extractFloat64Sign( b ); 4760 av = float64_val(a); 4761 bv = float64_val(b); 4762 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4763 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4764 4765 } 4766 4767 /*---------------------------------------------------------------------------- 4768 | Returns 1 if the double-precision floating-point value `a' is less than 4769 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4770 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4771 | Standard for Binary Floating-Point Arithmetic. 
4772 *----------------------------------------------------------------------------*/ 4773 4774 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4775 { 4776 flag aSign, bSign; 4777 uint64_t av, bv; 4778 a = float64_squash_input_denormal(a, status); 4779 b = float64_squash_input_denormal(b, status); 4780 4781 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4782 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4783 ) { 4784 if (float64_is_signaling_nan(a, status) 4785 || float64_is_signaling_nan(b, status)) { 4786 float_raise(float_flag_invalid, status); 4787 } 4788 return 0; 4789 } 4790 aSign = extractFloat64Sign( a ); 4791 bSign = extractFloat64Sign( b ); 4792 av = float64_val(a); 4793 bv = float64_val(b); 4794 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4795 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4796 4797 } 4798 4799 /*---------------------------------------------------------------------------- 4800 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4801 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4802 | comparison is performed according to the IEC/IEEE Standard for Binary 4803 | Floating-Point Arithmetic. 
4804 *----------------------------------------------------------------------------*/ 4805 4806 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4807 { 4808 a = float64_squash_input_denormal(a, status); 4809 b = float64_squash_input_denormal(b, status); 4810 4811 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4812 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4813 ) { 4814 if (float64_is_signaling_nan(a, status) 4815 || float64_is_signaling_nan(b, status)) { 4816 float_raise(float_flag_invalid, status); 4817 } 4818 return 1; 4819 } 4820 return 0; 4821 } 4822 4823 /*---------------------------------------------------------------------------- 4824 | Returns the result of converting the extended double-precision floating- 4825 | point value `a' to the 32-bit two's complement integer format. The 4826 | conversion is performed according to the IEC/IEEE Standard for Binary 4827 | Floating-Point Arithmetic---which means in particular that the conversion 4828 | is rounded according to the current rounding mode. If `a' is a NaN, the 4829 | largest positive integer is returned. Otherwise, if the conversion 4830 | overflows, the largest integer with the same sign as `a' is returned. 
4831 *----------------------------------------------------------------------------*/ 4832 4833 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4834 { 4835 flag aSign; 4836 int32_t aExp, shiftCount; 4837 uint64_t aSig; 4838 4839 if (floatx80_invalid_encoding(a)) { 4840 float_raise(float_flag_invalid, status); 4841 return 1 << 31; 4842 } 4843 aSig = extractFloatx80Frac( a ); 4844 aExp = extractFloatx80Exp( a ); 4845 aSign = extractFloatx80Sign( a ); 4846 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4847 shiftCount = 0x4037 - aExp; 4848 if ( shiftCount <= 0 ) shiftCount = 1; 4849 shift64RightJamming( aSig, shiftCount, &aSig ); 4850 return roundAndPackInt32(aSign, aSig, status); 4851 4852 } 4853 4854 /*---------------------------------------------------------------------------- 4855 | Returns the result of converting the extended double-precision floating- 4856 | point value `a' to the 32-bit two's complement integer format. The 4857 | conversion is performed according to the IEC/IEEE Standard for Binary 4858 | Floating-Point Arithmetic, except that the conversion is always rounded 4859 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4860 | Otherwise, if the conversion overflows, the largest integer with the same 4861 | sign as `a' is returned. 
4862 *----------------------------------------------------------------------------*/ 4863 4864 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4865 { 4866 flag aSign; 4867 int32_t aExp, shiftCount; 4868 uint64_t aSig, savedASig; 4869 int32_t z; 4870 4871 if (floatx80_invalid_encoding(a)) { 4872 float_raise(float_flag_invalid, status); 4873 return 1 << 31; 4874 } 4875 aSig = extractFloatx80Frac( a ); 4876 aExp = extractFloatx80Exp( a ); 4877 aSign = extractFloatx80Sign( a ); 4878 if ( 0x401E < aExp ) { 4879 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4880 goto invalid; 4881 } 4882 else if ( aExp < 0x3FFF ) { 4883 if (aExp || aSig) { 4884 status->float_exception_flags |= float_flag_inexact; 4885 } 4886 return 0; 4887 } 4888 shiftCount = 0x403E - aExp; 4889 savedASig = aSig; 4890 aSig >>= shiftCount; 4891 z = aSig; 4892 if ( aSign ) z = - z; 4893 if ( ( z < 0 ) ^ aSign ) { 4894 invalid: 4895 float_raise(float_flag_invalid, status); 4896 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4897 } 4898 if ( ( aSig<<shiftCount ) != savedASig ) { 4899 status->float_exception_flags |= float_flag_inexact; 4900 } 4901 return z; 4902 4903 } 4904 4905 /*---------------------------------------------------------------------------- 4906 | Returns the result of converting the extended double-precision floating- 4907 | point value `a' to the 64-bit two's complement integer format. The 4908 | conversion is performed according to the IEC/IEEE Standard for Binary 4909 | Floating-Point Arithmetic---which means in particular that the conversion 4910 | is rounded according to the current rounding mode. If `a' is a NaN, 4911 | the largest positive integer is returned. Otherwise, if the conversion 4912 | overflows, the largest integer with the same sign as `a' is returned. 
4913 *----------------------------------------------------------------------------*/ 4914 4915 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4916 { 4917 flag aSign; 4918 int32_t aExp, shiftCount; 4919 uint64_t aSig, aSigExtra; 4920 4921 if (floatx80_invalid_encoding(a)) { 4922 float_raise(float_flag_invalid, status); 4923 return 1ULL << 63; 4924 } 4925 aSig = extractFloatx80Frac( a ); 4926 aExp = extractFloatx80Exp( a ); 4927 aSign = extractFloatx80Sign( a ); 4928 shiftCount = 0x403E - aExp; 4929 if ( shiftCount <= 0 ) { 4930 if ( shiftCount ) { 4931 float_raise(float_flag_invalid, status); 4932 if ( ! aSign 4933 || ( ( aExp == 0x7FFF ) 4934 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4935 ) { 4936 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4937 } 4938 return (int64_t) LIT64( 0x8000000000000000 ); 4939 } 4940 aSigExtra = 0; 4941 } 4942 else { 4943 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4944 } 4945 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4946 4947 } 4948 4949 /*---------------------------------------------------------------------------- 4950 | Returns the result of converting the extended double-precision floating- 4951 | point value `a' to the 64-bit two's complement integer format. The 4952 | conversion is performed according to the IEC/IEEE Standard for Binary 4953 | Floating-Point Arithmetic, except that the conversion is always rounded 4954 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4955 | Otherwise, if the conversion overflows, the largest integer with the same 4956 | sign as `a' is returned. 
4957 *----------------------------------------------------------------------------*/ 4958 4959 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4960 { 4961 flag aSign; 4962 int32_t aExp, shiftCount; 4963 uint64_t aSig; 4964 int64_t z; 4965 4966 if (floatx80_invalid_encoding(a)) { 4967 float_raise(float_flag_invalid, status); 4968 return 1ULL << 63; 4969 } 4970 aSig = extractFloatx80Frac( a ); 4971 aExp = extractFloatx80Exp( a ); 4972 aSign = extractFloatx80Sign( a ); 4973 shiftCount = aExp - 0x403E; 4974 if ( 0 <= shiftCount ) { 4975 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4976 if ( ( a.high != 0xC03E ) || aSig ) { 4977 float_raise(float_flag_invalid, status); 4978 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4979 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4980 } 4981 } 4982 return (int64_t) LIT64( 0x8000000000000000 ); 4983 } 4984 else if ( aExp < 0x3FFF ) { 4985 if (aExp | aSig) { 4986 status->float_exception_flags |= float_flag_inexact; 4987 } 4988 return 0; 4989 } 4990 z = aSig>>( - shiftCount ); 4991 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4992 status->float_exception_flags |= float_flag_inexact; 4993 } 4994 if ( aSign ) z = - z; 4995 return z; 4996 4997 } 4998 4999 /*---------------------------------------------------------------------------- 5000 | Returns the result of converting the extended double-precision floating- 5001 | point value `a' to the single-precision floating-point format. The 5002 | conversion is performed according to the IEC/IEEE Standard for Binary 5003 | Floating-Point Arithmetic. 
5004 *----------------------------------------------------------------------------*/ 5005 5006 float32 floatx80_to_float32(floatx80 a, float_status *status) 5007 { 5008 flag aSign; 5009 int32_t aExp; 5010 uint64_t aSig; 5011 5012 if (floatx80_invalid_encoding(a)) { 5013 float_raise(float_flag_invalid, status); 5014 return float32_default_nan(status); 5015 } 5016 aSig = extractFloatx80Frac( a ); 5017 aExp = extractFloatx80Exp( a ); 5018 aSign = extractFloatx80Sign( a ); 5019 if ( aExp == 0x7FFF ) { 5020 if ( (uint64_t) ( aSig<<1 ) ) { 5021 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 5022 } 5023 return packFloat32( aSign, 0xFF, 0 ); 5024 } 5025 shift64RightJamming( aSig, 33, &aSig ); 5026 if ( aExp || aSig ) aExp -= 0x3F81; 5027 return roundAndPackFloat32(aSign, aExp, aSig, status); 5028 5029 } 5030 5031 /*---------------------------------------------------------------------------- 5032 | Returns the result of converting the extended double-precision floating- 5033 | point value `a' to the double-precision floating-point format. The 5034 | conversion is performed according to the IEC/IEEE Standard for Binary 5035 | Floating-Point Arithmetic. 
5036 *----------------------------------------------------------------------------*/ 5037 5038 float64 floatx80_to_float64(floatx80 a, float_status *status) 5039 { 5040 flag aSign; 5041 int32_t aExp; 5042 uint64_t aSig, zSig; 5043 5044 if (floatx80_invalid_encoding(a)) { 5045 float_raise(float_flag_invalid, status); 5046 return float64_default_nan(status); 5047 } 5048 aSig = extractFloatx80Frac( a ); 5049 aExp = extractFloatx80Exp( a ); 5050 aSign = extractFloatx80Sign( a ); 5051 if ( aExp == 0x7FFF ) { 5052 if ( (uint64_t) ( aSig<<1 ) ) { 5053 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 5054 } 5055 return packFloat64( aSign, 0x7FF, 0 ); 5056 } 5057 shift64RightJamming( aSig, 1, &zSig ); 5058 if ( aExp || aSig ) aExp -= 0x3C01; 5059 return roundAndPackFloat64(aSign, aExp, zSig, status); 5060 5061 } 5062 5063 /*---------------------------------------------------------------------------- 5064 | Returns the result of converting the extended double-precision floating- 5065 | point value `a' to the quadruple-precision floating-point format. The 5066 | conversion is performed according to the IEC/IEEE Standard for Binary 5067 | Floating-Point Arithmetic. 
5068 *----------------------------------------------------------------------------*/ 5069 5070 float128 floatx80_to_float128(floatx80 a, float_status *status) 5071 { 5072 flag aSign; 5073 int aExp; 5074 uint64_t aSig, zSig0, zSig1; 5075 5076 if (floatx80_invalid_encoding(a)) { 5077 float_raise(float_flag_invalid, status); 5078 return float128_default_nan(status); 5079 } 5080 aSig = extractFloatx80Frac( a ); 5081 aExp = extractFloatx80Exp( a ); 5082 aSign = extractFloatx80Sign( a ); 5083 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 5084 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 5085 } 5086 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 5087 return packFloat128( aSign, aExp, zSig0, zSig1 ); 5088 5089 } 5090 5091 /*---------------------------------------------------------------------------- 5092 | Rounds the extended double-precision floating-point value `a' 5093 | to the precision provided by floatx80_rounding_precision and returns the 5094 | result as an extended double-precision floating-point value. 5095 | The operation is performed according to the IEC/IEEE Standard for Binary 5096 | Floating-Point Arithmetic. 5097 *----------------------------------------------------------------------------*/ 5098 5099 floatx80 floatx80_round(floatx80 a, float_status *status) 5100 { 5101 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5102 extractFloatx80Sign(a), 5103 extractFloatx80Exp(a), 5104 extractFloatx80Frac(a), 0, status); 5105 } 5106 5107 /*---------------------------------------------------------------------------- 5108 | Rounds the extended double-precision floating-point value `a' to an integer, 5109 | and returns the result as an extended quadruple-precision floating-point 5110 | value. The operation is performed according to the IEC/IEEE Standard for 5111 | Binary Floating-Point Arithmetic. 
5112 *----------------------------------------------------------------------------*/ 5113 5114 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 5115 { 5116 flag aSign; 5117 int32_t aExp; 5118 uint64_t lastBitMask, roundBitsMask; 5119 floatx80 z; 5120 5121 if (floatx80_invalid_encoding(a)) { 5122 float_raise(float_flag_invalid, status); 5123 return floatx80_default_nan(status); 5124 } 5125 aExp = extractFloatx80Exp( a ); 5126 if ( 0x403E <= aExp ) { 5127 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 5128 return propagateFloatx80NaN(a, a, status); 5129 } 5130 return a; 5131 } 5132 if ( aExp < 0x3FFF ) { 5133 if ( ( aExp == 0 ) 5134 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 5135 return a; 5136 } 5137 status->float_exception_flags |= float_flag_inexact; 5138 aSign = extractFloatx80Sign( a ); 5139 switch (status->float_rounding_mode) { 5140 case float_round_nearest_even: 5141 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 5142 ) { 5143 return 5144 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5145 } 5146 break; 5147 case float_round_ties_away: 5148 if (aExp == 0x3FFE) { 5149 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 5150 } 5151 break; 5152 case float_round_down: 5153 return 5154 aSign ? 5155 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 5156 : packFloatx80( 0, 0, 0 ); 5157 case float_round_up: 5158 return 5159 aSign ? 
packFloatx80( 1, 0, 0 ) 5160 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 5161 } 5162 return packFloatx80( aSign, 0, 0 ); 5163 } 5164 lastBitMask = 1; 5165 lastBitMask <<= 0x403E - aExp; 5166 roundBitsMask = lastBitMask - 1; 5167 z = a; 5168 switch (status->float_rounding_mode) { 5169 case float_round_nearest_even: 5170 z.low += lastBitMask>>1; 5171 if ((z.low & roundBitsMask) == 0) { 5172 z.low &= ~lastBitMask; 5173 } 5174 break; 5175 case float_round_ties_away: 5176 z.low += lastBitMask >> 1; 5177 break; 5178 case float_round_to_zero: 5179 break; 5180 case float_round_up: 5181 if (!extractFloatx80Sign(z)) { 5182 z.low += roundBitsMask; 5183 } 5184 break; 5185 case float_round_down: 5186 if (extractFloatx80Sign(z)) { 5187 z.low += roundBitsMask; 5188 } 5189 break; 5190 default: 5191 abort(); 5192 } 5193 z.low &= ~ roundBitsMask; 5194 if ( z.low == 0 ) { 5195 ++z.high; 5196 z.low = LIT64( 0x8000000000000000 ); 5197 } 5198 if (z.low != a.low) { 5199 status->float_exception_flags |= float_flag_inexact; 5200 } 5201 return z; 5202 5203 } 5204 5205 /*---------------------------------------------------------------------------- 5206 | Returns the result of adding the absolute values of the extended double- 5207 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 5208 | negated before being returned. `zSign' is ignored if the result is a NaN. 5209 | The addition is performed according to the IEC/IEEE Standard for Binary 5210 | Floating-Point Arithmetic. 
5211 *----------------------------------------------------------------------------*/ 5212 5213 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5214 float_status *status) 5215 { 5216 int32_t aExp, bExp, zExp; 5217 uint64_t aSig, bSig, zSig0, zSig1; 5218 int32_t expDiff; 5219 5220 aSig = extractFloatx80Frac( a ); 5221 aExp = extractFloatx80Exp( a ); 5222 bSig = extractFloatx80Frac( b ); 5223 bExp = extractFloatx80Exp( b ); 5224 expDiff = aExp - bExp; 5225 if ( 0 < expDiff ) { 5226 if ( aExp == 0x7FFF ) { 5227 if ((uint64_t)(aSig << 1)) { 5228 return propagateFloatx80NaN(a, b, status); 5229 } 5230 return a; 5231 } 5232 if ( bExp == 0 ) --expDiff; 5233 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5234 zExp = aExp; 5235 } 5236 else if ( expDiff < 0 ) { 5237 if ( bExp == 0x7FFF ) { 5238 if ((uint64_t)(bSig << 1)) { 5239 return propagateFloatx80NaN(a, b, status); 5240 } 5241 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5242 } 5243 if ( aExp == 0 ) ++expDiff; 5244 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5245 zExp = bExp; 5246 } 5247 else { 5248 if ( aExp == 0x7FFF ) { 5249 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5250 return propagateFloatx80NaN(a, b, status); 5251 } 5252 return a; 5253 } 5254 zSig1 = 0; 5255 zSig0 = aSig + bSig; 5256 if ( aExp == 0 ) { 5257 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5258 goto roundAndPack; 5259 } 5260 zExp = aExp; 5261 goto shiftRight1; 5262 } 5263 zSig0 = aSig + bSig; 5264 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5265 shiftRight1: 5266 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5267 zSig0 |= LIT64( 0x8000000000000000 ); 5268 ++zExp; 5269 roundAndPack: 5270 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5271 zSign, zExp, zSig0, zSig1, status); 5272 } 5273 5274 /*---------------------------------------------------------------------------- 5275 | Returns the result of subtracting the absolute values of the 
extended 5276 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5277 | difference is negated before being returned. `zSign' is ignored if the 5278 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5279 | Standard for Binary Floating-Point Arithmetic. 5280 *----------------------------------------------------------------------------*/ 5281 5282 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5283 float_status *status) 5284 { 5285 int32_t aExp, bExp, zExp; 5286 uint64_t aSig, bSig, zSig0, zSig1; 5287 int32_t expDiff; 5288 5289 aSig = extractFloatx80Frac( a ); 5290 aExp = extractFloatx80Exp( a ); 5291 bSig = extractFloatx80Frac( b ); 5292 bExp = extractFloatx80Exp( b ); 5293 expDiff = aExp - bExp; 5294 if ( 0 < expDiff ) goto aExpBigger; 5295 if ( expDiff < 0 ) goto bExpBigger; 5296 if ( aExp == 0x7FFF ) { 5297 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5298 return propagateFloatx80NaN(a, b, status); 5299 } 5300 float_raise(float_flag_invalid, status); 5301 return floatx80_default_nan(status); 5302 } 5303 if ( aExp == 0 ) { 5304 aExp = 1; 5305 bExp = 1; 5306 } 5307 zSig1 = 0; 5308 if ( bSig < aSig ) goto aBigger; 5309 if ( aSig < bSig ) goto bBigger; 5310 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5311 bExpBigger: 5312 if ( bExp == 0x7FFF ) { 5313 if ((uint64_t)(bSig << 1)) { 5314 return propagateFloatx80NaN(a, b, status); 5315 } 5316 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5317 } 5318 if ( aExp == 0 ) ++expDiff; 5319 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5320 bBigger: 5321 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5322 zExp = bExp; 5323 zSign ^= 1; 5324 goto normalizeRoundAndPack; 5325 aExpBigger: 5326 if ( aExp == 0x7FFF ) { 5327 if ((uint64_t)(aSig << 1)) { 5328 return propagateFloatx80NaN(a, b, status); 5329 } 5330 return a; 5331 } 5332 if ( bExp == 0 ) --expDiff; 5333 shift128RightJamming( bSig, 0, 
expDiff, &bSig, &zSig1 ); 5334 aBigger: 5335 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5336 zExp = aExp; 5337 normalizeRoundAndPack: 5338 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5339 zSign, zExp, zSig0, zSig1, status); 5340 } 5341 5342 /*---------------------------------------------------------------------------- 5343 | Returns the result of adding the extended double-precision floating-point 5344 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5345 | Standard for Binary Floating-Point Arithmetic. 5346 *----------------------------------------------------------------------------*/ 5347 5348 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5349 { 5350 flag aSign, bSign; 5351 5352 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5353 float_raise(float_flag_invalid, status); 5354 return floatx80_default_nan(status); 5355 } 5356 aSign = extractFloatx80Sign( a ); 5357 bSign = extractFloatx80Sign( b ); 5358 if ( aSign == bSign ) { 5359 return addFloatx80Sigs(a, b, aSign, status); 5360 } 5361 else { 5362 return subFloatx80Sigs(a, b, aSign, status); 5363 } 5364 5365 } 5366 5367 /*---------------------------------------------------------------------------- 5368 | Returns the result of subtracting the extended double-precision floating- 5369 | point values `a' and `b'. The operation is performed according to the 5370 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5371 *----------------------------------------------------------------------------*/ 5372 5373 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5374 { 5375 flag aSign, bSign; 5376 5377 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5378 float_raise(float_flag_invalid, status); 5379 return floatx80_default_nan(status); 5380 } 5381 aSign = extractFloatx80Sign( a ); 5382 bSign = extractFloatx80Sign( b ); 5383 if ( aSign == bSign ) { 5384 return subFloatx80Sigs(a, b, aSign, status); 5385 } 5386 else { 5387 return addFloatx80Sigs(a, b, aSign, status); 5388 } 5389 5390 } 5391 5392 /*---------------------------------------------------------------------------- 5393 | Returns the result of multiplying the extended double-precision floating- 5394 | point values `a' and `b'. The operation is performed according to the 5395 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5396 *----------------------------------------------------------------------------*/ 5397 5398 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5399 { 5400 flag aSign, bSign, zSign; 5401 int32_t aExp, bExp, zExp; 5402 uint64_t aSig, bSig, zSig0, zSig1; 5403 5404 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5405 float_raise(float_flag_invalid, status); 5406 return floatx80_default_nan(status); 5407 } 5408 aSig = extractFloatx80Frac( a ); 5409 aExp = extractFloatx80Exp( a ); 5410 aSign = extractFloatx80Sign( a ); 5411 bSig = extractFloatx80Frac( b ); 5412 bExp = extractFloatx80Exp( b ); 5413 bSign = extractFloatx80Sign( b ); 5414 zSign = aSign ^ bSign; 5415 if ( aExp == 0x7FFF ) { 5416 if ( (uint64_t) ( aSig<<1 ) 5417 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5418 return propagateFloatx80NaN(a, b, status); 5419 } 5420 if ( ( bExp | bSig ) == 0 ) goto invalid; 5421 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5422 } 5423 if ( bExp == 0x7FFF ) { 5424 if ((uint64_t)(bSig << 1)) { 5425 
return propagateFloatx80NaN(a, b, status); 5426 } 5427 if ( ( aExp | aSig ) == 0 ) { 5428 invalid: 5429 float_raise(float_flag_invalid, status); 5430 return floatx80_default_nan(status); 5431 } 5432 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5433 } 5434 if ( aExp == 0 ) { 5435 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5436 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5437 } 5438 if ( bExp == 0 ) { 5439 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5440 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5441 } 5442 zExp = aExp + bExp - 0x3FFE; 5443 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5444 if ( 0 < (int64_t) zSig0 ) { 5445 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5446 --zExp; 5447 } 5448 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5449 zSign, zExp, zSig0, zSig1, status); 5450 } 5451 5452 /*---------------------------------------------------------------------------- 5453 | Returns the result of dividing the extended double-precision floating-point 5454 | value `a' by the corresponding value `b'. The operation is performed 5455 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5456 *----------------------------------------------------------------------------*/ 5457 5458 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5459 { 5460 flag aSign, bSign, zSign; 5461 int32_t aExp, bExp, zExp; 5462 uint64_t aSig, bSig, zSig0, zSig1; 5463 uint64_t rem0, rem1, rem2, term0, term1, term2; 5464 5465 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5466 float_raise(float_flag_invalid, status); 5467 return floatx80_default_nan(status); 5468 } 5469 aSig = extractFloatx80Frac( a ); 5470 aExp = extractFloatx80Exp( a ); 5471 aSign = extractFloatx80Sign( a ); 5472 bSig = extractFloatx80Frac( b ); 5473 bExp = extractFloatx80Exp( b ); 5474 bSign = extractFloatx80Sign( b ); 5475 zSign = aSign ^ bSign; 5476 if ( aExp == 0x7FFF ) { 5477 if ((uint64_t)(aSig << 1)) { 5478 return propagateFloatx80NaN(a, b, status); 5479 } 5480 if ( bExp == 0x7FFF ) { 5481 if ((uint64_t)(bSig << 1)) { 5482 return propagateFloatx80NaN(a, b, status); 5483 } 5484 goto invalid; 5485 } 5486 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5487 } 5488 if ( bExp == 0x7FFF ) { 5489 if ((uint64_t)(bSig << 1)) { 5490 return propagateFloatx80NaN(a, b, status); 5491 } 5492 return packFloatx80( zSign, 0, 0 ); 5493 } 5494 if ( bExp == 0 ) { 5495 if ( bSig == 0 ) { 5496 if ( ( aExp | aSig ) == 0 ) { 5497 invalid: 5498 float_raise(float_flag_invalid, status); 5499 return floatx80_default_nan(status); 5500 } 5501 float_raise(float_flag_divbyzero, status); 5502 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5503 } 5504 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5505 } 5506 if ( aExp == 0 ) { 5507 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5508 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5509 } 5510 zExp = aExp - bExp + 0x3FFE; 5511 rem1 = 0; 5512 if ( bSig <= aSig ) { 5513 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5514 ++zExp; 5515 } 5516 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5517 mul64To128( 
bSig, zSig0, &term0, &term1 ); 5518 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5519 while ( (int64_t) rem0 < 0 ) { 5520 --zSig0; 5521 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5522 } 5523 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5524 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5525 mul64To128( bSig, zSig1, &term1, &term2 ); 5526 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5527 while ( (int64_t) rem1 < 0 ) { 5528 --zSig1; 5529 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5530 } 5531 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5532 } 5533 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5534 zSign, zExp, zSig0, zSig1, status); 5535 } 5536 5537 /*---------------------------------------------------------------------------- 5538 | Returns the remainder of the extended double-precision floating-point value 5539 | `a' with respect to the corresponding value `b'. The operation is performed 5540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5541 *----------------------------------------------------------------------------*/ 5542 5543 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5544 { 5545 flag aSign, zSign; 5546 int32_t aExp, bExp, expDiff; 5547 uint64_t aSig0, aSig1, bSig; 5548 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5549 5550 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5551 float_raise(float_flag_invalid, status); 5552 return floatx80_default_nan(status); 5553 } 5554 aSig0 = extractFloatx80Frac( a ); 5555 aExp = extractFloatx80Exp( a ); 5556 aSign = extractFloatx80Sign( a ); 5557 bSig = extractFloatx80Frac( b ); 5558 bExp = extractFloatx80Exp( b ); 5559 if ( aExp == 0x7FFF ) { 5560 if ( (uint64_t) ( aSig0<<1 ) 5561 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5562 return propagateFloatx80NaN(a, b, status); 5563 } 5564 goto invalid; 5565 } 5566 if ( bExp == 0x7FFF ) { 5567 if ((uint64_t)(bSig << 1)) { 5568 return propagateFloatx80NaN(a, b, status); 
5569 } 5570 return a; 5571 } 5572 if ( bExp == 0 ) { 5573 if ( bSig == 0 ) { 5574 invalid: 5575 float_raise(float_flag_invalid, status); 5576 return floatx80_default_nan(status); 5577 } 5578 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5579 } 5580 if ( aExp == 0 ) { 5581 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5582 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5583 } 5584 bSig |= LIT64( 0x8000000000000000 ); 5585 zSign = aSign; 5586 expDiff = aExp - bExp; 5587 aSig1 = 0; 5588 if ( expDiff < 0 ) { 5589 if ( expDiff < -1 ) return a; 5590 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5591 expDiff = 0; 5592 } 5593 q = ( bSig <= aSig0 ); 5594 if ( q ) aSig0 -= bSig; 5595 expDiff -= 64; 5596 while ( 0 < expDiff ) { 5597 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5598 q = ( 2 < q ) ? q - 2 : 0; 5599 mul64To128( bSig, q, &term0, &term1 ); 5600 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5601 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5602 expDiff -= 62; 5603 } 5604 expDiff += 64; 5605 if ( 0 < expDiff ) { 5606 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5607 q = ( 2 < q ) ? q - 2 : 0; 5608 q >>= 64 - expDiff; 5609 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5610 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5611 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5612 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5613 ++q; 5614 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5615 } 5616 } 5617 else { 5618 term1 = 0; 5619 term0 = bSig; 5620 } 5621 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5622 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5623 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5624 && ( q & 1 ) ) 5625 ) { 5626 aSig0 = alternateASig0; 5627 aSig1 = alternateASig1; 5628 zSign = ! 
zSign; 5629 } 5630 return 5631 normalizeRoundAndPackFloatx80( 5632 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5633 5634 } 5635 5636 /*---------------------------------------------------------------------------- 5637 | Returns the square root of the extended double-precision floating-point 5638 | value `a'. The operation is performed according to the IEC/IEEE Standard 5639 | for Binary Floating-Point Arithmetic. 5640 *----------------------------------------------------------------------------*/ 5641 5642 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5643 { 5644 flag aSign; 5645 int32_t aExp, zExp; 5646 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5647 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5648 5649 if (floatx80_invalid_encoding(a)) { 5650 float_raise(float_flag_invalid, status); 5651 return floatx80_default_nan(status); 5652 } 5653 aSig0 = extractFloatx80Frac( a ); 5654 aExp = extractFloatx80Exp( a ); 5655 aSign = extractFloatx80Sign( a ); 5656 if ( aExp == 0x7FFF ) { 5657 if ((uint64_t)(aSig0 << 1)) { 5658 return propagateFloatx80NaN(a, a, status); 5659 } 5660 if ( ! 
aSign ) return a; 5661 goto invalid; 5662 } 5663 if ( aSign ) { 5664 if ( ( aExp | aSig0 ) == 0 ) return a; 5665 invalid: 5666 float_raise(float_flag_invalid, status); 5667 return floatx80_default_nan(status); 5668 } 5669 if ( aExp == 0 ) { 5670 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5671 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5672 } 5673 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5674 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5675 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5676 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5677 doubleZSig0 = zSig0<<1; 5678 mul64To128( zSig0, zSig0, &term0, &term1 ); 5679 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5680 while ( (int64_t) rem0 < 0 ) { 5681 --zSig0; 5682 doubleZSig0 -= 2; 5683 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5684 } 5685 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5686 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5687 if ( zSig1 == 0 ) zSig1 = 1; 5688 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5689 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5690 mul64To128( zSig1, zSig1, &term2, &term3 ); 5691 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5692 while ( (int64_t) rem1 < 0 ) { 5693 --zSig1; 5694 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5695 term3 |= 1; 5696 term2 |= doubleZSig0; 5697 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5698 } 5699 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5700 } 5701 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5702 zSig0 |= doubleZSig0; 5703 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5704 0, zExp, zSig0, zSig1, status); 5705 } 5706 5707 /*---------------------------------------------------------------------------- 5708 | Returns 1 if the extended double-precision floating-point value `a' is equal 5709 | to the corresponding value `b', and 0 otherwise. 
The invalid exception is
| raised if either operand is a NaN.  Otherwise, the comparison is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
{
    int unordered;

    /*
     * Operands are unordered when either has an invalid encoding or is a
     * NaN (all-ones exponent with a non-zero fraction below the explicit
     * integer bit).  The tests run in the order: encodings, `a', then `b'.
     */
    unordered = floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b);
    if (!unordered) {
        unordered = (extractFloatx80Exp(a) == 0x7FFF)
                    && ((uint64_t)(extractFloatx80Frac(a) << 1) != 0);
    }
    if (!unordered) {
        unordered = (extractFloatx80Exp(b) == 0x7FFF)
                    && ((uint64_t)(extractFloatx80Frac(b) << 1) != 0);
    }
    if (unordered) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    if (a.low != b.low) {
        return 0;
    }
    if (a.high == b.high) {
        return 1;
    }
    /* The high words differ: still equal when both operands are zeros,
     * i.e. only the sign bits can differ. */
    return (a.low == 0) && ((uint16_t)((a.high | b.high) << 1) == 0);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is
| less than or equal to the corresponding value `b', and 0 otherwise.  The
| invalid exception is raised if either operand is a NaN.  The comparison is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
5741 *----------------------------------------------------------------------------*/ 5742 5743 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5744 { 5745 flag aSign, bSign; 5746 5747 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5748 || (extractFloatx80Exp(a) == 0x7FFF 5749 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5750 || (extractFloatx80Exp(b) == 0x7FFF 5751 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5752 ) { 5753 float_raise(float_flag_invalid, status); 5754 return 0; 5755 } 5756 aSign = extractFloatx80Sign( a ); 5757 bSign = extractFloatx80Sign( b ); 5758 if ( aSign != bSign ) { 5759 return 5760 aSign 5761 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5762 == 0 ); 5763 } 5764 return 5765 aSign ? le128( b.high, b.low, a.high, a.low ) 5766 : le128( a.high, a.low, b.high, b.low ); 5767 5768 } 5769 5770 /*---------------------------------------------------------------------------- 5771 | Returns 1 if the extended double-precision floating-point value `a' is 5772 | less than the corresponding value `b', and 0 otherwise. The invalid 5773 | exception is raised if either operand is a NaN. The comparison is performed 5774 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5775 *----------------------------------------------------------------------------*/ 5776 5777 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5778 { 5779 flag aSign, bSign; 5780 5781 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5782 || (extractFloatx80Exp(a) == 0x7FFF 5783 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5784 || (extractFloatx80Exp(b) == 0x7FFF 5785 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5786 ) { 5787 float_raise(float_flag_invalid, status); 5788 return 0; 5789 } 5790 aSign = extractFloatx80Sign( a ); 5791 bSign = extractFloatx80Sign( b ); 5792 if ( aSign != bSign ) { 5793 return 5794 aSign 5795 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5796 != 0 ); 5797 } 5798 return 5799 aSign ? lt128( b.high, b.low, a.high, a.low ) 5800 : lt128( a.high, a.low, b.high, b.low ); 5801 5802 } 5803 5804 /*---------------------------------------------------------------------------- 5805 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5806 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5807 | either operand is a NaN. The comparison is performed according to the 5808 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5809 *----------------------------------------------------------------------------*/ 5810 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5811 { 5812 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5813 || (extractFloatx80Exp(a) == 0x7FFF 5814 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5815 || (extractFloatx80Exp(b) == 0x7FFF 5816 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5817 ) { 5818 float_raise(float_flag_invalid, status); 5819 return 1; 5820 } 5821 return 0; 5822 } 5823 5824 /*---------------------------------------------------------------------------- 5825 | Returns 1 if the extended double-precision floating-point value `a' is 5826 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5827 | cause an exception. The comparison is performed according to the IEC/IEEE 5828 | Standard for Binary Floating-Point Arithmetic. 5829 *----------------------------------------------------------------------------*/ 5830 5831 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5832 { 5833 5834 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5835 float_raise(float_flag_invalid, status); 5836 return 0; 5837 } 5838 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5839 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5840 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5841 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5842 ) { 5843 if (floatx80_is_signaling_nan(a, status) 5844 || floatx80_is_signaling_nan(b, status)) { 5845 float_raise(float_flag_invalid, status); 5846 } 5847 return 0; 5848 } 5849 return 5850 ( a.low == b.low ) 5851 && ( ( a.high == b.high ) 5852 || ( ( a.low == 0 ) 5853 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5854 ); 5855 5856 } 5857 5858 /*---------------------------------------------------------------------------- 5859 | Returns 1 if the extended double-precision floating-point value `a' is less 5860 | than or equal to the corresponding value `b', and 0 
otherwise.  Quiet NaNs
| do not cause an exception.  Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag aNeg, bNeg;
    int hasNaN;

    /* Invalid encodings always raise invalid, even in the quiet variant. */
    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    hasNaN = (extractFloatx80Exp(a) == 0x7FFF)
             && ((uint64_t)(extractFloatx80Frac(a) << 1) != 0);
    if (!hasNaN) {
        hasNaN = (extractFloatx80Exp(b) == 0x7FFF)
                 && ((uint64_t)(extractFloatx80Frac(b) << 1) != 0);
    }
    if (hasNaN) {
        /* Only a signaling NaN raises invalid here. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    aNeg = extractFloatx80Sign(a);
    bNeg = extractFloatx80Sign(b);
    if (aNeg != bNeg) {
        /* Opposite signs: a <= b when a is the negative one, or when both
         * operands are zeros. */
        if (aNeg) {
            return 1;
        }
        return (((uint16_t)((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }
    /* Same sign: compare the raw encodings, reversed for negatives. */
    if (aNeg) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
| an exception.  Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5903 *----------------------------------------------------------------------------*/ 5904 5905 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5906 { 5907 flag aSign, bSign; 5908 5909 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5910 float_raise(float_flag_invalid, status); 5911 return 0; 5912 } 5913 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5914 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5915 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5916 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5917 ) { 5918 if (floatx80_is_signaling_nan(a, status) 5919 || floatx80_is_signaling_nan(b, status)) { 5920 float_raise(float_flag_invalid, status); 5921 } 5922 return 0; 5923 } 5924 aSign = extractFloatx80Sign( a ); 5925 bSign = extractFloatx80Sign( b ); 5926 if ( aSign != bSign ) { 5927 return 5928 aSign 5929 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5930 != 0 ); 5931 } 5932 return 5933 aSign ? lt128( b.high, b.low, a.high, a.low ) 5934 : lt128( a.high, a.low, b.high, b.low ); 5935 5936 } 5937 5938 /*---------------------------------------------------------------------------- 5939 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5940 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5941 | The comparison is performed according to the IEC/IEEE Standard for Binary 5942 | Floating-Point Arithmetic. 
5943 *----------------------------------------------------------------------------*/ 5944 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5945 { 5946 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5947 float_raise(float_flag_invalid, status); 5948 return 1; 5949 } 5950 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5951 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5952 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5953 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5954 ) { 5955 if (floatx80_is_signaling_nan(a, status) 5956 || floatx80_is_signaling_nan(b, status)) { 5957 float_raise(float_flag_invalid, status); 5958 } 5959 return 1; 5960 } 5961 return 0; 5962 } 5963 5964 /*---------------------------------------------------------------------------- 5965 | Returns the result of converting the quadruple-precision floating-point 5966 | value `a' to the 32-bit two's complement integer format. The conversion 5967 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5968 | Arithmetic---which means in particular that the conversion is rounded 5969 | according to the current rounding mode. If `a' is a NaN, the largest 5970 | positive integer is returned. Otherwise, if the conversion overflows, the 5971 | largest integer with the same sign as `a' is returned. 
5972 *----------------------------------------------------------------------------*/ 5973 5974 int32_t float128_to_int32(float128 a, float_status *status) 5975 { 5976 flag aSign; 5977 int32_t aExp, shiftCount; 5978 uint64_t aSig0, aSig1; 5979 5980 aSig1 = extractFloat128Frac1( a ); 5981 aSig0 = extractFloat128Frac0( a ); 5982 aExp = extractFloat128Exp( a ); 5983 aSign = extractFloat128Sign( a ); 5984 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5985 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5986 aSig0 |= ( aSig1 != 0 ); 5987 shiftCount = 0x4028 - aExp; 5988 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5989 return roundAndPackInt32(aSign, aSig0, status); 5990 5991 } 5992 5993 /*---------------------------------------------------------------------------- 5994 | Returns the result of converting the quadruple-precision floating-point 5995 | value `a' to the 32-bit two's complement integer format. The conversion 5996 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5997 | Arithmetic, except that the conversion is always rounded toward zero. If 5998 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5999 | conversion overflows, the largest integer with the same sign as `a' is 6000 | returned. 
6001 *----------------------------------------------------------------------------*/ 6002 6003 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 6004 { 6005 flag aSign; 6006 int32_t aExp, shiftCount; 6007 uint64_t aSig0, aSig1, savedASig; 6008 int32_t z; 6009 6010 aSig1 = extractFloat128Frac1( a ); 6011 aSig0 = extractFloat128Frac0( a ); 6012 aExp = extractFloat128Exp( a ); 6013 aSign = extractFloat128Sign( a ); 6014 aSig0 |= ( aSig1 != 0 ); 6015 if ( 0x401E < aExp ) { 6016 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 6017 goto invalid; 6018 } 6019 else if ( aExp < 0x3FFF ) { 6020 if (aExp || aSig0) { 6021 status->float_exception_flags |= float_flag_inexact; 6022 } 6023 return 0; 6024 } 6025 aSig0 |= LIT64( 0x0001000000000000 ); 6026 shiftCount = 0x402F - aExp; 6027 savedASig = aSig0; 6028 aSig0 >>= shiftCount; 6029 z = aSig0; 6030 if ( aSign ) z = - z; 6031 if ( ( z < 0 ) ^ aSign ) { 6032 invalid: 6033 float_raise(float_flag_invalid, status); 6034 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 6035 } 6036 if ( ( aSig0<<shiftCount ) != savedASig ) { 6037 status->float_exception_flags |= float_flag_inexact; 6038 } 6039 return z; 6040 6041 } 6042 6043 /*---------------------------------------------------------------------------- 6044 | Returns the result of converting the quadruple-precision floating-point 6045 | value `a' to the 64-bit two's complement integer format. The conversion 6046 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6047 | Arithmetic---which means in particular that the conversion is rounded 6048 | according to the current rounding mode. If `a' is a NaN, the largest 6049 | positive integer is returned. Otherwise, if the conversion overflows, the 6050 | largest integer with the same sign as `a' is returned. 
6051 *----------------------------------------------------------------------------*/ 6052 6053 int64_t float128_to_int64(float128 a, float_status *status) 6054 { 6055 flag aSign; 6056 int32_t aExp, shiftCount; 6057 uint64_t aSig0, aSig1; 6058 6059 aSig1 = extractFloat128Frac1( a ); 6060 aSig0 = extractFloat128Frac0( a ); 6061 aExp = extractFloat128Exp( a ); 6062 aSign = extractFloat128Sign( a ); 6063 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6064 shiftCount = 0x402F - aExp; 6065 if ( shiftCount <= 0 ) { 6066 if ( 0x403E < aExp ) { 6067 float_raise(float_flag_invalid, status); 6068 if ( ! aSign 6069 || ( ( aExp == 0x7FFF ) 6070 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 6071 ) 6072 ) { 6073 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6074 } 6075 return (int64_t) LIT64( 0x8000000000000000 ); 6076 } 6077 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 6078 } 6079 else { 6080 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 6081 } 6082 return roundAndPackInt64(aSign, aSig0, aSig1, status); 6083 6084 } 6085 6086 /*---------------------------------------------------------------------------- 6087 | Returns the result of converting the quadruple-precision floating-point 6088 | value `a' to the 64-bit two's complement integer format. The conversion 6089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6090 | Arithmetic, except that the conversion is always rounded toward zero. 6091 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 6092 | the conversion overflows, the largest integer with the same sign as `a' is 6093 | returned. 
6094 *----------------------------------------------------------------------------*/ 6095 6096 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 6097 { 6098 flag aSign; 6099 int32_t aExp, shiftCount; 6100 uint64_t aSig0, aSig1; 6101 int64_t z; 6102 6103 aSig1 = extractFloat128Frac1( a ); 6104 aSig0 = extractFloat128Frac0( a ); 6105 aExp = extractFloat128Exp( a ); 6106 aSign = extractFloat128Sign( a ); 6107 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 6108 shiftCount = aExp - 0x402F; 6109 if ( 0 < shiftCount ) { 6110 if ( 0x403E <= aExp ) { 6111 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 6112 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 6113 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 6114 if (aSig1) { 6115 status->float_exception_flags |= float_flag_inexact; 6116 } 6117 } 6118 else { 6119 float_raise(float_flag_invalid, status); 6120 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 6121 return LIT64( 0x7FFFFFFFFFFFFFFF ); 6122 } 6123 } 6124 return (int64_t) LIT64( 0x8000000000000000 ); 6125 } 6126 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 6127 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 6128 status->float_exception_flags |= float_flag_inexact; 6129 } 6130 } 6131 else { 6132 if ( aExp < 0x3FFF ) { 6133 if ( aExp | aSig0 | aSig1 ) { 6134 status->float_exception_flags |= float_flag_inexact; 6135 } 6136 return 0; 6137 } 6138 z = aSig0>>( - shiftCount ); 6139 if ( aSig1 6140 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 6141 status->float_exception_flags |= float_flag_inexact; 6142 } 6143 } 6144 if ( aSign ) z = - z; 6145 return z; 6146 6147 } 6148 6149 /*---------------------------------------------------------------------------- 6150 | Returns the result of converting the quadruple-precision floating-point value 6151 | `a' to the 64-bit unsigned integer format. 
The conversion is 6152 | performed according to the IEC/IEEE Standard for Binary Floating-Point 6153 | Arithmetic---which means in particular that the conversion is rounded 6154 | according to the current rounding mode. If `a' is a NaN, the largest 6155 | positive integer is returned. If the conversion overflows, the 6156 | largest unsigned integer is returned. If 'a' is negative, the value is 6157 | rounded and zero is returned; negative values that do not round to zero 6158 | will raise the inexact exception. 6159 *----------------------------------------------------------------------------*/ 6160 6161 uint64_t float128_to_uint64(float128 a, float_status *status) 6162 { 6163 flag aSign; 6164 int aExp; 6165 int shiftCount; 6166 uint64_t aSig0, aSig1; 6167 6168 aSig0 = extractFloat128Frac0(a); 6169 aSig1 = extractFloat128Frac1(a); 6170 aExp = extractFloat128Exp(a); 6171 aSign = extractFloat128Sign(a); 6172 if (aSign && (aExp > 0x3FFE)) { 6173 float_raise(float_flag_invalid, status); 6174 if (float128_is_any_nan(a)) { 6175 return LIT64(0xFFFFFFFFFFFFFFFF); 6176 } else { 6177 return 0; 6178 } 6179 } 6180 if (aExp) { 6181 aSig0 |= LIT64(0x0001000000000000); 6182 } 6183 shiftCount = 0x402F - aExp; 6184 if (shiftCount <= 0) { 6185 if (0x403E < aExp) { 6186 float_raise(float_flag_invalid, status); 6187 return LIT64(0xFFFFFFFFFFFFFFFF); 6188 } 6189 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 6190 } else { 6191 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 6192 } 6193 return roundAndPackUint64(aSign, aSig0, aSig1, status); 6194 } 6195 6196 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 6197 { 6198 uint64_t v; 6199 signed char current_rounding_mode = status->float_rounding_mode; 6200 6201 set_float_rounding_mode(float_round_to_zero, status); 6202 v = float128_to_uint64(a, status); 6203 set_float_rounding_mode(current_rounding_mode, status); 6204 6205 return v; 6206 } 6207 6208 
/*---------------------------------------------------------------------------- 6209 | Returns the result of converting the quadruple-precision floating-point 6210 | value `a' to the 32-bit unsigned integer format. The conversion 6211 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6212 | Arithmetic except that the conversion is always rounded toward zero. 6213 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 6214 | if the conversion overflows, the largest unsigned integer is returned. 6215 | If 'a' is negative, the value is rounded and zero is returned; negative 6216 | values that do not round to zero will raise the inexact exception. 6217 *----------------------------------------------------------------------------*/ 6218 6219 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 6220 { 6221 uint64_t v; 6222 uint32_t res; 6223 int old_exc_flags = get_float_exception_flags(status); 6224 6225 v = float128_to_uint64_round_to_zero(a, status); 6226 if (v > 0xffffffff) { 6227 res = 0xffffffff; 6228 } else { 6229 return v; 6230 } 6231 set_float_exception_flags(old_exc_flags, status); 6232 float_raise(float_flag_invalid, status); 6233 return res; 6234 } 6235 6236 /*---------------------------------------------------------------------------- 6237 | Returns the result of converting the quadruple-precision floating-point 6238 | value `a' to the single-precision floating-point format. The conversion 6239 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6240 | Arithmetic. 
6241 *----------------------------------------------------------------------------*/ 6242 6243 float32 float128_to_float32(float128 a, float_status *status) 6244 { 6245 flag aSign; 6246 int32_t aExp; 6247 uint64_t aSig0, aSig1; 6248 uint32_t zSig; 6249 6250 aSig1 = extractFloat128Frac1( a ); 6251 aSig0 = extractFloat128Frac0( a ); 6252 aExp = extractFloat128Exp( a ); 6253 aSign = extractFloat128Sign( a ); 6254 if ( aExp == 0x7FFF ) { 6255 if ( aSig0 | aSig1 ) { 6256 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6257 } 6258 return packFloat32( aSign, 0xFF, 0 ); 6259 } 6260 aSig0 |= ( aSig1 != 0 ); 6261 shift64RightJamming( aSig0, 18, &aSig0 ); 6262 zSig = aSig0; 6263 if ( aExp || zSig ) { 6264 zSig |= 0x40000000; 6265 aExp -= 0x3F81; 6266 } 6267 return roundAndPackFloat32(aSign, aExp, zSig, status); 6268 6269 } 6270 6271 /*---------------------------------------------------------------------------- 6272 | Returns the result of converting the quadruple-precision floating-point 6273 | value `a' to the double-precision floating-point format. The conversion 6274 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6275 | Arithmetic. 
6276 *----------------------------------------------------------------------------*/ 6277 6278 float64 float128_to_float64(float128 a, float_status *status) 6279 { 6280 flag aSign; 6281 int32_t aExp; 6282 uint64_t aSig0, aSig1; 6283 6284 aSig1 = extractFloat128Frac1( a ); 6285 aSig0 = extractFloat128Frac0( a ); 6286 aExp = extractFloat128Exp( a ); 6287 aSign = extractFloat128Sign( a ); 6288 if ( aExp == 0x7FFF ) { 6289 if ( aSig0 | aSig1 ) { 6290 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6291 } 6292 return packFloat64( aSign, 0x7FF, 0 ); 6293 } 6294 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6295 aSig0 |= ( aSig1 != 0 ); 6296 if ( aExp || aSig0 ) { 6297 aSig0 |= LIT64( 0x4000000000000000 ); 6298 aExp -= 0x3C01; 6299 } 6300 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6301 6302 } 6303 6304 /*---------------------------------------------------------------------------- 6305 | Returns the result of converting the quadruple-precision floating-point 6306 | value `a' to the extended double-precision floating-point format. The 6307 | conversion is performed according to the IEC/IEEE Standard for Binary 6308 | Floating-Point Arithmetic. 
6309 *----------------------------------------------------------------------------*/ 6310 6311 floatx80 float128_to_floatx80(float128 a, float_status *status) 6312 { 6313 flag aSign; 6314 int32_t aExp; 6315 uint64_t aSig0, aSig1; 6316 6317 aSig1 = extractFloat128Frac1( a ); 6318 aSig0 = extractFloat128Frac0( a ); 6319 aExp = extractFloat128Exp( a ); 6320 aSign = extractFloat128Sign( a ); 6321 if ( aExp == 0x7FFF ) { 6322 if ( aSig0 | aSig1 ) { 6323 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6324 } 6325 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6326 } 6327 if ( aExp == 0 ) { 6328 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6329 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6330 } 6331 else { 6332 aSig0 |= LIT64( 0x0001000000000000 ); 6333 } 6334 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6335 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6336 6337 } 6338 6339 /*---------------------------------------------------------------------------- 6340 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6341 | returns the result as a quadruple-precision floating-point value. The 6342 | operation is performed according to the IEC/IEEE Standard for Binary 6343 | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    /* lastBitMask selects the lowest integer-valued bit within the word
     * being rounded; roundBitsMask covers every bit below it. */
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    /* Case 1: the rounding position lies in the low word (or the value has
     * no fraction bits at all). */
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Nothing to round: propagate a NaN, otherwise return `a'. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /* The shift is split in two so that aExp == 0x402F (total shift 64)
         * yields lastBitMask == 0 instead of an over-wide shift; the
         * `else' arms below handle that case by adjusting z.high. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half a unit; on an exact tie clear the last bit so the
                 * result is even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The top bit of z.low is the half bit. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only positive values move away from zero. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values move away from zero. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Discard the fraction bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        /* Case 2: biased exponent below 0x3FFF; the result is a zero or a
         * one of the input's sign, chosen by the rounding mode. */
        if ( aExp < 0x3FFF ) {
            /* Zeros of either sign are returned unchanged and exact. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds to one;
                 * a zero fraction there is the tie and falls to zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            /* Remaining modes and cases give a zero with the input's sign. */
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* Case 3: the rounding position lies in the high word; the low word
         * only contributes as a sticky "non-zero" indication. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the bit pattern means the input was not an integer. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    /* zSig2 receives the bits shifted out during alignment and is passed on
     * to the final rounding step. */
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0x7FFF ) {
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent field: no implicit bit is added and the shift
             * distance is one less. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponent fields are zero: the raw sum is packed directly,
             * or replaced by a signed zero when flush_to_zero is set. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        /* Two implicit leading ones add up to 0x0002...; shift right once. */
        zSig2 = 0;
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: add the implicit bit to `a' and sum.  When `b' had
     * the larger exponent this bit stands in for b's implicit one, since the
     * sum is commutative. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past the implicit-bit position: round with zExp one lower. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}

/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Left-shift both significands by 14; the implicit bit is then
     * 0x4000000000000000 and the exponent is compensated (zExp - 14) at the
     * end. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus infinity is invalid. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        aExp = 1;
        bExp = 1;
    }
    /* Order the significands, high word first, to decide which to subtract. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    /* |a| > |b|: compute a - b, keeping zSign. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}

/*----------------------------------------------------------------------------
| Returns the result of adding the quadruple-precision floating-point values
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
6660 *----------------------------------------------------------------------------*/ 6661 6662 float128 float128_add(float128 a, float128 b, float_status *status) 6663 { 6664 flag aSign, bSign; 6665 6666 aSign = extractFloat128Sign( a ); 6667 bSign = extractFloat128Sign( b ); 6668 if ( aSign == bSign ) { 6669 return addFloat128Sigs(a, b, aSign, status); 6670 } 6671 else { 6672 return subFloat128Sigs(a, b, aSign, status); 6673 } 6674 6675 } 6676 6677 /*---------------------------------------------------------------------------- 6678 | Returns the result of subtracting the quadruple-precision floating-point 6679 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6680 | Standard for Binary Floating-Point Arithmetic. 6681 *----------------------------------------------------------------------------*/ 6682 6683 float128 float128_sub(float128 a, float128 b, float_status *status) 6684 { 6685 flag aSign, bSign; 6686 6687 aSign = extractFloat128Sign( a ); 6688 bSign = extractFloat128Sign( b ); 6689 if ( aSign == bSign ) { 6690 return subFloat128Sigs(a, b, aSign, status); 6691 } 6692 else { 6693 return addFloat128Sigs(a, b, aSign, status); 6694 } 6695 6696 } 6697 6698 /*---------------------------------------------------------------------------- 6699 | Returns the result of multiplying the quadruple-precision floating-point 6700 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6701 | Standard for Binary Floating-Point Arithmetic. 
6702 *----------------------------------------------------------------------------*/ 6703 6704 float128 float128_mul(float128 a, float128 b, float_status *status) 6705 { 6706 flag aSign, bSign, zSign; 6707 int32_t aExp, bExp, zExp; 6708 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6709 6710 aSig1 = extractFloat128Frac1( a ); 6711 aSig0 = extractFloat128Frac0( a ); 6712 aExp = extractFloat128Exp( a ); 6713 aSign = extractFloat128Sign( a ); 6714 bSig1 = extractFloat128Frac1( b ); 6715 bSig0 = extractFloat128Frac0( b ); 6716 bExp = extractFloat128Exp( b ); 6717 bSign = extractFloat128Sign( b ); 6718 zSign = aSign ^ bSign; 6719 if ( aExp == 0x7FFF ) { 6720 if ( ( aSig0 | aSig1 ) 6721 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6722 return propagateFloat128NaN(a, b, status); 6723 } 6724 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6725 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6726 } 6727 if ( bExp == 0x7FFF ) { 6728 if (bSig0 | bSig1) { 6729 return propagateFloat128NaN(a, b, status); 6730 } 6731 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6732 invalid: 6733 float_raise(float_flag_invalid, status); 6734 return float128_default_nan(status); 6735 } 6736 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6737 } 6738 if ( aExp == 0 ) { 6739 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6740 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6741 } 6742 if ( bExp == 0 ) { 6743 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6744 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6745 } 6746 zExp = aExp + bExp - 0x4000; 6747 aSig0 |= LIT64( 0x0001000000000000 ); 6748 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6749 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6750 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6751 zSig2 |= ( zSig3 != 0 ); 6752 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6753 shift128ExtraRightJamming( 6754 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6755 ++zExp; 6756 } 6757 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6758 6759 } 6760 6761 /*---------------------------------------------------------------------------- 6762 | Returns the result of dividing the quadruple-precision floating-point value 6763 | `a' by the corresponding value `b'. The operation is performed according to 6764 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6765 *----------------------------------------------------------------------------*/ 6766 6767 float128 float128_div(float128 a, float128 b, float_status *status) 6768 { 6769 flag aSign, bSign, zSign; 6770 int32_t aExp, bExp, zExp; 6771 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6772 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6773 6774 aSig1 = extractFloat128Frac1( a ); 6775 aSig0 = extractFloat128Frac0( a ); 6776 aExp = extractFloat128Exp( a ); 6777 aSign = extractFloat128Sign( a ); 6778 bSig1 = extractFloat128Frac1( b ); 6779 bSig0 = extractFloat128Frac0( b ); 6780 bExp = extractFloat128Exp( b ); 6781 bSign = extractFloat128Sign( b ); 6782 zSign = aSign ^ bSign; 6783 if ( aExp == 0x7FFF ) { 6784 if (aSig0 | aSig1) { 6785 return propagateFloat128NaN(a, b, status); 6786 } 6787 if ( bExp == 0x7FFF ) { 6788 if (bSig0 | bSig1) { 6789 return propagateFloat128NaN(a, b, status); 6790 } 6791 goto invalid; 6792 } 6793 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6794 } 6795 if ( bExp == 0x7FFF ) { 6796 if (bSig0 | bSig1) { 6797 return propagateFloat128NaN(a, b, status); 6798 } 6799 return packFloat128( zSign, 0, 0, 0 ); 6800 } 6801 if ( bExp == 0 ) { 6802 if ( ( bSig0 | bSig1 ) == 0 ) { 6803 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6804 invalid: 6805 float_raise(float_flag_invalid, status); 6806 return float128_default_nan(status); 6807 } 6808 float_raise(float_flag_divbyzero, status); 6809 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6810 } 6811 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6812 } 6813 if ( aExp == 0 ) { 6814 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6815 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6816 } 6817 zExp = aExp - bExp + 0x3FFD; 6818 shortShift128Left( 6819 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6820 shortShift128Left( 6821 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6822 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6823 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6824 ++zExp; 6825 } 6826 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6827 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6828 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6829 while ( (int64_t) rem0 < 0 ) { 6830 --zSig0; 6831 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6832 } 6833 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6834 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6835 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6836 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6837 while ( (int64_t) rem1 < 0 ) { 6838 --zSig1; 6839 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6840 } 6841 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6842 } 6843 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6844 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6845 6846 } 6847 6848 /*---------------------------------------------------------------------------- 6849 | Returns the remainder of the quadruple-precision floating-point value `a' 6850 | with respect to the corresponding value `b'. The operation is performed 6851 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6852 *----------------------------------------------------------------------------*/ 6853 6854 float128 float128_rem(float128 a, float128 b, float_status *status) 6855 { 6856 flag aSign, zSign; 6857 int32_t aExp, bExp, expDiff; 6858 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6859 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6860 int64_t sigMean0; 6861 6862 aSig1 = extractFloat128Frac1( a ); 6863 aSig0 = extractFloat128Frac0( a ); 6864 aExp = extractFloat128Exp( a ); 6865 aSign = extractFloat128Sign( a ); 6866 bSig1 = extractFloat128Frac1( b ); 6867 bSig0 = extractFloat128Frac0( b ); 6868 bExp = extractFloat128Exp( b ); 6869 if ( aExp == 0x7FFF ) { 6870 if ( ( aSig0 | aSig1 ) 6871 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6872 return propagateFloat128NaN(a, b, status); 6873 } 6874 goto invalid; 6875 } 6876 if ( bExp == 0x7FFF ) { 6877 if (bSig0 | bSig1) { 6878 return propagateFloat128NaN(a, b, status); 6879 } 6880 return a; 6881 } 6882 if ( bExp == 0 ) { 6883 if ( ( bSig0 | bSig1 ) == 0 ) { 6884 invalid: 6885 float_raise(float_flag_invalid, status); 6886 return float128_default_nan(status); 6887 } 6888 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6889 } 6890 if ( aExp == 0 ) { 6891 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6892 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6893 } 6894 expDiff = aExp - bExp; 6895 if ( expDiff < -1 ) return a; 6896 shortShift128Left( 6897 aSig0 | LIT64( 0x0001000000000000 ), 6898 aSig1, 6899 15 - ( expDiff < 0 ), 6900 &aSig0, 6901 &aSig1 6902 ); 6903 shortShift128Left( 6904 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6905 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6906 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6907 expDiff -= 64; 6908 while ( 0 < expDiff ) { 6909 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6910 q = ( 4 < q ) ? 
q - 4 : 0; 6911 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6912 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6913 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6914 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6915 expDiff -= 61; 6916 } 6917 if ( -64 < expDiff ) { 6918 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6919 q = ( 4 < q ) ? q - 4 : 0; 6920 q >>= - expDiff; 6921 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6922 expDiff += 52; 6923 if ( expDiff < 0 ) { 6924 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6925 } 6926 else { 6927 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6928 } 6929 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6930 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6931 } 6932 else { 6933 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6934 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6935 } 6936 do { 6937 alternateASig0 = aSig0; 6938 alternateASig1 = aSig1; 6939 ++q; 6940 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6941 } while ( 0 <= (int64_t) aSig0 ); 6942 add128( 6943 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6944 if ( ( sigMean0 < 0 ) 6945 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6946 aSig0 = alternateASig0; 6947 aSig1 = alternateASig1; 6948 } 6949 zSign = ( (int64_t) aSig0 < 0 ); 6950 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6951 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6952 status); 6953 } 6954 6955 /*---------------------------------------------------------------------------- 6956 | Returns the square root of the quadruple-precision floating-point value `a'. 6957 | The operation is performed according to the IEC/IEEE Standard for Binary 6958 | Floating-Point Arithmetic. 
6959 *----------------------------------------------------------------------------*/ 6960 6961 float128 float128_sqrt(float128 a, float_status *status) 6962 { 6963 flag aSign; 6964 int32_t aExp, zExp; 6965 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6966 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6967 6968 aSig1 = extractFloat128Frac1( a ); 6969 aSig0 = extractFloat128Frac0( a ); 6970 aExp = extractFloat128Exp( a ); 6971 aSign = extractFloat128Sign( a ); 6972 if ( aExp == 0x7FFF ) { 6973 if (aSig0 | aSig1) { 6974 return propagateFloat128NaN(a, a, status); 6975 } 6976 if ( ! aSign ) return a; 6977 goto invalid; 6978 } 6979 if ( aSign ) { 6980 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6981 invalid: 6982 float_raise(float_flag_invalid, status); 6983 return float128_default_nan(status); 6984 } 6985 if ( aExp == 0 ) { 6986 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6987 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6988 } 6989 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6990 aSig0 |= LIT64( 0x0001000000000000 ); 6991 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6992 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6993 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6994 doubleZSig0 = zSig0<<1; 6995 mul64To128( zSig0, zSig0, &term0, &term1 ); 6996 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6997 while ( (int64_t) rem0 < 0 ) { 6998 --zSig0; 6999 doubleZSig0 -= 2; 7000 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 7001 } 7002 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 7003 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 7004 if ( zSig1 == 0 ) zSig1 = 1; 7005 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 7006 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 7007 mul64To128( zSig1, zSig1, &term2, &term3 ); 7008 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 7009 while ( (int64_t) rem1 < 0 ) { 7010 --zSig1; 7011 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 7012 term3 |= 1; 7013 term2 |= doubleZSig0; 7014 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 7015 } 7016 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 7017 } 7018 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 7019 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 7020 7021 } 7022 7023 /*---------------------------------------------------------------------------- 7024 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7025 | the corresponding value `b', and 0 otherwise. The invalid exception is 7026 | raised if either operand is a NaN. Otherwise, the comparison is performed 7027 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7028 *----------------------------------------------------------------------------*/ 7029 7030 int float128_eq(float128 a, float128 b, float_status *status) 7031 { 7032 7033 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7034 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7035 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7036 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7037 ) { 7038 float_raise(float_flag_invalid, status); 7039 return 0; 7040 } 7041 return 7042 ( a.low == b.low ) 7043 && ( ( a.high == b.high ) 7044 || ( ( a.low == 0 ) 7045 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7046 ); 7047 7048 } 7049 7050 /*---------------------------------------------------------------------------- 7051 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7052 | or equal to the corresponding value `b', and 0 otherwise. The invalid 7053 | exception is raised if either operand is a NaN. The comparison is performed 7054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7055 *----------------------------------------------------------------------------*/ 7056 7057 int float128_le(float128 a, float128 b, float_status *status) 7058 { 7059 flag aSign, bSign; 7060 7061 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7062 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7063 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7064 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7065 ) { 7066 float_raise(float_flag_invalid, status); 7067 return 0; 7068 } 7069 aSign = extractFloat128Sign( a ); 7070 bSign = extractFloat128Sign( b ); 7071 if ( aSign != bSign ) { 7072 return 7073 aSign 7074 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7075 == 0 ); 7076 } 7077 return 7078 aSign ? le128( b.high, b.low, a.high, a.low ) 7079 : le128( a.high, a.low, b.high, b.low ); 7080 7081 } 7082 7083 /*---------------------------------------------------------------------------- 7084 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7085 | the corresponding value `b', and 0 otherwise. The invalid exception is 7086 | raised if either operand is a NaN. The comparison is performed according 7087 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 7088 *----------------------------------------------------------------------------*/ 7089 7090 int float128_lt(float128 a, float128 b, float_status *status) 7091 { 7092 flag aSign, bSign; 7093 7094 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7095 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7096 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7097 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7098 ) { 7099 float_raise(float_flag_invalid, status); 7100 return 0; 7101 } 7102 aSign = extractFloat128Sign( a ); 7103 bSign = extractFloat128Sign( b ); 7104 if ( aSign != bSign ) { 7105 return 7106 aSign 7107 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7108 != 0 ); 7109 } 7110 return 7111 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 7112 : lt128( a.high, a.low, b.high, b.low ); 7113 7114 } 7115 7116 /*---------------------------------------------------------------------------- 7117 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7118 | be compared, and 0 otherwise. The invalid exception is raised if either 7119 | operand is a NaN. The comparison is performed according to the IEC/IEEE 7120 | Standard for Binary Floating-Point Arithmetic. 7121 *----------------------------------------------------------------------------*/ 7122 7123 int float128_unordered(float128 a, float128 b, float_status *status) 7124 { 7125 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7126 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7127 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7128 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7129 ) { 7130 float_raise(float_flag_invalid, status); 7131 return 1; 7132 } 7133 return 0; 7134 } 7135 7136 /*---------------------------------------------------------------------------- 7137 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 7138 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7139 | exception. The comparison is performed according to the IEC/IEEE Standard 7140 | for Binary Floating-Point Arithmetic. 
7141 *----------------------------------------------------------------------------*/ 7142 7143 int float128_eq_quiet(float128 a, float128 b, float_status *status) 7144 { 7145 7146 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7147 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7148 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7149 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7150 ) { 7151 if (float128_is_signaling_nan(a, status) 7152 || float128_is_signaling_nan(b, status)) { 7153 float_raise(float_flag_invalid, status); 7154 } 7155 return 0; 7156 } 7157 return 7158 ( a.low == b.low ) 7159 && ( ( a.high == b.high ) 7160 || ( ( a.low == 0 ) 7161 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 7162 ); 7163 7164 } 7165 7166 /*---------------------------------------------------------------------------- 7167 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7168 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 7169 | cause an exception. Otherwise, the comparison is performed according to the 7170 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
7171 *----------------------------------------------------------------------------*/ 7172 7173 int float128_le_quiet(float128 a, float128 b, float_status *status) 7174 { 7175 flag aSign, bSign; 7176 7177 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7178 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7179 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7180 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7181 ) { 7182 if (float128_is_signaling_nan(a, status) 7183 || float128_is_signaling_nan(b, status)) { 7184 float_raise(float_flag_invalid, status); 7185 } 7186 return 0; 7187 } 7188 aSign = extractFloat128Sign( a ); 7189 bSign = extractFloat128Sign( b ); 7190 if ( aSign != bSign ) { 7191 return 7192 aSign 7193 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7194 == 0 ); 7195 } 7196 return 7197 aSign ? le128( b.high, b.low, a.high, a.low ) 7198 : le128( a.high, a.low, b.high, b.low ); 7199 7200 } 7201 7202 /*---------------------------------------------------------------------------- 7203 | Returns 1 if the quadruple-precision floating-point value `a' is less than 7204 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 7205 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 7206 | Standard for Binary Floating-Point Arithmetic. 
7207 *----------------------------------------------------------------------------*/ 7208 7209 int float128_lt_quiet(float128 a, float128 b, float_status *status) 7210 { 7211 flag aSign, bSign; 7212 7213 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7214 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7215 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7216 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7217 ) { 7218 if (float128_is_signaling_nan(a, status) 7219 || float128_is_signaling_nan(b, status)) { 7220 float_raise(float_flag_invalid, status); 7221 } 7222 return 0; 7223 } 7224 aSign = extractFloat128Sign( a ); 7225 bSign = extractFloat128Sign( b ); 7226 if ( aSign != bSign ) { 7227 return 7228 aSign 7229 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 7230 != 0 ); 7231 } 7232 return 7233 aSign ? lt128( b.high, b.low, a.high, a.low ) 7234 : lt128( a.high, a.low, b.high, b.low ); 7235 7236 } 7237 7238 /*---------------------------------------------------------------------------- 7239 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7240 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7241 | comparison is performed according to the IEC/IEEE Standard for Binary 7242 | Floating-Point Arithmetic. 
7243 *----------------------------------------------------------------------------*/ 7244 7245 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7246 { 7247 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7248 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7249 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7250 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7251 ) { 7252 if (float128_is_signaling_nan(a, status) 7253 || float128_is_signaling_nan(b, status)) { 7254 float_raise(float_flag_invalid, status); 7255 } 7256 return 1; 7257 } 7258 return 0; 7259 } 7260 7261 /* misc functions */ 7262 float32 uint32_to_float32(uint32_t a, float_status *status) 7263 { 7264 return int64_to_float32(a, status); 7265 } 7266 7267 float64 uint32_to_float64(uint32_t a, float_status *status) 7268 { 7269 return int64_to_float64(a, status); 7270 } 7271 7272 uint32_t float32_to_uint32(float32 a, float_status *status) 7273 { 7274 int64_t v; 7275 uint32_t res; 7276 int old_exc_flags = get_float_exception_flags(status); 7277 7278 v = float32_to_int64(a, status); 7279 if (v < 0) { 7280 res = 0; 7281 } else if (v > 0xffffffff) { 7282 res = 0xffffffff; 7283 } else { 7284 return v; 7285 } 7286 set_float_exception_flags(old_exc_flags, status); 7287 float_raise(float_flag_invalid, status); 7288 return res; 7289 } 7290 7291 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status) 7292 { 7293 int64_t v; 7294 uint32_t res; 7295 int old_exc_flags = get_float_exception_flags(status); 7296 7297 v = float32_to_int64_round_to_zero(a, status); 7298 if (v < 0) { 7299 res = 0; 7300 } else if (v > 0xffffffff) { 7301 res = 0xffffffff; 7302 } else { 7303 return v; 7304 } 7305 set_float_exception_flags(old_exc_flags, status); 7306 float_raise(float_flag_invalid, status); 7307 return res; 7308 } 7309 7310 int16_t float32_to_int16(float32 a, float_status *status) 7311 { 7312 int32_t v; 7313 int16_t res; 7314 int old_exc_flags = 
get_float_exception_flags(status); 7315 7316 v = float32_to_int32(a, status); 7317 if (v < -0x8000) { 7318 res = -0x8000; 7319 } else if (v > 0x7fff) { 7320 res = 0x7fff; 7321 } else { 7322 return v; 7323 } 7324 7325 set_float_exception_flags(old_exc_flags, status); 7326 float_raise(float_flag_invalid, status); 7327 return res; 7328 } 7329 7330 uint16_t float32_to_uint16(float32 a, float_status *status) 7331 { 7332 int32_t v; 7333 uint16_t res; 7334 int old_exc_flags = get_float_exception_flags(status); 7335 7336 v = float32_to_int32(a, status); 7337 if (v < 0) { 7338 res = 0; 7339 } else if (v > 0xffff) { 7340 res = 0xffff; 7341 } else { 7342 return v; 7343 } 7344 7345 set_float_exception_flags(old_exc_flags, status); 7346 float_raise(float_flag_invalid, status); 7347 return res; 7348 } 7349 7350 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status) 7351 { 7352 int64_t v; 7353 uint16_t res; 7354 int old_exc_flags = get_float_exception_flags(status); 7355 7356 v = float32_to_int64_round_to_zero(a, status); 7357 if (v < 0) { 7358 res = 0; 7359 } else if (v > 0xffff) { 7360 res = 0xffff; 7361 } else { 7362 return v; 7363 } 7364 set_float_exception_flags(old_exc_flags, status); 7365 float_raise(float_flag_invalid, status); 7366 return res; 7367 } 7368 7369 uint32_t float64_to_uint32(float64 a, float_status *status) 7370 { 7371 uint64_t v; 7372 uint32_t res; 7373 int old_exc_flags = get_float_exception_flags(status); 7374 7375 v = float64_to_uint64(a, status); 7376 if (v > 0xffffffff) { 7377 res = 0xffffffff; 7378 } else { 7379 return v; 7380 } 7381 set_float_exception_flags(old_exc_flags, status); 7382 float_raise(float_flag_invalid, status); 7383 return res; 7384 } 7385 7386 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status) 7387 { 7388 uint64_t v; 7389 uint32_t res; 7390 int old_exc_flags = get_float_exception_flags(status); 7391 7392 v = float64_to_uint64_round_to_zero(a, status); 7393 if (v > 0xffffffff) { 7394 res = 
0xffffffff; 7395 } else { 7396 return v; 7397 } 7398 set_float_exception_flags(old_exc_flags, status); 7399 float_raise(float_flag_invalid, status); 7400 return res; 7401 } 7402 7403 int16_t float64_to_int16(float64 a, float_status *status) 7404 { 7405 int64_t v; 7406 int16_t res; 7407 int old_exc_flags = get_float_exception_flags(status); 7408 7409 v = float64_to_int32(a, status); 7410 if (v < -0x8000) { 7411 res = -0x8000; 7412 } else if (v > 0x7fff) { 7413 res = 0x7fff; 7414 } else { 7415 return v; 7416 } 7417 7418 set_float_exception_flags(old_exc_flags, status); 7419 float_raise(float_flag_invalid, status); 7420 return res; 7421 } 7422 7423 uint16_t float64_to_uint16(float64 a, float_status *status) 7424 { 7425 int64_t v; 7426 uint16_t res; 7427 int old_exc_flags = get_float_exception_flags(status); 7428 7429 v = float64_to_int32(a, status); 7430 if (v < 0) { 7431 res = 0; 7432 } else if (v > 0xffff) { 7433 res = 0xffff; 7434 } else { 7435 return v; 7436 } 7437 7438 set_float_exception_flags(old_exc_flags, status); 7439 float_raise(float_flag_invalid, status); 7440 return res; 7441 } 7442 7443 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status) 7444 { 7445 int64_t v; 7446 uint16_t res; 7447 int old_exc_flags = get_float_exception_flags(status); 7448 7449 v = float64_to_int64_round_to_zero(a, status); 7450 if (v < 0) { 7451 res = 0; 7452 } else if (v > 0xffff) { 7453 res = 0xffff; 7454 } else { 7455 return v; 7456 } 7457 set_float_exception_flags(old_exc_flags, status); 7458 float_raise(float_flag_invalid, status); 7459 return res; 7460 } 7461 7462 /*---------------------------------------------------------------------------- 7463 | Returns the result of converting the double-precision floating-point value 7464 | `a' to the 64-bit unsigned integer format. 
The conversion is 7465 | performed according to the IEC/IEEE Standard for Binary Floating-Point 7466 | Arithmetic---which means in particular that the conversion is rounded 7467 | according to the current rounding mode. If `a' is a NaN, the largest 7468 | positive integer is returned. If the conversion overflows, the 7469 | largest unsigned integer is returned. If 'a' is negative, the value is 7470 | rounded and zero is returned; negative values that do not round to zero 7471 | will raise the inexact exception. 7472 *----------------------------------------------------------------------------*/ 7473 7474 uint64_t float64_to_uint64(float64 a, float_status *status) 7475 { 7476 flag aSign; 7477 int aExp; 7478 int shiftCount; 7479 uint64_t aSig, aSigExtra; 7480 a = float64_squash_input_denormal(a, status); 7481 7482 aSig = extractFloat64Frac(a); 7483 aExp = extractFloat64Exp(a); 7484 aSign = extractFloat64Sign(a); 7485 if (aSign && (aExp > 1022)) { 7486 float_raise(float_flag_invalid, status); 7487 if (float64_is_any_nan(a)) { 7488 return LIT64(0xFFFFFFFFFFFFFFFF); 7489 } else { 7490 return 0; 7491 } 7492 } 7493 if (aExp) { 7494 aSig |= LIT64(0x0010000000000000); 7495 } 7496 shiftCount = 0x433 - aExp; 7497 if (shiftCount <= 0) { 7498 if (0x43E < aExp) { 7499 float_raise(float_flag_invalid, status); 7500 return LIT64(0xFFFFFFFFFFFFFFFF); 7501 } 7502 aSigExtra = 0; 7503 aSig <<= -shiftCount; 7504 } else { 7505 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra); 7506 } 7507 return roundAndPackUint64(aSign, aSig, aSigExtra, status); 7508 } 7509 7510 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status) 7511 { 7512 signed char current_rounding_mode = status->float_rounding_mode; 7513 set_float_rounding_mode(float_round_to_zero, status); 7514 uint64_t v = float64_to_uint64(a, status); 7515 set_float_rounding_mode(current_rounding_mode, status); 7516 return v; 7517 } 7518 7519 #define COMPARE(s, nan_exp) \ 7520 static inline int float ## s ## 
_compare_internal(float ## s a, float ## s b,\ 7521 int is_quiet, float_status *status) \ 7522 { \ 7523 flag aSign, bSign; \ 7524 uint ## s ## _t av, bv; \ 7525 a = float ## s ## _squash_input_denormal(a, status); \ 7526 b = float ## s ## _squash_input_denormal(b, status); \ 7527 \ 7528 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ 7529 extractFloat ## s ## Frac( a ) ) || \ 7530 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ 7531 extractFloat ## s ## Frac( b ) )) { \ 7532 if (!is_quiet || \ 7533 float ## s ## _is_signaling_nan(a, status) || \ 7534 float ## s ## _is_signaling_nan(b, status)) { \ 7535 float_raise(float_flag_invalid, status); \ 7536 } \ 7537 return float_relation_unordered; \ 7538 } \ 7539 aSign = extractFloat ## s ## Sign( a ); \ 7540 bSign = extractFloat ## s ## Sign( b ); \ 7541 av = float ## s ## _val(a); \ 7542 bv = float ## s ## _val(b); \ 7543 if ( aSign != bSign ) { \ 7544 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \ 7545 /* zero case */ \ 7546 return float_relation_equal; \ 7547 } else { \ 7548 return 1 - (2 * aSign); \ 7549 } \ 7550 } else { \ 7551 if (av == bv) { \ 7552 return float_relation_equal; \ 7553 } else { \ 7554 return 1 - 2 * (aSign ^ ( av < bv )); \ 7555 } \ 7556 } \ 7557 } \ 7558 \ 7559 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \ 7560 { \ 7561 return float ## s ## _compare_internal(a, b, 0, status); \ 7562 } \ 7563 \ 7564 int float ## s ## _compare_quiet(float ## s a, float ## s b, \ 7565 float_status *status) \ 7566 { \ 7567 return float ## s ## _compare_internal(a, b, 1, status); \ 7568 } 7569 7570 COMPARE(32, 0xff) 7571 COMPARE(64, 0x7ff) 7572 7573 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7574 int is_quiet, float_status *status) 7575 { 7576 flag aSign, bSign; 7577 7578 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7579 float_raise(float_flag_invalid, status); 7580 return float_relation_unordered; 7581 } 7582 if (( ( 
extractFloatx80Exp( a ) == 0x7fff ) && 7583 ( extractFloatx80Frac( a )<<1 ) ) || 7584 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7585 ( extractFloatx80Frac( b )<<1 ) )) { 7586 if (!is_quiet || 7587 floatx80_is_signaling_nan(a, status) || 7588 floatx80_is_signaling_nan(b, status)) { 7589 float_raise(float_flag_invalid, status); 7590 } 7591 return float_relation_unordered; 7592 } 7593 aSign = extractFloatx80Sign( a ); 7594 bSign = extractFloatx80Sign( b ); 7595 if ( aSign != bSign ) { 7596 7597 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7598 ( ( a.low | b.low ) == 0 ) ) { 7599 /* zero case */ 7600 return float_relation_equal; 7601 } else { 7602 return 1 - (2 * aSign); 7603 } 7604 } else { 7605 if (a.low == b.low && a.high == b.high) { 7606 return float_relation_equal; 7607 } else { 7608 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7609 } 7610 } 7611 } 7612 7613 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7614 { 7615 return floatx80_compare_internal(a, b, 0, status); 7616 } 7617 7618 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7619 { 7620 return floatx80_compare_internal(a, b, 1, status); 7621 } 7622 7623 static inline int float128_compare_internal(float128 a, float128 b, 7624 int is_quiet, float_status *status) 7625 { 7626 flag aSign, bSign; 7627 7628 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7629 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7630 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7631 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7632 if (!is_quiet || 7633 float128_is_signaling_nan(a, status) || 7634 float128_is_signaling_nan(b, status)) { 7635 float_raise(float_flag_invalid, status); 7636 } 7637 return float_relation_unordered; 7638 } 7639 aSign = extractFloat128Sign( a ); 7640 bSign = extractFloat128Sign( b ); 7641 if ( aSign != bSign ) { 7642 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7643 /* zero case */ 
7644 return float_relation_equal; 7645 } else { 7646 return 1 - (2 * aSign); 7647 } 7648 } else { 7649 if (a.low == b.low && a.high == b.high) { 7650 return float_relation_equal; 7651 } else { 7652 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7653 } 7654 } 7655 } 7656 7657 int float128_compare(float128 a, float128 b, float_status *status) 7658 { 7659 return float128_compare_internal(a, b, 0, status); 7660 } 7661 7662 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7663 { 7664 return float128_compare_internal(a, b, 1, status); 7665 } 7666 7667 /* min() and max() functions. These can't be implemented as 7668 * 'compare and pick one input' because that would mishandle 7669 * NaNs and +0 vs -0. 7670 * 7671 * minnum() and maxnum() functions. These are similar to the min() 7672 * and max() functions but if one of the arguments is a QNaN and 7673 * the other is numerical then the numerical argument is returned. 7674 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 7675 * and maxNum() operations. min() and max() are the typical min/max 7676 * semantics provided by many CPUs which predate that specification. 7677 * 7678 * minnummag() and maxnummag() functions correspond to minNumMag() 7679 * and minNumMag() from the IEEE-754 2008. 
7680 */ 7681 #define MINMAX(s) \ 7682 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \ 7683 int ismin, int isieee, \ 7684 int ismag, \ 7685 float_status *status) \ 7686 { \ 7687 flag aSign, bSign; \ 7688 uint ## s ## _t av, bv, aav, abv; \ 7689 a = float ## s ## _squash_input_denormal(a, status); \ 7690 b = float ## s ## _squash_input_denormal(b, status); \ 7691 if (float ## s ## _is_any_nan(a) || \ 7692 float ## s ## _is_any_nan(b)) { \ 7693 if (isieee) { \ 7694 if (float ## s ## _is_quiet_nan(a, status) && \ 7695 !float ## s ##_is_any_nan(b)) { \ 7696 return b; \ 7697 } else if (float ## s ## _is_quiet_nan(b, status) && \ 7698 !float ## s ## _is_any_nan(a)) { \ 7699 return a; \ 7700 } \ 7701 } \ 7702 return propagateFloat ## s ## NaN(a, b, status); \ 7703 } \ 7704 aSign = extractFloat ## s ## Sign(a); \ 7705 bSign = extractFloat ## s ## Sign(b); \ 7706 av = float ## s ## _val(a); \ 7707 bv = float ## s ## _val(b); \ 7708 if (ismag) { \ 7709 aav = float ## s ## _abs(av); \ 7710 abv = float ## s ## _abs(bv); \ 7711 if (aav != abv) { \ 7712 if (ismin) { \ 7713 return (aav < abv) ? a : b; \ 7714 } else { \ 7715 return (aav < abv) ? b : a; \ 7716 } \ 7717 } \ 7718 } \ 7719 if (aSign != bSign) { \ 7720 if (ismin) { \ 7721 return aSign ? a : b; \ 7722 } else { \ 7723 return aSign ? b : a; \ 7724 } \ 7725 } else { \ 7726 if (ismin) { \ 7727 return (aSign ^ (av < bv)) ? a : b; \ 7728 } else { \ 7729 return (aSign ^ (av < bv)) ? 
b : a; \ 7730 } \ 7731 } \ 7732 } \ 7733 \ 7734 float ## s float ## s ## _min(float ## s a, float ## s b, \ 7735 float_status *status) \ 7736 { \ 7737 return float ## s ## _minmax(a, b, 1, 0, 0, status); \ 7738 } \ 7739 \ 7740 float ## s float ## s ## _max(float ## s a, float ## s b, \ 7741 float_status *status) \ 7742 { \ 7743 return float ## s ## _minmax(a, b, 0, 0, 0, status); \ 7744 } \ 7745 \ 7746 float ## s float ## s ## _minnum(float ## s a, float ## s b, \ 7747 float_status *status) \ 7748 { \ 7749 return float ## s ## _minmax(a, b, 1, 1, 0, status); \ 7750 } \ 7751 \ 7752 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \ 7753 float_status *status) \ 7754 { \ 7755 return float ## s ## _minmax(a, b, 0, 1, 0, status); \ 7756 } \ 7757 \ 7758 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \ 7759 float_status *status) \ 7760 { \ 7761 return float ## s ## _minmax(a, b, 1, 1, 1, status); \ 7762 } \ 7763 \ 7764 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \ 7765 float_status *status) \ 7766 { \ 7767 return float ## s ## _minmax(a, b, 0, 1, 1, status); \ 7768 } 7769 7770 MINMAX(32) 7771 MINMAX(64) 7772 7773 7774 /* Multiply A by 2 raised to the power N. 
*/ 7775 float32 float32_scalbn(float32 a, int n, float_status *status) 7776 { 7777 flag aSign; 7778 int16_t aExp; 7779 uint32_t aSig; 7780 7781 a = float32_squash_input_denormal(a, status); 7782 aSig = extractFloat32Frac( a ); 7783 aExp = extractFloat32Exp( a ); 7784 aSign = extractFloat32Sign( a ); 7785 7786 if ( aExp == 0xFF ) { 7787 if ( aSig ) { 7788 return propagateFloat32NaN(a, a, status); 7789 } 7790 return a; 7791 } 7792 if (aExp != 0) { 7793 aSig |= 0x00800000; 7794 } else if (aSig == 0) { 7795 return a; 7796 } else { 7797 aExp++; 7798 } 7799 7800 if (n > 0x200) { 7801 n = 0x200; 7802 } else if (n < -0x200) { 7803 n = -0x200; 7804 } 7805 7806 aExp += n - 1; 7807 aSig <<= 7; 7808 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status); 7809 } 7810 7811 float64 float64_scalbn(float64 a, int n, float_status *status) 7812 { 7813 flag aSign; 7814 int16_t aExp; 7815 uint64_t aSig; 7816 7817 a = float64_squash_input_denormal(a, status); 7818 aSig = extractFloat64Frac( a ); 7819 aExp = extractFloat64Exp( a ); 7820 aSign = extractFloat64Sign( a ); 7821 7822 if ( aExp == 0x7FF ) { 7823 if ( aSig ) { 7824 return propagateFloat64NaN(a, a, status); 7825 } 7826 return a; 7827 } 7828 if (aExp != 0) { 7829 aSig |= LIT64( 0x0010000000000000 ); 7830 } else if (aSig == 0) { 7831 return a; 7832 } else { 7833 aExp++; 7834 } 7835 7836 if (n > 0x1000) { 7837 n = 0x1000; 7838 } else if (n < -0x1000) { 7839 n = -0x1000; 7840 } 7841 7842 aExp += n - 1; 7843 aSig <<= 10; 7844 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status); 7845 } 7846 7847 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7848 { 7849 flag aSign; 7850 int32_t aExp; 7851 uint64_t aSig; 7852 7853 if (floatx80_invalid_encoding(a)) { 7854 float_raise(float_flag_invalid, status); 7855 return floatx80_default_nan(status); 7856 } 7857 aSig = extractFloatx80Frac( a ); 7858 aExp = extractFloatx80Exp( a ); 7859 aSign = extractFloatx80Sign( a ); 7860 7861 if ( aExp == 0x7FFF ) { 7862 if ( 
aSig<<1 ) { 7863 return propagateFloatx80NaN(a, a, status); 7864 } 7865 return a; 7866 } 7867 7868 if (aExp == 0) { 7869 if (aSig == 0) { 7870 return a; 7871 } 7872 aExp++; 7873 } 7874 7875 if (n > 0x10000) { 7876 n = 0x10000; 7877 } else if (n < -0x10000) { 7878 n = -0x10000; 7879 } 7880 7881 aExp += n; 7882 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7883 aSign, aExp, aSig, 0, status); 7884 } 7885 7886 float128 float128_scalbn(float128 a, int n, float_status *status) 7887 { 7888 flag aSign; 7889 int32_t aExp; 7890 uint64_t aSig0, aSig1; 7891 7892 aSig1 = extractFloat128Frac1( a ); 7893 aSig0 = extractFloat128Frac0( a ); 7894 aExp = extractFloat128Exp( a ); 7895 aSign = extractFloat128Sign( a ); 7896 if ( aExp == 0x7FFF ) { 7897 if ( aSig0 | aSig1 ) { 7898 return propagateFloat128NaN(a, a, status); 7899 } 7900 return a; 7901 } 7902 if (aExp != 0) { 7903 aSig0 |= LIT64( 0x0001000000000000 ); 7904 } else if (aSig0 == 0 && aSig1 == 0) { 7905 return a; 7906 } else { 7907 aExp++; 7908 } 7909 7910 if (n > 0x10000) { 7911 n = 0x10000; 7912 } else if (n < -0x10000) { 7913 n = -0x10000; 7914 } 7915 7916 aExp += n - 1; 7917 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7918 , status); 7919 7920 } 7921