1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 
38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Returns the fraction bits of the half-precision floating-point value `a'. 100 *----------------------------------------------------------------------------*/ 101 102 static inline uint32_t extractFloat16Frac(float16 a) 103 { 104 return float16_val(a) & 0x3ff; 105 } 106 107 /*---------------------------------------------------------------------------- 108 | Returns the exponent bits of the half-precision floating-point value `a'. 
109 *----------------------------------------------------------------------------*/ 110 111 static inline int extractFloat16Exp(float16 a) 112 { 113 return (float16_val(a) >> 10) & 0x1f; 114 } 115 116 /*---------------------------------------------------------------------------- 117 | Returns the fraction bits of the single-precision floating-point value `a'. 118 *----------------------------------------------------------------------------*/ 119 120 static inline uint32_t extractFloat32Frac(float32 a) 121 { 122 return float32_val(a) & 0x007FFFFF; 123 } 124 125 /*---------------------------------------------------------------------------- 126 | Returns the exponent bits of the single-precision floating-point value `a'. 127 *----------------------------------------------------------------------------*/ 128 129 static inline int extractFloat32Exp(float32 a) 130 { 131 return (float32_val(a) >> 23) & 0xFF; 132 } 133 134 /*---------------------------------------------------------------------------- 135 | Returns the sign bit of the single-precision floating-point value `a'. 136 *----------------------------------------------------------------------------*/ 137 138 static inline flag extractFloat32Sign(float32 a) 139 { 140 return float32_val(a) >> 31; 141 } 142 143 /*---------------------------------------------------------------------------- 144 | Returns the fraction bits of the double-precision floating-point value `a'. 145 *----------------------------------------------------------------------------*/ 146 147 static inline uint64_t extractFloat64Frac(float64 a) 148 { 149 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 150 } 151 152 /*---------------------------------------------------------------------------- 153 | Returns the exponent bits of the double-precision floating-point value `a'. 
154 *----------------------------------------------------------------------------*/ 155 156 static inline int extractFloat64Exp(float64 a) 157 { 158 return (float64_val(a) >> 52) & 0x7FF; 159 } 160 161 /*---------------------------------------------------------------------------- 162 | Returns the sign bit of the double-precision floating-point value `a'. 163 *----------------------------------------------------------------------------*/ 164 165 static inline flag extractFloat64Sign(float64 a) 166 { 167 return float64_val(a) >> 63; 168 } 169 170 /* 171 * Classify a floating point number. Everything above float_class_qnan 172 * is a NaN so cls >= float_class_qnan is any NaN. 173 */ 174 175 typedef enum __attribute__ ((__packed__)) { 176 float_class_unclassified, 177 float_class_zero, 178 float_class_normal, 179 float_class_inf, 180 float_class_qnan, /* all NaNs from here */ 181 float_class_snan, 182 } FloatClass; 183 184 /* Simple helpers for checking if, or what kind of, NaN we have */ 185 static inline __attribute__((unused)) bool is_nan(FloatClass c) 186 { 187 return unlikely(c >= float_class_qnan); 188 } 189 190 static inline __attribute__((unused)) bool is_snan(FloatClass c) 191 { 192 return c == float_class_snan; 193 } 194 195 static inline __attribute__((unused)) bool is_qnan(FloatClass c) 196 { 197 return c == float_class_qnan; 198 } 199 200 /* 201 * Structure holding all of the decomposed parts of a float. The 202 * exponent is unbiased and the fraction is normalized. All 203 * calculations are done with a 64 bit fraction and then rounded as 204 * appropriate for the final format. 205 * 206 * Thanks to the packed FloatClass a decent compiler should be able to 207 * fit the whole structure into registers and avoid using the stack 208 * for parameter passing. 
209 */ 210 211 typedef struct { 212 uint64_t frac; 213 int32_t exp; 214 FloatClass cls; 215 bool sign; 216 } FloatParts; 217 218 #define DECOMPOSED_BINARY_POINT (64 - 2) 219 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 220 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 221 222 /* Structure holding all of the relevant parameters for a format. 223 * exp_size: the size of the exponent field 224 * exp_bias: the offset applied to the exponent field 225 * exp_max: the maximum normalised exponent 226 * frac_size: the size of the fraction field 227 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 228 * The following are computed based the size of fraction 229 * frac_lsb: least significant bit of fraction 230 * frac_lsbm1: the bit below the least significant bit (for rounding) 231 * round_mask/roundeven_mask: masks used for rounding 232 * The following optional modifiers are available: 233 * arm_althp: handle ARM Alternative Half Precision 234 */ 235 typedef struct { 236 int exp_size; 237 int exp_bias; 238 int exp_max; 239 int frac_size; 240 int frac_shift; 241 uint64_t frac_lsb; 242 uint64_t frac_lsbm1; 243 uint64_t round_mask; 244 uint64_t roundeven_mask; 245 bool arm_althp; 246 } FloatFmt; 247 248 /* Expand fields based on the size of exponent and fraction */ 249 #define FLOAT_PARAMS(E, F) \ 250 .exp_size = E, \ 251 .exp_bias = ((1 << E) - 1) >> 1, \ 252 .exp_max = (1 << E) - 1, \ 253 .frac_size = F, \ 254 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 255 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 256 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 257 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 258 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 259 260 static const FloatFmt float16_params = { 261 FLOAT_PARAMS(5, 10) 262 }; 263 264 static const FloatFmt float16_params_ahp = { 265 FLOAT_PARAMS(5, 10), 266 .arm_althp = true 267 }; 268 269 static const 
FloatFmt float32_params = { 270 FLOAT_PARAMS(8, 23) 271 }; 272 273 static const FloatFmt float64_params = { 274 FLOAT_PARAMS(11, 52) 275 }; 276 277 /* Unpack a float to parts, but do not canonicalize. */ 278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 279 { 280 const int sign_pos = fmt.frac_size + fmt.exp_size; 281 282 return (FloatParts) { 283 .cls = float_class_unclassified, 284 .sign = extract64(raw, sign_pos, 1), 285 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 286 .frac = extract64(raw, 0, fmt.frac_size), 287 }; 288 } 289 290 static inline FloatParts float16_unpack_raw(float16 f) 291 { 292 return unpack_raw(float16_params, f); 293 } 294 295 static inline FloatParts float32_unpack_raw(float32 f) 296 { 297 return unpack_raw(float32_params, f); 298 } 299 300 static inline FloatParts float64_unpack_raw(float64 f) 301 { 302 return unpack_raw(float64_params, f); 303 } 304 305 /* Pack a float from parts, but do not canonicalize. */ 306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 307 { 308 const int sign_pos = fmt.frac_size + fmt.exp_size; 309 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 310 return deposit64(ret, sign_pos, 1, p.sign); 311 } 312 313 static inline float16 float16_pack_raw(FloatParts p) 314 { 315 return make_float16(pack_raw(float16_params, p)); 316 } 317 318 static inline float32 float32_pack_raw(FloatParts p) 319 { 320 return make_float32(pack_raw(float32_params, p)); 321 } 322 323 static inline float64 float64_pack_raw(FloatParts p) 324 { 325 return make_float64(pack_raw(float64_params, p)); 326 } 327 328 /*---------------------------------------------------------------------------- 329 | Functions and definitions to determine: (1) whether tininess for underflow 330 | is detected before or after rounding by default, (2) what (if anything) 331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 332 | from quiet NaNs, (4) the default generated quiet NaNs, and 
(5) how NaNs 333 | are propagated from function inputs to output. These details are target- 334 | specific. 335 *----------------------------------------------------------------------------*/ 336 #include "softfloat-specialize.h" 337 338 /* Canonicalize EXP and FRAC, setting CLS. */ 339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 340 float_status *status) 341 { 342 if (part.exp == parm->exp_max && !parm->arm_althp) { 343 if (part.frac == 0) { 344 part.cls = float_class_inf; 345 } else { 346 part.frac <<= parm->frac_shift; 347 part.cls = (parts_is_snan_frac(part.frac, status) 348 ? float_class_snan : float_class_qnan); 349 } 350 } else if (part.exp == 0) { 351 if (likely(part.frac == 0)) { 352 part.cls = float_class_zero; 353 } else if (status->flush_inputs_to_zero) { 354 float_raise(float_flag_input_denormal, status); 355 part.cls = float_class_zero; 356 part.frac = 0; 357 } else { 358 int shift = clz64(part.frac) - 1; 359 part.cls = float_class_normal; 360 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 361 part.frac <<= shift; 362 } 363 } else { 364 part.cls = float_class_normal; 365 part.exp -= parm->exp_bias; 366 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 367 } 368 return part; 369 } 370 371 /* Round and uncanonicalize a floating-point number by parts. There 372 * are FRAC_SHIFT bits that may require rounding at the bottom of the 373 * fraction; these bits will be removed. The exponent will be biased 374 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 
375 */ 376 377 static FloatParts round_canonical(FloatParts p, float_status *s, 378 const FloatFmt *parm) 379 { 380 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 381 const uint64_t round_mask = parm->round_mask; 382 const uint64_t roundeven_mask = parm->roundeven_mask; 383 const int exp_max = parm->exp_max; 384 const int frac_shift = parm->frac_shift; 385 uint64_t frac, inc; 386 int exp, flags = 0; 387 bool overflow_norm; 388 389 frac = p.frac; 390 exp = p.exp; 391 392 switch (p.cls) { 393 case float_class_normal: 394 switch (s->float_rounding_mode) { 395 case float_round_nearest_even: 396 overflow_norm = false; 397 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 398 break; 399 case float_round_ties_away: 400 overflow_norm = false; 401 inc = frac_lsbm1; 402 break; 403 case float_round_to_zero: 404 overflow_norm = true; 405 inc = 0; 406 break; 407 case float_round_up: 408 inc = p.sign ? 0 : round_mask; 409 overflow_norm = p.sign; 410 break; 411 case float_round_down: 412 inc = p.sign ? round_mask : 0; 413 overflow_norm = !p.sign; 414 break; 415 default: 416 g_assert_not_reached(); 417 } 418 419 exp += parm->exp_bias; 420 if (likely(exp > 0)) { 421 if (frac & round_mask) { 422 flags |= float_flag_inexact; 423 frac += inc; 424 if (frac & DECOMPOSED_OVERFLOW_BIT) { 425 frac >>= 1; 426 exp++; 427 } 428 } 429 frac >>= frac_shift; 430 431 if (parm->arm_althp) { 432 /* ARM Alt HP eschews Inf and NaN for a wider exponent. */ 433 if (unlikely(exp > exp_max)) { 434 /* Overflow. Return the maximum normal. 
*/ 435 flags = float_flag_invalid; 436 exp = exp_max; 437 frac = -1; 438 } 439 } else if (unlikely(exp >= exp_max)) { 440 flags |= float_flag_overflow | float_flag_inexact; 441 if (overflow_norm) { 442 exp = exp_max - 1; 443 frac = -1; 444 } else { 445 p.cls = float_class_inf; 446 goto do_inf; 447 } 448 } 449 } else if (s->flush_to_zero) { 450 flags |= float_flag_output_denormal; 451 p.cls = float_class_zero; 452 goto do_zero; 453 } else { 454 bool is_tiny = (s->float_detect_tininess 455 == float_tininess_before_rounding) 456 || (exp < 0) 457 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 458 459 shift64RightJamming(frac, 1 - exp, &frac); 460 if (frac & round_mask) { 461 /* Need to recompute round-to-even. */ 462 if (s->float_rounding_mode == float_round_nearest_even) { 463 inc = ((frac & roundeven_mask) != frac_lsbm1 464 ? frac_lsbm1 : 0); 465 } 466 flags |= float_flag_inexact; 467 frac += inc; 468 } 469 470 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 471 frac >>= frac_shift; 472 473 if (is_tiny && (flags & float_flag_inexact)) { 474 flags |= float_flag_underflow; 475 } 476 if (exp == 0 && frac == 0) { 477 p.cls = float_class_zero; 478 } 479 } 480 break; 481 482 case float_class_zero: 483 do_zero: 484 exp = 0; 485 frac = 0; 486 break; 487 488 case float_class_inf: 489 do_inf: 490 assert(!parm->arm_althp); 491 exp = exp_max; 492 frac = 0; 493 break; 494 495 case float_class_qnan: 496 case float_class_snan: 497 assert(!parm->arm_althp); 498 exp = exp_max; 499 frac >>= parm->frac_shift; 500 break; 501 502 default: 503 g_assert_not_reached(); 504 } 505 506 float_raise(flags, s); 507 p.exp = exp; 508 p.frac = frac; 509 return p; 510 } 511 512 /* Explicit FloatFmt version */ 513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s, 514 const FloatFmt *params) 515 { 516 return canonicalize(float16_unpack_raw(f), params, s); 517 } 518 519 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 520 { 521 return 
float16a_unpack_canonical(f, s, &float16_params); 522 } 523 524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s, 525 const FloatFmt *params) 526 { 527 return float16_pack_raw(round_canonical(p, s, params)); 528 } 529 530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 531 { 532 return float16a_round_pack_canonical(p, s, &float16_params); 533 } 534 535 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 536 { 537 return canonicalize(float32_unpack_raw(f), &float32_params, s); 538 } 539 540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 541 { 542 return float32_pack_raw(round_canonical(p, s, &float32_params)); 543 } 544 545 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 546 { 547 return canonicalize(float64_unpack_raw(f), &float64_params, s); 548 } 549 550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 551 { 552 return float64_pack_raw(round_canonical(p, s, &float64_params)); 553 } 554 555 static FloatParts return_nan(FloatParts a, float_status *s) 556 { 557 switch (a.cls) { 558 case float_class_snan: 559 s->float_exception_flags |= float_flag_invalid; 560 a = parts_silence_nan(a, s); 561 /* fall through */ 562 case float_class_qnan: 563 if (s->default_nan_mode) { 564 return parts_default_nan(s); 565 } 566 break; 567 568 default: 569 g_assert_not_reached(); 570 } 571 return a; 572 } 573 574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 575 { 576 if (is_snan(a.cls) || is_snan(b.cls)) { 577 s->float_exception_flags |= float_flag_invalid; 578 } 579 580 if (s->default_nan_mode) { 581 return parts_default_nan(s); 582 } else { 583 if (pickNaN(a.cls, b.cls, 584 a.frac > b.frac || 585 (a.frac == b.frac && a.sign < b.sign))) { 586 a = b; 587 } 588 if (is_snan(a.cls)) { 589 return parts_silence_nan(a, s); 590 } 591 } 592 return a; 593 } 594 595 static FloatParts pick_nan_muladd(FloatParts a, 
FloatParts b, FloatParts c, 596 bool inf_zero, float_status *s) 597 { 598 int which; 599 600 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 601 s->float_exception_flags |= float_flag_invalid; 602 } 603 604 which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s); 605 606 if (s->default_nan_mode) { 607 /* Note that this check is after pickNaNMulAdd so that function 608 * has an opportunity to set the Invalid flag. 609 */ 610 which = 3; 611 } 612 613 switch (which) { 614 case 0: 615 break; 616 case 1: 617 a = b; 618 break; 619 case 2: 620 a = c; 621 break; 622 case 3: 623 return parts_default_nan(s); 624 default: 625 g_assert_not_reached(); 626 } 627 628 if (is_snan(a.cls)) { 629 return parts_silence_nan(a, s); 630 } 631 return a; 632 } 633 634 /* 635 * Returns the result of adding or subtracting the values of the 636 * floating-point values `a' and `b'. The operation is performed 637 * according to the IEC/IEEE Standard for Binary Floating-Point 638 * Arithmetic. 639 */ 640 641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 642 float_status *s) 643 { 644 bool a_sign = a.sign; 645 bool b_sign = b.sign ^ subtract; 646 647 if (a_sign != b_sign) { 648 /* Subtraction */ 649 650 if (a.cls == float_class_normal && b.cls == float_class_normal) { 651 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 652 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 653 a.frac = a.frac - b.frac; 654 } else { 655 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 656 a.frac = b.frac - a.frac; 657 a.exp = b.exp; 658 a_sign ^= 1; 659 } 660 661 if (a.frac == 0) { 662 a.cls = float_class_zero; 663 a.sign = s->float_rounding_mode == float_round_down; 664 } else { 665 int shift = clz64(a.frac) - 1; 666 a.frac = a.frac << shift; 667 a.exp = a.exp - shift; 668 a.sign = a_sign; 669 } 670 return a; 671 } 672 if (is_nan(a.cls) || is_nan(b.cls)) { 673 return pick_nan(a, b, s); 674 } 675 if (a.cls == float_class_inf) { 676 if (b.cls == 
float_class_inf) { 677 float_raise(float_flag_invalid, s); 678 return parts_default_nan(s); 679 } 680 return a; 681 } 682 if (a.cls == float_class_zero && b.cls == float_class_zero) { 683 a.sign = s->float_rounding_mode == float_round_down; 684 return a; 685 } 686 if (a.cls == float_class_zero || b.cls == float_class_inf) { 687 b.sign = a_sign ^ 1; 688 return b; 689 } 690 if (b.cls == float_class_zero) { 691 return a; 692 } 693 } else { 694 /* Addition */ 695 if (a.cls == float_class_normal && b.cls == float_class_normal) { 696 if (a.exp > b.exp) { 697 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 698 } else if (a.exp < b.exp) { 699 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 700 a.exp = b.exp; 701 } 702 a.frac += b.frac; 703 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 704 shift64RightJamming(a.frac, 1, &a.frac); 705 a.exp += 1; 706 } 707 return a; 708 } 709 if (is_nan(a.cls) || is_nan(b.cls)) { 710 return pick_nan(a, b, s); 711 } 712 if (a.cls == float_class_inf || b.cls == float_class_zero) { 713 return a; 714 } 715 if (b.cls == float_class_inf || a.cls == float_class_zero) { 716 b.sign = b_sign; 717 return b; 718 } 719 } 720 g_assert_not_reached(); 721 } 722 723 /* 724 * Returns the result of adding or subtracting the floating-point 725 * values `a' and `b'. The operation is performed according to the 726 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
727 */ 728 729 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 730 float_status *status) 731 { 732 FloatParts pa = float16_unpack_canonical(a, status); 733 FloatParts pb = float16_unpack_canonical(b, status); 734 FloatParts pr = addsub_floats(pa, pb, false, status); 735 736 return float16_round_pack_canonical(pr, status); 737 } 738 739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 740 float_status *status) 741 { 742 FloatParts pa = float32_unpack_canonical(a, status); 743 FloatParts pb = float32_unpack_canonical(b, status); 744 FloatParts pr = addsub_floats(pa, pb, false, status); 745 746 return float32_round_pack_canonical(pr, status); 747 } 748 749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 750 float_status *status) 751 { 752 FloatParts pa = float64_unpack_canonical(a, status); 753 FloatParts pb = float64_unpack_canonical(b, status); 754 FloatParts pr = addsub_floats(pa, pb, false, status); 755 756 return float64_round_pack_canonical(pr, status); 757 } 758 759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 760 float_status *status) 761 { 762 FloatParts pa = float16_unpack_canonical(a, status); 763 FloatParts pb = float16_unpack_canonical(b, status); 764 FloatParts pr = addsub_floats(pa, pb, true, status); 765 766 return float16_round_pack_canonical(pr, status); 767 } 768 769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 770 float_status *status) 771 { 772 FloatParts pa = float32_unpack_canonical(a, status); 773 FloatParts pb = float32_unpack_canonical(b, status); 774 FloatParts pr = addsub_floats(pa, pb, true, status); 775 776 return float32_round_pack_canonical(pr, status); 777 } 778 779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 780 float_status *status) 781 { 782 FloatParts pa = float64_unpack_canonical(a, status); 783 FloatParts pb = float64_unpack_canonical(b, status); 784 FloatParts pr = addsub_floats(pa, pb, true, status); 785 
786 return float64_round_pack_canonical(pr, status); 787 } 788 789 /* 790 * Returns the result of multiplying the floating-point values `a' and 791 * `b'. The operation is performed according to the IEC/IEEE Standard 792 * for Binary Floating-Point Arithmetic. 793 */ 794 795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 796 { 797 bool sign = a.sign ^ b.sign; 798 799 if (a.cls == float_class_normal && b.cls == float_class_normal) { 800 uint64_t hi, lo; 801 int exp = a.exp + b.exp; 802 803 mul64To128(a.frac, b.frac, &hi, &lo); 804 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 805 if (lo & DECOMPOSED_OVERFLOW_BIT) { 806 shift64RightJamming(lo, 1, &lo); 807 exp += 1; 808 } 809 810 /* Re-use a */ 811 a.exp = exp; 812 a.sign = sign; 813 a.frac = lo; 814 return a; 815 } 816 /* handle all the NaN cases */ 817 if (is_nan(a.cls) || is_nan(b.cls)) { 818 return pick_nan(a, b, s); 819 } 820 /* Inf * Zero == NaN */ 821 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 822 (a.cls == float_class_zero && b.cls == float_class_inf)) { 823 s->float_exception_flags |= float_flag_invalid; 824 return parts_default_nan(s); 825 } 826 /* Multiply by 0 or Inf */ 827 if (a.cls == float_class_inf || a.cls == float_class_zero) { 828 a.sign = sign; 829 return a; 830 } 831 if (b.cls == float_class_inf || b.cls == float_class_zero) { 832 b.sign = sign; 833 return b; 834 } 835 g_assert_not_reached(); 836 } 837 838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 839 float_status *status) 840 { 841 FloatParts pa = float16_unpack_canonical(a, status); 842 FloatParts pb = float16_unpack_canonical(b, status); 843 FloatParts pr = mul_floats(pa, pb, status); 844 845 return float16_round_pack_canonical(pr, status); 846 } 847 848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 849 float_status *status) 850 { 851 FloatParts pa = float32_unpack_canonical(a, status); 852 FloatParts pb = float32_unpack_canonical(b, 
status); 853 FloatParts pr = mul_floats(pa, pb, status); 854 855 return float32_round_pack_canonical(pr, status); 856 } 857 858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 859 float_status *status) 860 { 861 FloatParts pa = float64_unpack_canonical(a, status); 862 FloatParts pb = float64_unpack_canonical(b, status); 863 FloatParts pr = mul_floats(pa, pb, status); 864 865 return float64_round_pack_canonical(pr, status); 866 } 867 868 /* 869 * Returns the result of multiplying the floating-point values `a' and 870 * `b' then adding 'c', with no intermediate rounding step after the 871 * multiplication. The operation is performed according to the 872 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 873 * The flags argument allows the caller to select negation of the 874 * addend, the intermediate product, or the final result. (The 875 * difference between this and having the caller do a separate 876 * negation is that negating externally will flip the sign bit on 877 * NaNs.) 878 */ 879 880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 881 int flags, float_status *s) 882 { 883 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 884 ((1 << float_class_inf) | (1 << float_class_zero)); 885 bool p_sign; 886 bool sign_flip = flags & float_muladd_negate_result; 887 FloatClass p_class; 888 uint64_t hi, lo; 889 int p_exp; 890 891 /* It is implementation-defined whether the cases of (0,inf,qnan) 892 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 893 * they return if they do), so we have to hand this information 894 * off to the target-specific pick-a-NaN routine. 
895 */ 896 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 897 return pick_nan_muladd(a, b, c, inf_zero, s); 898 } 899 900 if (inf_zero) { 901 s->float_exception_flags |= float_flag_invalid; 902 return parts_default_nan(s); 903 } 904 905 if (flags & float_muladd_negate_c) { 906 c.sign ^= 1; 907 } 908 909 p_sign = a.sign ^ b.sign; 910 911 if (flags & float_muladd_negate_product) { 912 p_sign ^= 1; 913 } 914 915 if (a.cls == float_class_inf || b.cls == float_class_inf) { 916 p_class = float_class_inf; 917 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 918 p_class = float_class_zero; 919 } else { 920 p_class = float_class_normal; 921 } 922 923 if (c.cls == float_class_inf) { 924 if (p_class == float_class_inf && p_sign != c.sign) { 925 s->float_exception_flags |= float_flag_invalid; 926 return parts_default_nan(s); 927 } else { 928 a.cls = float_class_inf; 929 a.sign = c.sign ^ sign_flip; 930 return a; 931 } 932 } 933 934 if (p_class == float_class_inf) { 935 a.cls = float_class_inf; 936 a.sign = p_sign ^ sign_flip; 937 return a; 938 } 939 940 if (p_class == float_class_zero) { 941 if (c.cls == float_class_zero) { 942 if (p_sign != c.sign) { 943 p_sign = s->float_rounding_mode == float_round_down; 944 } 945 c.sign = p_sign; 946 } else if (flags & float_muladd_halve_result) { 947 c.exp -= 1; 948 } 949 c.sign ^= sign_flip; 950 return c; 951 } 952 953 /* a & b should be normals now... */ 954 assert(a.cls == float_class_normal && 955 b.cls == float_class_normal); 956 957 p_exp = a.exp + b.exp; 958 959 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 960 * result. 
961 */ 962 mul64To128(a.frac, b.frac, &hi, &lo); 963 /* binary point now at bit 124 */ 964 965 /* check for overflow */ 966 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 967 shift128RightJamming(hi, lo, 1, &hi, &lo); 968 p_exp += 1; 969 } 970 971 /* + add/sub */ 972 if (c.cls == float_class_zero) { 973 /* move binary point back to 62 */ 974 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 975 } else { 976 int exp_diff = p_exp - c.exp; 977 if (p_sign == c.sign) { 978 /* Addition */ 979 if (exp_diff <= 0) { 980 shift128RightJamming(hi, lo, 981 DECOMPOSED_BINARY_POINT - exp_diff, 982 &hi, &lo); 983 lo += c.frac; 984 p_exp = c.exp; 985 } else { 986 uint64_t c_hi, c_lo; 987 /* shift c to the same binary point as the product (124) */ 988 c_hi = c.frac >> 2; 989 c_lo = 0; 990 shift128RightJamming(c_hi, c_lo, 991 exp_diff, 992 &c_hi, &c_lo); 993 add128(hi, lo, c_hi, c_lo, &hi, &lo); 994 /* move binary point back to 62 */ 995 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 996 } 997 998 if (lo & DECOMPOSED_OVERFLOW_BIT) { 999 shift64RightJamming(lo, 1, &lo); 1000 p_exp += 1; 1001 } 1002 1003 } else { 1004 /* Subtraction */ 1005 uint64_t c_hi, c_lo; 1006 /* make C binary point match product at bit 124 */ 1007 c_hi = c.frac >> 2; 1008 c_lo = 0; 1009 1010 if (exp_diff <= 0) { 1011 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1012 if (exp_diff == 0 1013 && 1014 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1015 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1016 } else { 1017 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1018 p_sign ^= 1; 1019 p_exp = c.exp; 1020 } 1021 } else { 1022 shift128RightJamming(c_hi, c_lo, 1023 exp_diff, 1024 &c_hi, &c_lo); 1025 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1026 } 1027 1028 if (hi == 0 && lo == 0) { 1029 a.cls = float_class_zero; 1030 a.sign = s->float_rounding_mode == float_round_down; 1031 a.sign ^= sign_flip; 1032 return a; 1033 } else { 1034 int shift; 1035 if (hi != 0) { 1036 shift = clz64(hi); 1037 } 
else { 1038 shift = clz64(lo) + 64; 1039 } 1040 /* Normalizing to a binary point of 124 is the 1041 correct adjust for the exponent. However since we're 1042 shifting, we might as well put the binary point back 1043 at 62 where we really want it. Therefore shift as 1044 if we're leaving 1 bit at the top of the word, but 1045 adjust the exponent as if we're leaving 3 bits. */ 1046 shift -= 1; 1047 if (shift >= 64) { 1048 lo = lo << (shift - 64); 1049 } else { 1050 hi = (hi << shift) | (lo >> (64 - shift)); 1051 lo = hi | ((lo << shift) != 0); 1052 } 1053 p_exp -= shift - 2; 1054 } 1055 } 1056 } 1057 1058 if (flags & float_muladd_halve_result) { 1059 p_exp -= 1; 1060 } 1061 1062 /* finally prepare our result */ 1063 a.cls = float_class_normal; 1064 a.sign = p_sign ^ sign_flip; 1065 a.exp = p_exp; 1066 a.frac = lo; 1067 1068 return a; 1069 } 1070 1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1072 int flags, float_status *status) 1073 { 1074 FloatParts pa = float16_unpack_canonical(a, status); 1075 FloatParts pb = float16_unpack_canonical(b, status); 1076 FloatParts pc = float16_unpack_canonical(c, status); 1077 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1078 1079 return float16_round_pack_canonical(pr, status); 1080 } 1081 1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1083 int flags, float_status *status) 1084 { 1085 FloatParts pa = float32_unpack_canonical(a, status); 1086 FloatParts pb = float32_unpack_canonical(b, status); 1087 FloatParts pc = float32_unpack_canonical(c, status); 1088 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1089 1090 return float32_round_pack_canonical(pr, status); 1091 } 1092 1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1094 int flags, float_status *status) 1095 { 1096 FloatParts pa = float64_unpack_canonical(a, status); 1097 FloatParts pb = float64_unpack_canonical(b, status); 1098 FloatParts pc 
= float64_unpack_canonical(c, status); 1099 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1100 1101 return float64_round_pack_canonical(pr, status); 1102 } 1103 1104 /* 1105 * Returns the result of dividing the floating-point value `a' by the 1106 * corresponding value `b'. The operation is performed according to 1107 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1108 */ 1109 1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1111 { 1112 bool sign = a.sign ^ b.sign; 1113 1114 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1115 uint64_t temp_lo, temp_hi; 1116 int exp = a.exp - b.exp; 1117 if (a.frac < b.frac) { 1118 exp -= 1; 1119 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1120 &temp_hi, &temp_lo); 1121 } else { 1122 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1123 &temp_hi, &temp_lo); 1124 } 1125 /* LSB of quot is set if inexact which roundandpack will use 1126 * to set flags. Yet again we re-use a for the result */ 1127 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1128 a.sign = sign; 1129 a.exp = exp; 1130 return a; 1131 } 1132 /* handle all the NaN cases */ 1133 if (is_nan(a.cls) || is_nan(b.cls)) { 1134 return pick_nan(a, b, s); 1135 } 1136 /* 0/0 or Inf/Inf */ 1137 if (a.cls == b.cls 1138 && 1139 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1140 s->float_exception_flags |= float_flag_invalid; 1141 return parts_default_nan(s); 1142 } 1143 /* Inf / x or 0 / x */ 1144 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1145 a.sign = sign; 1146 return a; 1147 } 1148 /* Div 0 => Inf */ 1149 if (b.cls == float_class_zero) { 1150 s->float_exception_flags |= float_flag_divbyzero; 1151 a.cls = float_class_inf; 1152 a.sign = sign; 1153 return a; 1154 } 1155 /* Div by Inf */ 1156 if (b.cls == float_class_inf) { 1157 a.cls = float_class_zero; 1158 a.sign = sign; 1159 return a; 1160 } 1161 g_assert_not_reached(); 1162 } 1163 1164 float16 
float16_div(float16 a, float16 b, float_status *status) 1165 { 1166 FloatParts pa = float16_unpack_canonical(a, status); 1167 FloatParts pb = float16_unpack_canonical(b, status); 1168 FloatParts pr = div_floats(pa, pb, status); 1169 1170 return float16_round_pack_canonical(pr, status); 1171 } 1172 1173 float32 float32_div(float32 a, float32 b, float_status *status) 1174 { 1175 FloatParts pa = float32_unpack_canonical(a, status); 1176 FloatParts pb = float32_unpack_canonical(b, status); 1177 FloatParts pr = div_floats(pa, pb, status); 1178 1179 return float32_round_pack_canonical(pr, status); 1180 } 1181 1182 float64 float64_div(float64 a, float64 b, float_status *status) 1183 { 1184 FloatParts pa = float64_unpack_canonical(a, status); 1185 FloatParts pb = float64_unpack_canonical(b, status); 1186 FloatParts pr = div_floats(pa, pb, status); 1187 1188 return float64_round_pack_canonical(pr, status); 1189 } 1190 1191 /* 1192 * Float to Float conversions 1193 * 1194 * Returns the result of converting one float format to another. The 1195 * conversion is performed according to the IEC/IEEE Standard for 1196 * Binary Floating-Point Arithmetic. 1197 * 1198 * The float_to_float helper only needs to take care of raising 1199 * invalid exceptions and handling the conversion on NaNs. 1200 */ 1201 1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf, 1203 float_status *s) 1204 { 1205 if (dstf->arm_althp) { 1206 switch (a.cls) { 1207 case float_class_qnan: 1208 case float_class_snan: 1209 /* There is no NaN in the destination format. Raise Invalid 1210 * and return a zero with the sign of the input NaN. 1211 */ 1212 s->float_exception_flags |= float_flag_invalid; 1213 a.cls = float_class_zero; 1214 a.frac = 0; 1215 a.exp = 0; 1216 break; 1217 1218 case float_class_inf: 1219 /* There is no Inf in the destination format. Raise Invalid 1220 * and return the maximum normal with the correct sign. 
1221 */ 1222 s->float_exception_flags |= float_flag_invalid; 1223 a.cls = float_class_normal; 1224 a.exp = dstf->exp_max; 1225 a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift; 1226 break; 1227 1228 default: 1229 break; 1230 } 1231 } else if (is_nan(a.cls)) { 1232 if (is_snan(a.cls)) { 1233 s->float_exception_flags |= float_flag_invalid; 1234 a = parts_silence_nan(a, s); 1235 } 1236 if (s->default_nan_mode) { 1237 return parts_default_nan(s); 1238 } 1239 } 1240 return a; 1241 } 1242 1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s) 1244 { 1245 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1246 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1247 FloatParts pr = float_to_float(p, &float32_params, s); 1248 return float32_round_pack_canonical(pr, s); 1249 } 1250 1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s) 1252 { 1253 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1254 FloatParts p = float16a_unpack_canonical(a, s, fmt16); 1255 FloatParts pr = float_to_float(p, &float64_params, s); 1256 return float64_round_pack_canonical(pr, s); 1257 } 1258 1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s) 1260 { 1261 const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp; 1262 FloatParts p = float32_unpack_canonical(a, s); 1263 FloatParts pr = float_to_float(p, fmt16, s); 1264 return float16a_round_pack_canonical(pr, s, fmt16); 1265 } 1266 1267 float64 float32_to_float64(float32 a, float_status *s) 1268 { 1269 FloatParts p = float32_unpack_canonical(a, s); 1270 FloatParts pr = float_to_float(p, &float64_params, s); 1271 return float64_round_pack_canonical(pr, s); 1272 } 1273 1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s) 1275 { 1276 const FloatFmt *fmt16 = ieee ? 
&float16_params : &float16_params_ahp; 1277 FloatParts p = float64_unpack_canonical(a, s); 1278 FloatParts pr = float_to_float(p, fmt16, s); 1279 return float16a_round_pack_canonical(pr, s, fmt16); 1280 } 1281 1282 float32 float64_to_float32(float64 a, float_status *s) 1283 { 1284 FloatParts p = float64_unpack_canonical(a, s); 1285 FloatParts pr = float_to_float(p, &float32_params, s); 1286 return float32_round_pack_canonical(pr, s); 1287 } 1288 1289 /* 1290 * Rounds the floating-point value `a' to an integer, and returns the 1291 * result as a floating-point value. The operation is performed 1292 * according to the IEC/IEEE Standard for Binary Floating-Point 1293 * Arithmetic. 1294 */ 1295 1296 static FloatParts round_to_int(FloatParts a, int rmode, 1297 int scale, float_status *s) 1298 { 1299 switch (a.cls) { 1300 case float_class_qnan: 1301 case float_class_snan: 1302 return return_nan(a, s); 1303 1304 case float_class_zero: 1305 case float_class_inf: 1306 /* already "integral" */ 1307 break; 1308 1309 case float_class_normal: 1310 scale = MIN(MAX(scale, -0x10000), 0x10000); 1311 a.exp += scale; 1312 1313 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1314 /* already integral */ 1315 break; 1316 } 1317 if (a.exp < 0) { 1318 bool one; 1319 /* all fractional */ 1320 s->float_exception_flags |= float_flag_inexact; 1321 switch (rmode) { 1322 case float_round_nearest_even: 1323 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1324 break; 1325 case float_round_ties_away: 1326 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1327 break; 1328 case float_round_to_zero: 1329 one = false; 1330 break; 1331 case float_round_up: 1332 one = !a.sign; 1333 break; 1334 case float_round_down: 1335 one = a.sign; 1336 break; 1337 default: 1338 g_assert_not_reached(); 1339 } 1340 1341 if (one) { 1342 a.frac = DECOMPOSED_IMPLICIT_BIT; 1343 a.exp = 0; 1344 } else { 1345 a.cls = float_class_zero; 1346 } 1347 } else { 1348 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1349 
uint64_t frac_lsbm1 = frac_lsb >> 1; 1350 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1351 uint64_t rnd_mask = rnd_even_mask >> 1; 1352 uint64_t inc; 1353 1354 switch (rmode) { 1355 case float_round_nearest_even: 1356 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1357 break; 1358 case float_round_ties_away: 1359 inc = frac_lsbm1; 1360 break; 1361 case float_round_to_zero: 1362 inc = 0; 1363 break; 1364 case float_round_up: 1365 inc = a.sign ? 0 : rnd_mask; 1366 break; 1367 case float_round_down: 1368 inc = a.sign ? rnd_mask : 0; 1369 break; 1370 default: 1371 g_assert_not_reached(); 1372 } 1373 1374 if (a.frac & rnd_mask) { 1375 s->float_exception_flags |= float_flag_inexact; 1376 a.frac += inc; 1377 a.frac &= ~rnd_mask; 1378 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1379 a.frac >>= 1; 1380 a.exp++; 1381 } 1382 } 1383 } 1384 break; 1385 default: 1386 g_assert_not_reached(); 1387 } 1388 return a; 1389 } 1390 1391 float16 float16_round_to_int(float16 a, float_status *s) 1392 { 1393 FloatParts pa = float16_unpack_canonical(a, s); 1394 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1395 return float16_round_pack_canonical(pr, s); 1396 } 1397 1398 float32 float32_round_to_int(float32 a, float_status *s) 1399 { 1400 FloatParts pa = float32_unpack_canonical(a, s); 1401 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1402 return float32_round_pack_canonical(pr, s); 1403 } 1404 1405 float64 float64_round_to_int(float64 a, float_status *s) 1406 { 1407 FloatParts pa = float64_unpack_canonical(a, s); 1408 FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s); 1409 return float64_round_pack_canonical(pr, s); 1410 } 1411 1412 /* 1413 * Returns the result of converting the floating-point value `a' to 1414 * the two's complement integer format. 
The conversion is performed 1415 * according to the IEC/IEEE Standard for Binary Floating-Point 1416 * Arithmetic---which means in particular that the conversion is 1417 * rounded according to the current rounding mode. If `a' is a NaN, 1418 * the largest positive integer is returned. Otherwise, if the 1419 * conversion overflows, the largest integer with the same sign as `a' 1420 * is returned. 1421 */ 1422 1423 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale, 1424 int64_t min, int64_t max, 1425 float_status *s) 1426 { 1427 uint64_t r; 1428 int orig_flags = get_float_exception_flags(s); 1429 FloatParts p = round_to_int(in, rmode, scale, s); 1430 1431 switch (p.cls) { 1432 case float_class_snan: 1433 case float_class_qnan: 1434 s->float_exception_flags = orig_flags | float_flag_invalid; 1435 return max; 1436 case float_class_inf: 1437 s->float_exception_flags = orig_flags | float_flag_invalid; 1438 return p.sign ? min : max; 1439 case float_class_zero: 1440 return 0; 1441 case float_class_normal: 1442 if (p.exp < DECOMPOSED_BINARY_POINT) { 1443 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1444 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1445 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1446 } else { 1447 r = UINT64_MAX; 1448 } 1449 if (p.sign) { 1450 if (r <= -(uint64_t) min) { 1451 return -r; 1452 } else { 1453 s->float_exception_flags = orig_flags | float_flag_invalid; 1454 return min; 1455 } 1456 } else { 1457 if (r <= max) { 1458 return r; 1459 } else { 1460 s->float_exception_flags = orig_flags | float_flag_invalid; 1461 return max; 1462 } 1463 } 1464 default: 1465 g_assert_not_reached(); 1466 } 1467 } 1468 1469 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale, 1470 float_status *s) 1471 { 1472 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1473 rmode, scale, INT16_MIN, INT16_MAX, s); 1474 } 1475 1476 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale, 1477 float_status *s) 1478 { 
1479 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1480 rmode, scale, INT32_MIN, INT32_MAX, s); 1481 } 1482 1483 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale, 1484 float_status *s) 1485 { 1486 return round_to_int_and_pack(float16_unpack_canonical(a, s), 1487 rmode, scale, INT64_MIN, INT64_MAX, s); 1488 } 1489 1490 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale, 1491 float_status *s) 1492 { 1493 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1494 rmode, scale, INT16_MIN, INT16_MAX, s); 1495 } 1496 1497 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale, 1498 float_status *s) 1499 { 1500 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1501 rmode, scale, INT32_MIN, INT32_MAX, s); 1502 } 1503 1504 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale, 1505 float_status *s) 1506 { 1507 return round_to_int_and_pack(float32_unpack_canonical(a, s), 1508 rmode, scale, INT64_MIN, INT64_MAX, s); 1509 } 1510 1511 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale, 1512 float_status *s) 1513 { 1514 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1515 rmode, scale, INT16_MIN, INT16_MAX, s); 1516 } 1517 1518 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale, 1519 float_status *s) 1520 { 1521 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1522 rmode, scale, INT32_MIN, INT32_MAX, s); 1523 } 1524 1525 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale, 1526 float_status *s) 1527 { 1528 return round_to_int_and_pack(float64_unpack_canonical(a, s), 1529 rmode, scale, INT64_MIN, INT64_MAX, s); 1530 } 1531 1532 int16_t float16_to_int16(float16 a, float_status *s) 1533 { 1534 return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1535 } 1536 1537 int32_t float16_to_int32(float16 a, float_status *s) 1538 { 1539 return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1540 } 1541 1542 int64_t 
float16_to_int64(float16 a, float_status *s) 1543 { 1544 return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1545 } 1546 1547 int16_t float32_to_int16(float32 a, float_status *s) 1548 { 1549 return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1550 } 1551 1552 int32_t float32_to_int32(float32 a, float_status *s) 1553 { 1554 return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1555 } 1556 1557 int64_t float32_to_int64(float32 a, float_status *s) 1558 { 1559 return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1560 } 1561 1562 int16_t float64_to_int16(float64 a, float_status *s) 1563 { 1564 return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s); 1565 } 1566 1567 int32_t float64_to_int32(float64 a, float_status *s) 1568 { 1569 return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s); 1570 } 1571 1572 int64_t float64_to_int64(float64 a, float_status *s) 1573 { 1574 return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s); 1575 } 1576 1577 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s) 1578 { 1579 return float16_to_int16_scalbn(a, float_round_to_zero, 0, s); 1580 } 1581 1582 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s) 1583 { 1584 return float16_to_int32_scalbn(a, float_round_to_zero, 0, s); 1585 } 1586 1587 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s) 1588 { 1589 return float16_to_int64_scalbn(a, float_round_to_zero, 0, s); 1590 } 1591 1592 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s) 1593 { 1594 return float32_to_int16_scalbn(a, float_round_to_zero, 0, s); 1595 } 1596 1597 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s) 1598 { 1599 return float32_to_int32_scalbn(a, float_round_to_zero, 0, s); 1600 } 1601 1602 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s) 1603 { 1604 return float32_to_int64_scalbn(a, float_round_to_zero, 0, s); 1605 } 1606 1607 int16_t 
float64_to_int16_round_to_zero(float64 a, float_status *s) 1608 { 1609 return float64_to_int16_scalbn(a, float_round_to_zero, 0, s); 1610 } 1611 1612 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s) 1613 { 1614 return float64_to_int32_scalbn(a, float_round_to_zero, 0, s); 1615 } 1616 1617 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s) 1618 { 1619 return float64_to_int64_scalbn(a, float_round_to_zero, 0, s); 1620 } 1621 1622 /* 1623 * Returns the result of converting the floating-point value `a' to 1624 * the unsigned integer format. The conversion is performed according 1625 * to the IEC/IEEE Standard for Binary Floating-Point 1626 * Arithmetic---which means in particular that the conversion is 1627 * rounded according to the current rounding mode. If `a' is a NaN, 1628 * the largest unsigned integer is returned. Otherwise, if the 1629 * conversion overflows, the largest unsigned integer is returned. If 1630 * the 'a' is negative, the result is rounded and zero is returned; 1631 * values that do not round to zero will raise the inexact exception 1632 * flag. 1633 */ 1634 1635 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale, 1636 uint64_t max, float_status *s) 1637 { 1638 int orig_flags = get_float_exception_flags(s); 1639 FloatParts p = round_to_int(in, rmode, scale, s); 1640 uint64_t r; 1641 1642 switch (p.cls) { 1643 case float_class_snan: 1644 case float_class_qnan: 1645 s->float_exception_flags = orig_flags | float_flag_invalid; 1646 return max; 1647 case float_class_inf: 1648 s->float_exception_flags = orig_flags | float_flag_invalid; 1649 return p.sign ? 
0 : max; 1650 case float_class_zero: 1651 return 0; 1652 case float_class_normal: 1653 if (p.sign) { 1654 s->float_exception_flags = orig_flags | float_flag_invalid; 1655 return 0; 1656 } 1657 1658 if (p.exp < DECOMPOSED_BINARY_POINT) { 1659 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1660 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1661 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1662 } else { 1663 s->float_exception_flags = orig_flags | float_flag_invalid; 1664 return max; 1665 } 1666 1667 /* For uint64 this will never trip, but if p.exp is too large 1668 * to shift a decomposed fraction we shall have exited via the 1669 * 3rd leg above. 1670 */ 1671 if (r > max) { 1672 s->float_exception_flags = orig_flags | float_flag_invalid; 1673 return max; 1674 } 1675 return r; 1676 default: 1677 g_assert_not_reached(); 1678 } 1679 } 1680 1681 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale, 1682 float_status *s) 1683 { 1684 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 1685 rmode, scale, UINT16_MAX, s); 1686 } 1687 1688 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale, 1689 float_status *s) 1690 { 1691 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 1692 rmode, scale, UINT32_MAX, s); 1693 } 1694 1695 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale, 1696 float_status *s) 1697 { 1698 return round_to_uint_and_pack(float16_unpack_canonical(a, s), 1699 rmode, scale, UINT64_MAX, s); 1700 } 1701 1702 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale, 1703 float_status *s) 1704 { 1705 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 1706 rmode, scale, UINT16_MAX, s); 1707 } 1708 1709 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale, 1710 float_status *s) 1711 { 1712 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 1713 rmode, scale, UINT32_MAX, s); 1714 } 1715 1716 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int 
scale, 1717 float_status *s) 1718 { 1719 return round_to_uint_and_pack(float32_unpack_canonical(a, s), 1720 rmode, scale, UINT64_MAX, s); 1721 } 1722 1723 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale, 1724 float_status *s) 1725 { 1726 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 1727 rmode, scale, UINT16_MAX, s); 1728 } 1729 1730 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale, 1731 float_status *s) 1732 { 1733 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 1734 rmode, scale, UINT32_MAX, s); 1735 } 1736 1737 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale, 1738 float_status *s) 1739 { 1740 return round_to_uint_and_pack(float64_unpack_canonical(a, s), 1741 rmode, scale, UINT64_MAX, s); 1742 } 1743 1744 uint16_t float16_to_uint16(float16 a, float_status *s) 1745 { 1746 return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 1747 } 1748 1749 uint32_t float16_to_uint32(float16 a, float_status *s) 1750 { 1751 return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 1752 } 1753 1754 uint64_t float16_to_uint64(float16 a, float_status *s) 1755 { 1756 return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 1757 } 1758 1759 uint16_t float32_to_uint16(float32 a, float_status *s) 1760 { 1761 return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 1762 } 1763 1764 uint32_t float32_to_uint32(float32 a, float_status *s) 1765 { 1766 return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 1767 } 1768 1769 uint64_t float32_to_uint64(float32 a, float_status *s) 1770 { 1771 return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 1772 } 1773 1774 uint16_t float64_to_uint16(float64 a, float_status *s) 1775 { 1776 return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s); 1777 } 1778 1779 uint32_t float64_to_uint32(float64 a, float_status *s) 1780 { 1781 return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s); 1782 } 1783 1784 
uint64_t float64_to_uint64(float64 a, float_status *s) 1785 { 1786 return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s); 1787 } 1788 1789 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s) 1790 { 1791 return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s); 1792 } 1793 1794 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s) 1795 { 1796 return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s); 1797 } 1798 1799 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s) 1800 { 1801 return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s); 1802 } 1803 1804 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s) 1805 { 1806 return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s); 1807 } 1808 1809 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s) 1810 { 1811 return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s); 1812 } 1813 1814 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s) 1815 { 1816 return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s); 1817 } 1818 1819 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s) 1820 { 1821 return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s); 1822 } 1823 1824 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s) 1825 { 1826 return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s); 1827 } 1828 1829 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s) 1830 { 1831 return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s); 1832 } 1833 1834 /* 1835 * Integer to float conversions 1836 * 1837 * Returns the result of converting the two's complement integer `a' 1838 * to the floating-point format. The conversion is performed according 1839 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1840 */ 1841 1842 static FloatParts int_to_float(int64_t a, int scale, float_status *status) 1843 { 1844 FloatParts r = { .sign = false }; 1845 1846 if (a == 0) { 1847 r.cls = float_class_zero; 1848 } else { 1849 uint64_t f = a; 1850 int shift; 1851 1852 r.cls = float_class_normal; 1853 if (a < 0) { 1854 f = -f; 1855 r.sign = true; 1856 } 1857 shift = clz64(f) - 1; 1858 scale = MIN(MAX(scale, -0x10000), 0x10000); 1859 1860 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 1861 r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift); 1862 } 1863 1864 return r; 1865 } 1866 1867 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status) 1868 { 1869 FloatParts pa = int_to_float(a, scale, status); 1870 return float16_round_pack_canonical(pa, status); 1871 } 1872 1873 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status) 1874 { 1875 return int64_to_float16_scalbn(a, scale, status); 1876 } 1877 1878 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status) 1879 { 1880 return int64_to_float16_scalbn(a, scale, status); 1881 } 1882 1883 float16 int64_to_float16(int64_t a, float_status *status) 1884 { 1885 return int64_to_float16_scalbn(a, 0, status); 1886 } 1887 1888 float16 int32_to_float16(int32_t a, float_status *status) 1889 { 1890 return int64_to_float16_scalbn(a, 0, status); 1891 } 1892 1893 float16 int16_to_float16(int16_t a, float_status *status) 1894 { 1895 return int64_to_float16_scalbn(a, 0, status); 1896 } 1897 1898 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status) 1899 { 1900 FloatParts pa = int_to_float(a, scale, status); 1901 return float32_round_pack_canonical(pa, status); 1902 } 1903 1904 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status) 1905 { 1906 return int64_to_float32_scalbn(a, scale, status); 1907 } 1908 1909 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status) 1910 { 1911 return int64_to_float32_scalbn(a, scale, 
status); 1912 } 1913 1914 float32 int64_to_float32(int64_t a, float_status *status) 1915 { 1916 return int64_to_float32_scalbn(a, 0, status); 1917 } 1918 1919 float32 int32_to_float32(int32_t a, float_status *status) 1920 { 1921 return int64_to_float32_scalbn(a, 0, status); 1922 } 1923 1924 float32 int16_to_float32(int16_t a, float_status *status) 1925 { 1926 return int64_to_float32_scalbn(a, 0, status); 1927 } 1928 1929 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status) 1930 { 1931 FloatParts pa = int_to_float(a, scale, status); 1932 return float64_round_pack_canonical(pa, status); 1933 } 1934 1935 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status) 1936 { 1937 return int64_to_float64_scalbn(a, scale, status); 1938 } 1939 1940 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status) 1941 { 1942 return int64_to_float64_scalbn(a, scale, status); 1943 } 1944 1945 float64 int64_to_float64(int64_t a, float_status *status) 1946 { 1947 return int64_to_float64_scalbn(a, 0, status); 1948 } 1949 1950 float64 int32_to_float64(int32_t a, float_status *status) 1951 { 1952 return int64_to_float64_scalbn(a, 0, status); 1953 } 1954 1955 float64 int16_to_float64(int16_t a, float_status *status) 1956 { 1957 return int64_to_float64_scalbn(a, 0, status); 1958 } 1959 1960 1961 /* 1962 * Unsigned Integer to float conversions 1963 * 1964 * Returns the result of converting the unsigned integer `a' to the 1965 * floating-point format. The conversion is performed according to the 1966 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
1967 */ 1968 1969 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status) 1970 { 1971 FloatParts r = { .sign = false }; 1972 1973 if (a == 0) { 1974 r.cls = float_class_zero; 1975 } else { 1976 scale = MIN(MAX(scale, -0x10000), 0x10000); 1977 r.cls = float_class_normal; 1978 if ((int64_t)a < 0) { 1979 r.exp = DECOMPOSED_BINARY_POINT + 1 + scale; 1980 shift64RightJamming(a, 1, &a); 1981 r.frac = a; 1982 } else { 1983 int shift = clz64(a) - 1; 1984 r.exp = DECOMPOSED_BINARY_POINT - shift + scale; 1985 r.frac = a << shift; 1986 } 1987 } 1988 1989 return r; 1990 } 1991 1992 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status) 1993 { 1994 FloatParts pa = uint_to_float(a, scale, status); 1995 return float16_round_pack_canonical(pa, status); 1996 } 1997 1998 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status) 1999 { 2000 return uint64_to_float16_scalbn(a, scale, status); 2001 } 2002 2003 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status) 2004 { 2005 return uint64_to_float16_scalbn(a, scale, status); 2006 } 2007 2008 float16 uint64_to_float16(uint64_t a, float_status *status) 2009 { 2010 return uint64_to_float16_scalbn(a, 0, status); 2011 } 2012 2013 float16 uint32_to_float16(uint32_t a, float_status *status) 2014 { 2015 return uint64_to_float16_scalbn(a, 0, status); 2016 } 2017 2018 float16 uint16_to_float16(uint16_t a, float_status *status) 2019 { 2020 return uint64_to_float16_scalbn(a, 0, status); 2021 } 2022 2023 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status) 2024 { 2025 FloatParts pa = uint_to_float(a, scale, status); 2026 return float32_round_pack_canonical(pa, status); 2027 } 2028 2029 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status) 2030 { 2031 return uint64_to_float32_scalbn(a, scale, status); 2032 } 2033 2034 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status) 2035 { 2036 return 
uint64_to_float32_scalbn(a, scale, status); 2037 } 2038 2039 float32 uint64_to_float32(uint64_t a, float_status *status) 2040 { 2041 return uint64_to_float32_scalbn(a, 0, status); 2042 } 2043 2044 float32 uint32_to_float32(uint32_t a, float_status *status) 2045 { 2046 return uint64_to_float32_scalbn(a, 0, status); 2047 } 2048 2049 float32 uint16_to_float32(uint16_t a, float_status *status) 2050 { 2051 return uint64_to_float32_scalbn(a, 0, status); 2052 } 2053 2054 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status) 2055 { 2056 FloatParts pa = uint_to_float(a, scale, status); 2057 return float64_round_pack_canonical(pa, status); 2058 } 2059 2060 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status) 2061 { 2062 return uint64_to_float64_scalbn(a, scale, status); 2063 } 2064 2065 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status) 2066 { 2067 return uint64_to_float64_scalbn(a, scale, status); 2068 } 2069 2070 float64 uint64_to_float64(uint64_t a, float_status *status) 2071 { 2072 return uint64_to_float64_scalbn(a, 0, status); 2073 } 2074 2075 float64 uint32_to_float64(uint32_t a, float_status *status) 2076 { 2077 return uint64_to_float64_scalbn(a, 0, status); 2078 } 2079 2080 float64 uint16_to_float64(uint16_t a, float_status *status) 2081 { 2082 return uint64_to_float64_scalbn(a, 0, status); 2083 } 2084 2085 /* Float Min/Max */ 2086 /* min() and max() functions. These can't be implemented as 2087 * 'compare and pick one input' because that would mishandle 2088 * NaNs and +0 vs -0. 2089 * 2090 * minnum() and maxnum() functions. These are similar to the min() 2091 * and max() functions but if one of the arguments is a QNaN and 2092 * the other is numerical then the numerical argument is returned. 2093 * SNaNs will get quietened before being returned. 2094 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 2095 * and maxNum() operations. 
min() and max() are the typical min/max 2096 * semantics provided by many CPUs which predate that specification. 2097 * 2098 * minnummag() and maxnummag() functions correspond to minNumMag() 2099 * and minNumMag() from the IEEE-754 2008. 2100 */ 2101 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 2102 bool ieee, bool ismag, float_status *s) 2103 { 2104 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 2105 if (ieee) { 2106 /* Takes two floating-point values `a' and `b', one of 2107 * which is a NaN, and returns the appropriate NaN 2108 * result. If either `a' or `b' is a signaling NaN, 2109 * the invalid exception is raised. 2110 */ 2111 if (is_snan(a.cls) || is_snan(b.cls)) { 2112 return pick_nan(a, b, s); 2113 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 2114 return b; 2115 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 2116 return a; 2117 } 2118 } 2119 return pick_nan(a, b, s); 2120 } else { 2121 int a_exp, b_exp; 2122 2123 switch (a.cls) { 2124 case float_class_normal: 2125 a_exp = a.exp; 2126 break; 2127 case float_class_inf: 2128 a_exp = INT_MAX; 2129 break; 2130 case float_class_zero: 2131 a_exp = INT_MIN; 2132 break; 2133 default: 2134 g_assert_not_reached(); 2135 break; 2136 } 2137 switch (b.cls) { 2138 case float_class_normal: 2139 b_exp = b.exp; 2140 break; 2141 case float_class_inf: 2142 b_exp = INT_MAX; 2143 break; 2144 case float_class_zero: 2145 b_exp = INT_MIN; 2146 break; 2147 default: 2148 g_assert_not_reached(); 2149 break; 2150 } 2151 2152 if (ismag && (a_exp != b_exp || a.frac != b.frac)) { 2153 bool a_less = a_exp < b_exp; 2154 if (a_exp == b_exp) { 2155 a_less = a.frac < b.frac; 2156 } 2157 return a_less ^ ismin ? b : a; 2158 } 2159 2160 if (a.sign == b.sign) { 2161 bool a_less = a_exp < b_exp; 2162 if (a_exp == b_exp) { 2163 a_less = a.frac < b.frac; 2164 } 2165 return a.sign ^ a_less ^ ismin ? b : a; 2166 } else { 2167 return a.sign ^ ismin ? 
b : a; 2168 } 2169 } 2170 } 2171 2172 #define MINMAX(sz, name, ismin, isiee, ismag) \ 2173 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 2174 float_status *s) \ 2175 { \ 2176 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2177 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2178 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 2179 \ 2180 return float ## sz ## _round_pack_canonical(pr, s); \ 2181 } 2182 2183 MINMAX(16, min, true, false, false) 2184 MINMAX(16, minnum, true, true, false) 2185 MINMAX(16, minnummag, true, true, true) 2186 MINMAX(16, max, false, false, false) 2187 MINMAX(16, maxnum, false, true, false) 2188 MINMAX(16, maxnummag, false, true, true) 2189 2190 MINMAX(32, min, true, false, false) 2191 MINMAX(32, minnum, true, true, false) 2192 MINMAX(32, minnummag, true, true, true) 2193 MINMAX(32, max, false, false, false) 2194 MINMAX(32, maxnum, false, true, false) 2195 MINMAX(32, maxnummag, false, true, true) 2196 2197 MINMAX(64, min, true, false, false) 2198 MINMAX(64, minnum, true, true, false) 2199 MINMAX(64, minnummag, true, true, true) 2200 MINMAX(64, max, false, false, false) 2201 MINMAX(64, maxnum, false, true, false) 2202 MINMAX(64, maxnummag, false, true, true) 2203 2204 #undef MINMAX 2205 2206 /* Floating point compare */ 2207 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 2208 float_status *s) 2209 { 2210 if (is_nan(a.cls) || is_nan(b.cls)) { 2211 if (!is_quiet || 2212 a.cls == float_class_snan || 2213 b.cls == float_class_snan) { 2214 s->float_exception_flags |= float_flag_invalid; 2215 } 2216 return float_relation_unordered; 2217 } 2218 2219 if (a.cls == float_class_zero) { 2220 if (b.cls == float_class_zero) { 2221 return float_relation_equal; 2222 } 2223 return b.sign ? float_relation_greater : float_relation_less; 2224 } else if (b.cls == float_class_zero) { 2225 return a.sign ? 
float_relation_less : float_relation_greater; 2226 } 2227 2228 /* The only really important thing about infinity is its sign. If 2229 * both are infinities the sign marks the smallest of the two. 2230 */ 2231 if (a.cls == float_class_inf) { 2232 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 2233 return float_relation_equal; 2234 } 2235 return a.sign ? float_relation_less : float_relation_greater; 2236 } else if (b.cls == float_class_inf) { 2237 return b.sign ? float_relation_greater : float_relation_less; 2238 } 2239 2240 if (a.sign != b.sign) { 2241 return a.sign ? float_relation_less : float_relation_greater; 2242 } 2243 2244 if (a.exp == b.exp) { 2245 if (a.frac == b.frac) { 2246 return float_relation_equal; 2247 } 2248 if (a.sign) { 2249 return a.frac > b.frac ? 2250 float_relation_less : float_relation_greater; 2251 } else { 2252 return a.frac > b.frac ? 2253 float_relation_greater : float_relation_less; 2254 } 2255 } else { 2256 if (a.sign) { 2257 return a.exp > b.exp ? float_relation_less : float_relation_greater; 2258 } else { 2259 return a.exp > b.exp ? float_relation_greater : float_relation_less; 2260 } 2261 } 2262 } 2263 2264 #define COMPARE(sz) \ 2265 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 2266 float_status *s) \ 2267 { \ 2268 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2269 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2270 return compare_floats(pa, pb, false, s); \ 2271 } \ 2272 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 2273 float_status *s) \ 2274 { \ 2275 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 2276 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 2277 return compare_floats(pa, pb, true, s); \ 2278 } 2279 2280 COMPARE(16) 2281 COMPARE(32) 2282 COMPARE(64) 2283 2284 #undef COMPARE 2285 2286 /* Multiply A by 2 raised to the power N. 
*/ 2287 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 2288 { 2289 if (unlikely(is_nan(a.cls))) { 2290 return return_nan(a, s); 2291 } 2292 if (a.cls == float_class_normal) { 2293 /* The largest float type (even though not supported by FloatParts) 2294 * is float128, which has a 15 bit exponent. Bounding N to 16 bits 2295 * still allows rounding to infinity, without allowing overflow 2296 * within the int32_t that backs FloatParts.exp. 2297 */ 2298 n = MIN(MAX(n, -0x10000), 0x10000); 2299 a.exp += n; 2300 } 2301 return a; 2302 } 2303 2304 float16 float16_scalbn(float16 a, int n, float_status *status) 2305 { 2306 FloatParts pa = float16_unpack_canonical(a, status); 2307 FloatParts pr = scalbn_decomposed(pa, n, status); 2308 return float16_round_pack_canonical(pr, status); 2309 } 2310 2311 float32 float32_scalbn(float32 a, int n, float_status *status) 2312 { 2313 FloatParts pa = float32_unpack_canonical(a, status); 2314 FloatParts pr = scalbn_decomposed(pa, n, status); 2315 return float32_round_pack_canonical(pr, status); 2316 } 2317 2318 float64 float64_scalbn(float64 a, int n, float_status *status) 2319 { 2320 FloatParts pa = float64_unpack_canonical(a, status); 2321 FloatParts pr = scalbn_decomposed(pa, n, status); 2322 return float64_round_pack_canonical(pr, status); 2323 } 2324 2325 /* 2326 * Square Root 2327 * 2328 * The old softfloat code did an approximation step before zeroing in 2329 * on the final result. However for simpleness we just compute the 2330 * square root by iterating down from the implicit bit to enough extra 2331 * bits to ensure we get a correctly rounded result. 2332 * 2333 * This does mean however the calculation is slower than before, 2334 * especially for 64 bit floats. 
2335 */ 2336 2337 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 2338 { 2339 uint64_t a_frac, r_frac, s_frac; 2340 int bit, last_bit; 2341 2342 if (is_nan(a.cls)) { 2343 return return_nan(a, s); 2344 } 2345 if (a.cls == float_class_zero) { 2346 return a; /* sqrt(+-0) = +-0 */ 2347 } 2348 if (a.sign) { 2349 s->float_exception_flags |= float_flag_invalid; 2350 return parts_default_nan(s); 2351 } 2352 if (a.cls == float_class_inf) { 2353 return a; /* sqrt(+inf) = +inf */ 2354 } 2355 2356 assert(a.cls == float_class_normal); 2357 2358 /* We need two overflow bits at the top. Adding room for that is a 2359 * right shift. If the exponent is odd, we can discard the low bit 2360 * by multiplying the fraction by 2; that's a left shift. Combine 2361 * those and we shift right if the exponent is even. 2362 */ 2363 a_frac = a.frac; 2364 if (!(a.exp & 1)) { 2365 a_frac >>= 1; 2366 } 2367 a.exp >>= 1; 2368 2369 /* Bit-by-bit computation of sqrt. */ 2370 r_frac = 0; 2371 s_frac = 0; 2372 2373 /* Iterate from implicit bit down to the 3 extra bits to compute a 2374 * properly rounded result. Remember we've inserted one more bit 2375 * at the top, so these positions are one less. 2376 */ 2377 bit = DECOMPOSED_BINARY_POINT - 1; 2378 last_bit = MAX(p->frac_shift - 4, 0); 2379 do { 2380 uint64_t q = 1ULL << bit; 2381 uint64_t t_frac = s_frac + q; 2382 if (t_frac <= a_frac) { 2383 s_frac = t_frac + q; 2384 a_frac -= t_frac; 2385 r_frac += q; 2386 } 2387 a_frac <<= 1; 2388 } while (--bit >= last_bit); 2389 2390 /* Undo the right shift done above. If there is any remaining 2391 * fraction, the result is inexact. Set the sticky bit. 
2392 */ 2393 a.frac = (r_frac << 1) + (a_frac != 0); 2394 2395 return a; 2396 } 2397 2398 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 2399 { 2400 FloatParts pa = float16_unpack_canonical(a, status); 2401 FloatParts pr = sqrt_float(pa, status, &float16_params); 2402 return float16_round_pack_canonical(pr, status); 2403 } 2404 2405 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 2406 { 2407 FloatParts pa = float32_unpack_canonical(a, status); 2408 FloatParts pr = sqrt_float(pa, status, &float32_params); 2409 return float32_round_pack_canonical(pr, status); 2410 } 2411 2412 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 2413 { 2414 FloatParts pa = float64_unpack_canonical(a, status); 2415 FloatParts pr = sqrt_float(pa, status, &float64_params); 2416 return float64_round_pack_canonical(pr, status); 2417 } 2418 2419 /*---------------------------------------------------------------------------- 2420 | The pattern for a default generated NaN. 2421 *----------------------------------------------------------------------------*/ 2422 2423 float16 float16_default_nan(float_status *status) 2424 { 2425 FloatParts p = parts_default_nan(status); 2426 p.frac >>= float16_params.frac_shift; 2427 return float16_pack_raw(p); 2428 } 2429 2430 float32 float32_default_nan(float_status *status) 2431 { 2432 FloatParts p = parts_default_nan(status); 2433 p.frac >>= float32_params.frac_shift; 2434 return float32_pack_raw(p); 2435 } 2436 2437 float64 float64_default_nan(float_status *status) 2438 { 2439 FloatParts p = parts_default_nan(status); 2440 p.frac >>= float64_params.frac_shift; 2441 return float64_pack_raw(p); 2442 } 2443 2444 float128 float128_default_nan(float_status *status) 2445 { 2446 FloatParts p = parts_default_nan(status); 2447 float128 r; 2448 2449 /* Extrapolate from the choices made by parts_default_nan to fill 2450 * in the quad-floating format. 
If the low bit is set, assume we 2451 * want to set all non-snan bits. 2452 */ 2453 r.low = -(p.frac & 1); 2454 r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48); 2455 r.high |= LIT64(0x7FFF000000000000); 2456 r.high |= (uint64_t)p.sign << 63; 2457 2458 return r; 2459 } 2460 2461 /*---------------------------------------------------------------------------- 2462 | Returns a quiet NaN from a signalling NaN for the floating point value `a'. 2463 *----------------------------------------------------------------------------*/ 2464 2465 float16 float16_silence_nan(float16 a, float_status *status) 2466 { 2467 FloatParts p = float16_unpack_raw(a); 2468 p.frac <<= float16_params.frac_shift; 2469 p = parts_silence_nan(p, status); 2470 p.frac >>= float16_params.frac_shift; 2471 return float16_pack_raw(p); 2472 } 2473 2474 float32 float32_silence_nan(float32 a, float_status *status) 2475 { 2476 FloatParts p = float32_unpack_raw(a); 2477 p.frac <<= float32_params.frac_shift; 2478 p = parts_silence_nan(p, status); 2479 p.frac >>= float32_params.frac_shift; 2480 return float32_pack_raw(p); 2481 } 2482 2483 float64 float64_silence_nan(float64 a, float_status *status) 2484 { 2485 FloatParts p = float64_unpack_raw(a); 2486 p.frac <<= float64_params.frac_shift; 2487 p = parts_silence_nan(p, status); 2488 p.frac >>= float64_params.frac_shift; 2489 return float64_pack_raw(p); 2490 } 2491 2492 /*---------------------------------------------------------------------------- 2493 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 2494 | and 7, and returns the properly rounded 32-bit integer corresponding to the 2495 | input. If `zSign' is 1, the input is negated before being converted to an 2496 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2497 | is simply rounded to an integer, with the inexact exception raised if the 2498 | input cannot be represented exactly as an integer. 
However, if the fixed-
| point input is too large, the invalid exception is raised and the largest
| positive or negative integer is returned.
*----------------------------------------------------------------------------*/

static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    uint32_t lowWord;
    int32_t z;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The low 7 bits of absZ are fraction bits: 0x40 is one half, and 0x7f
     * rounds any non-zero fraction away from zero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Exact tie in round-to-nearest-even mode: clear the LSB. */
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    /* Apply the sign in unsigned arithmetic.  Negating a signed int32_t
     * holding INT32_MIN (as happens when converting -2^31) is undefined
     * behaviour in ISO C; the unsigned negation wraps and yields the same
     * bit pattern a wrapping signed negation would.
     */
    lowWord = (uint32_t) absZ;
    if ( zSign ) lowWord = 0u - lowWord;
    z = (int32_t) lowWord;
    /* Out of range if the magnitude needs more than 32 bits or the sign of
     * the truncated result disagrees with the requested sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise(float_flag_invalid, status);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}

/*----------------------------------------------------------------------------
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
| `absZ1', with binary point between bits 63 and 64 (between the input words),
| and returns the properly rounded 64-bit integer corresponding to the input.
| If `zSign' is 1, the input is negated before being converted to an integer.
2550 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2551 | the inexact exception raised if the input cannot be represented exactly as 2552 | an integer. However, if the fixed-point input is too large, the invalid 2553 | exception is raised and the largest positive or negative integer is 2554 | returned. 2555 *----------------------------------------------------------------------------*/ 2556 2557 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2558 float_status *status) 2559 { 2560 int8_t roundingMode; 2561 flag roundNearestEven, increment; 2562 int64_t z; 2563 2564 roundingMode = status->float_rounding_mode; 2565 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2566 switch (roundingMode) { 2567 case float_round_nearest_even: 2568 case float_round_ties_away: 2569 increment = ((int64_t) absZ1 < 0); 2570 break; 2571 case float_round_to_zero: 2572 increment = 0; 2573 break; 2574 case float_round_up: 2575 increment = !zSign && absZ1; 2576 break; 2577 case float_round_down: 2578 increment = zSign && absZ1; 2579 break; 2580 default: 2581 abort(); 2582 } 2583 if ( increment ) { 2584 ++absZ0; 2585 if ( absZ0 == 0 ) goto overflow; 2586 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2587 } 2588 z = absZ0; 2589 if ( zSign ) z = - z; 2590 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2591 overflow: 2592 float_raise(float_flag_invalid, status); 2593 return 2594 zSign ? 
(int64_t) LIT64( 0x8000000000000000 ) 2595 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2596 } 2597 if (absZ1) { 2598 status->float_exception_flags |= float_flag_inexact; 2599 } 2600 return z; 2601 2602 } 2603 2604 /*---------------------------------------------------------------------------- 2605 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2606 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2607 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2608 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2609 | with the inexact exception raised if the input cannot be represented exactly 2610 | as an integer. However, if the fixed-point input is too large, the invalid 2611 | exception is raised and the largest unsigned integer is returned. 2612 *----------------------------------------------------------------------------*/ 2613 2614 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2615 uint64_t absZ1, float_status *status) 2616 { 2617 int8_t roundingMode; 2618 flag roundNearestEven, increment; 2619 2620 roundingMode = status->float_rounding_mode; 2621 roundNearestEven = (roundingMode == float_round_nearest_even); 2622 switch (roundingMode) { 2623 case float_round_nearest_even: 2624 case float_round_ties_away: 2625 increment = ((int64_t)absZ1 < 0); 2626 break; 2627 case float_round_to_zero: 2628 increment = 0; 2629 break; 2630 case float_round_up: 2631 increment = !zSign && absZ1; 2632 break; 2633 case float_round_down: 2634 increment = zSign && absZ1; 2635 break; 2636 default: 2637 abort(); 2638 } 2639 if (increment) { 2640 ++absZ0; 2641 if (absZ0 == 0) { 2642 float_raise(float_flag_invalid, status); 2643 return LIT64(0xFFFFFFFFFFFFFFFF); 2644 } 2645 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2646 } 2647 2648 if (zSign && absZ0) { 2649 float_raise(float_flag_invalid, status); 2650 return 0; 2651 } 2652 2653 if (absZ1) { 2654 
status->float_exception_flags |= float_flag_inexact; 2655 } 2656 return absZ0; 2657 } 2658 2659 /*---------------------------------------------------------------------------- 2660 | If `a' is denormal and we are in flush-to-zero mode then set the 2661 | input-denormal exception and return zero. Otherwise just return the value. 2662 *----------------------------------------------------------------------------*/ 2663 float32 float32_squash_input_denormal(float32 a, float_status *status) 2664 { 2665 if (status->flush_inputs_to_zero) { 2666 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2667 float_raise(float_flag_input_denormal, status); 2668 return make_float32(float32_val(a) & 0x80000000); 2669 } 2670 } 2671 return a; 2672 } 2673 2674 /*---------------------------------------------------------------------------- 2675 | Normalizes the subnormal single-precision floating-point value represented 2676 | by the denormalized significand `aSig'. The normalized exponent and 2677 | significand are stored at the locations pointed to by `zExpPtr' and 2678 | `zSigPtr', respectively. 2679 *----------------------------------------------------------------------------*/ 2680 2681 static void 2682 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2683 { 2684 int8_t shiftCount; 2685 2686 shiftCount = countLeadingZeros32( aSig ) - 8; 2687 *zSigPtr = aSig<<shiftCount; 2688 *zExpPtr = 1 - shiftCount; 2689 2690 } 2691 2692 /*---------------------------------------------------------------------------- 2693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2694 | and significand `zSig', and returns the proper single-precision floating- 2695 | point value corresponding to the abstract input. Ordinarily, the abstract 2696 | value is simply rounded and packed into the single-precision format, with 2697 | the inexact exception raised if the abstract input cannot be represented 2698 | exactly. 
However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal single-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 30
| and 29, which is 7 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int8_t roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The low 7 bits of zSig are discarded by the final shift: 0x40 is half
     * of the last kept bit, 0x7f rounds any non-zero discarded bits away
     * from zero.  Any other rounding mode aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
        break;
    }
    roundBits = zSig & 0x7F;
    /* The unsigned cast lets one comparison select both exceptional ranges:
     * zExp >= 0xFD and (for exponents that fit in 16 bits) negative zExp,
     * which wraps to a large unsigned value.
     */
    if ( 0xFD <= (uint16_t) zExp ) {
        /* Overflow: exponent too large, or exactly 0xFD with the rounding
         * add carrying into bit 31.
         */
        if (    ( 0xFD < zExp )
             || (    ( zExp == 0xFD )
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Significand is 0 when roundIncrement != 0 and all ones
             * otherwise.  NOTE(review): presumably packFloat32 sums its
             * fields, so the all-ones case borrows from exponent 0xFF and
             * gives the largest finite value - confirm against packFloat32.
             */
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat32(zSign, 0, 0);
            }
            /* Tiny if tininess is detected before rounding, if the value
             * is more than one binade too small, or if rounding does not
             * carry it up into bit 31.
             */
            isTiny =
                (status->float_detect_tininess
                 == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            /* Denormalize, then recompute the bits that will be discarded. */
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            /* Underflow is only signalled when the result is also inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>7;
    /* Exact tie in round-to-nearest-even mode: clear the LSB. */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    /* A significand of zero is packed with a zero exponent. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper single-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
| floating-point exponent.
2786 *----------------------------------------------------------------------------*/ 2787 2788 static float32 2789 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2790 float_status *status) 2791 { 2792 int8_t shiftCount; 2793 2794 shiftCount = countLeadingZeros32( zSig ) - 1; 2795 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2796 status); 2797 2798 } 2799 2800 /*---------------------------------------------------------------------------- 2801 | If `a' is denormal and we are in flush-to-zero mode then set the 2802 | input-denormal exception and return zero. Otherwise just return the value. 2803 *----------------------------------------------------------------------------*/ 2804 float64 float64_squash_input_denormal(float64 a, float_status *status) 2805 { 2806 if (status->flush_inputs_to_zero) { 2807 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2808 float_raise(float_flag_input_denormal, status); 2809 return make_float64(float64_val(a) & (1ULL << 63)); 2810 } 2811 } 2812 return a; 2813 } 2814 2815 /*---------------------------------------------------------------------------- 2816 | Normalizes the subnormal double-precision floating-point value represented 2817 | by the denormalized significand `aSig'. The normalized exponent and 2818 | significand are stored at the locations pointed to by `zExpPtr' and 2819 | `zSigPtr', respectively. 
2820 *----------------------------------------------------------------------------*/ 2821 2822 static void 2823 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2824 { 2825 int8_t shiftCount; 2826 2827 shiftCount = countLeadingZeros64( aSig ) - 11; 2828 *zSigPtr = aSig<<shiftCount; 2829 *zExpPtr = 1 - shiftCount; 2830 2831 } 2832 2833 /*---------------------------------------------------------------------------- 2834 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2835 | double-precision floating-point value, returning the result. After being 2836 | shifted into the proper positions, the three fields are simply added 2837 | together to form the result. This means that any integer portion of `zSig' 2838 | will be added into the exponent. Since a properly normalized significand 2839 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2840 | than the desired result exponent whenever `zSig' is a complete, normalized 2841 | significand. 2842 *----------------------------------------------------------------------------*/ 2843 2844 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2845 { 2846 2847 return make_float64( 2848 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2849 2850 } 2851 2852 /*---------------------------------------------------------------------------- 2853 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2854 | and significand `zSig', and returns the proper double-precision floating- 2855 | point value corresponding to the abstract input. Ordinarily, the abstract 2856 | value is simply rounded and packed into the double-precision format, with 2857 | the inexact exception raised if the abstract input cannot be represented 2858 | exactly. However, if the abstract value is too large, the overflow and 2859 | inexact exceptions are raised and an infinity or maximal finite value is 2860 | returned. 
If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal double-
| precision floating-point number.
|     The input significand `zSig' has its binary point between bits 62
| and 61, which is 10 bits to the left of the usual location.  This shifted
| significand must be normalized or smaller.  If `zSig' is not normalized,
| `zExp' must be 0; in that case, the result returned is a subnormal number,
| and it must not require rounding.  In the usual case that `zSig' is
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
| The handling of underflow and overflow follows the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
                                   float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven;
    int roundIncrement, roundBits;
    flag isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* The low 10 bits of zSig are discarded by the final shift: 0x200 is
     * half of the last kept bit, 0x3ff rounds any non-zero discarded bits
     * away from zero.  Any other rounding mode aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x200;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x3ff : 0;
        break;
    case float_round_to_odd:
        /* Bit 10 (0x400) becomes the result LSB: if it is already set,
         * truncate; otherwise any non-zero discarded bits carry into it.
         */
        roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
        break;
    default:
        abort();
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned cast lets one comparison select both exceptional ranges:
     * zExp >= 0x7FD and (for exponents that fit in 16 bits) negative zExp,
     * which wraps to a large unsigned value.
     */
    if ( 0x7FD <= (uint16_t) zExp ) {
        /* Overflow: exponent too large, or exactly 0x7FD with the rounding
         * add carrying into bit 63.
         */
        if (    ( 0x7FD < zExp )
             || (    ( zExp == 0x7FD )
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Round-to-odd, and modes whose increment is 0, do not overflow
             * to infinity.
             */
            bool overflow_to_inf = roundingMode != float_round_to_odd &&
                                   roundIncrement != 0;
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Significand is 0 for infinity, or all ones otherwise; since
             * packFloat64 adds its fields, all ones borrows from exponent
             * 0x7FF and produces the largest finite value.
             */
            return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
        }
        if ( zExp < 0 ) {
            /* Result is below the normal range. */
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat64(zSign, 0, 0);
            }
            /* Tiny if tininess is detected before rounding, if the value
             * is more than one binade too small, or if rounding does not
             * carry it up into bit 63.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            /* Denormalize, then recompute the bits that will be discarded. */
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            /* Underflow is only signalled when the result is also inexact. */
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundingMode == float_round_to_odd) {
                /*
                 * For round-to-odd case, the roundIncrement depends on
                 * zSig which just changed.
                 */
                roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
            }
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig = ( zSig + roundIncrement )>>10;
    /* Exact tie in round-to-nearest-even mode: clear the LSB. */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    /* A significand of zero is packed with a zero exponent. */
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand `zSig', and returns the proper double-precision floating-
| point value corresponding to the abstract input.  This routine is just like
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2955 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2956 | floating-point exponent. 2957 *----------------------------------------------------------------------------*/ 2958 2959 static float64 2960 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2961 float_status *status) 2962 { 2963 int8_t shiftCount; 2964 2965 shiftCount = countLeadingZeros64( zSig ) - 1; 2966 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2967 status); 2968 2969 } 2970 2971 /*---------------------------------------------------------------------------- 2972 | Normalizes the subnormal extended double-precision floating-point value 2973 | represented by the denormalized significand `aSig'. The normalized exponent 2974 | and significand are stored at the locations pointed to by `zExpPtr' and 2975 | `zSigPtr', respectively. 2976 *----------------------------------------------------------------------------*/ 2977 2978 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2979 uint64_t *zSigPtr) 2980 { 2981 int8_t shiftCount; 2982 2983 shiftCount = countLeadingZeros64( aSig ); 2984 *zSigPtr = aSig<<shiftCount; 2985 *zExpPtr = 1 - shiftCount; 2986 } 2987 2988 /*---------------------------------------------------------------------------- 2989 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2990 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2991 | and returns the proper extended double-precision floating-point value 2992 | corresponding to the abstract input. Ordinarily, the abstract value is 2993 | rounded and packed into the extended double-precision format, with the 2994 | inexact exception raised if the abstract input cannot be represented 2995 | exactly. However, if the abstract value is too large, the overflow and 2996 | inexact exceptions are raised and an infinity or maximal finite value is 2997 | returned. 
| If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|     Only the nearest-even, ties-away, to-zero, up and down rounding modes
| are handled here; any other value of `status->float_rounding_mode' reaches
| abort().
*----------------------------------------------------------------------------*/

floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /*
     * Reduced precision: roundMask selects the low bits of zSig0 that are
     * discarded (11 bits for precision 64, 40 bits for precision 32) and
     * roundIncrement starts out as half of one unit in the last kept place.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other precision value is treated as full 80-bit precision. */
        goto precision80;
    }
    /* Fold all of zSig1 into the lowest bit of zSig0 as a sticky bit. */
    zSig0 |= ( zSig1 != 0 );
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        /* Directed modes add the whole mask so any discarded bit carries. */
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * The unsigned comparison is true both for zExp <= 0 (zExp - 1 wraps to
     * a large unsigned value) and for zExp >= 0x7FFE, i.e. whenever
     * underflow or overflow handling may be needed.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* Exponent too large, or rounding carries out of the top bit. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, if the exponent
             * is strictly negative, or if adding roundIncrement does not
             * wrap zSig0 (no carry out of bit 63).
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /* Denormalize by 1 - zExp bit positions. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * On an exact tie in nearest-even mode, widen the mask by one
             * bit so the lowest kept bit is cleared as well.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* The addition wrapped: bump the exponent, reset the significand. */
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even mask widening as in the subnormal path above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: zSig1 holds the bits below the result.  `increment'
     * says whether zSig0 must be bumped by one unit in the last place.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the top bit of zSig1 is set (at least one half). */
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same combined underflow/overflow range test as above. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /*
             * roundMask is cleared so that ~roundMask below is an all-ones
             * significand.  The reduced-precision path jumps straight to
             * the label with its own roundMask, so there ~roundMask keeps
             * only the bits of that precision.
             */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * These modes round toward zero for this sign, so they return
             * the largest finite magnitude rather than infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path, this path
             * does not consult status->flush_to_zero -- confirm that this
             * is intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            /* Denormalize the 128-bit significand by 1 - zExp positions. */
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed with the shift, so decide the increment again. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /*
                 * zSig1<<1 == 0 means nothing lies below the top bit of
                 * zSig1; in nearest-even mode that is an exact tie, so the
                 * low bit of zSig0 is cleared.
                 */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63: the exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Significand wrapped to zero: carry into the exponent. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Tie in nearest-even mode: clear the low bit. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  This routine is just like
| `roundAndPackFloatx80' except that the input significand does not have to be
| normalized.
*----------------------------------------------------------------------------*/

floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
                                       flag zSign, int32_t zExp,
                                       uint64_t zSig0, uint64_t zSig1,
                                       float_status *status)
{
    int8_t shiftCount;

    /* High word empty: promote the low word, which is a 64-bit left shift. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    /* Shift the leading 1 up to bit 63 and adjust the exponent to match. */
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
                                zSig0, zSig1, status);

}

/*----------------------------------------------------------------------------
| Returns the least-significant 64 fraction bits of the quadruple-precision
| floating-point value `a'.
3233 *----------------------------------------------------------------------------*/ 3234 3235 static inline uint64_t extractFloat128Frac1( float128 a ) 3236 { 3237 3238 return a.low; 3239 3240 } 3241 3242 /*---------------------------------------------------------------------------- 3243 | Returns the most-significant 48 fraction bits of the quadruple-precision 3244 | floating-point value `a'. 3245 *----------------------------------------------------------------------------*/ 3246 3247 static inline uint64_t extractFloat128Frac0( float128 a ) 3248 { 3249 3250 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 3251 3252 } 3253 3254 /*---------------------------------------------------------------------------- 3255 | Returns the exponent bits of the quadruple-precision floating-point value 3256 | `a'. 3257 *----------------------------------------------------------------------------*/ 3258 3259 static inline int32_t extractFloat128Exp( float128 a ) 3260 { 3261 3262 return ( a.high>>48 ) & 0x7FFF; 3263 3264 } 3265 3266 /*---------------------------------------------------------------------------- 3267 | Returns the sign bit of the quadruple-precision floating-point value `a'. 3268 *----------------------------------------------------------------------------*/ 3269 3270 static inline flag extractFloat128Sign( float128 a ) 3271 { 3272 3273 return a.high>>63; 3274 3275 } 3276 3277 /*---------------------------------------------------------------------------- 3278 | Normalizes the subnormal quadruple-precision floating-point value 3279 | represented by the denormalized significand formed by the concatenation of 3280 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 3281 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 3282 | significand are stored at the location pointed to by `zSig0Ptr', and the 3283 | least significant 64 bits of the normalized significand are stored at the 3284 | location pointed to by `zSig1Ptr'. 
3285 *----------------------------------------------------------------------------*/ 3286 3287 static void 3288 normalizeFloat128Subnormal( 3289 uint64_t aSig0, 3290 uint64_t aSig1, 3291 int32_t *zExpPtr, 3292 uint64_t *zSig0Ptr, 3293 uint64_t *zSig1Ptr 3294 ) 3295 { 3296 int8_t shiftCount; 3297 3298 if ( aSig0 == 0 ) { 3299 shiftCount = countLeadingZeros64( aSig1 ) - 15; 3300 if ( shiftCount < 0 ) { 3301 *zSig0Ptr = aSig1>>( - shiftCount ); 3302 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 3303 } 3304 else { 3305 *zSig0Ptr = aSig1<<shiftCount; 3306 *zSig1Ptr = 0; 3307 } 3308 *zExpPtr = - shiftCount - 63; 3309 } 3310 else { 3311 shiftCount = countLeadingZeros64( aSig0 ) - 15; 3312 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 3313 *zExpPtr = 1 - shiftCount; 3314 } 3315 3316 } 3317 3318 /*---------------------------------------------------------------------------- 3319 | Packs the sign `zSign', the exponent `zExp', and the significand formed 3320 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 3321 | floating-point value, returning the result. After being shifted into the 3322 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 3323 | added together to form the most significant 32 bits of the result. This 3324 | means that any integer portion of `zSig0' will be added into the exponent. 3325 | Since a properly normalized significand will have an integer portion equal 3326 | to 1, the `zExp' input should be 1 less than the desired result exponent 3327 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 3328 | significand. 
3329 *----------------------------------------------------------------------------*/ 3330 3331 static inline float128 3332 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 3333 { 3334 float128 z; 3335 3336 z.low = zSig1; 3337 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 3338 return z; 3339 3340 } 3341 3342 /*---------------------------------------------------------------------------- 3343 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3344 | and extended significand formed by the concatenation of `zSig0', `zSig1', 3345 | and `zSig2', and returns the proper quadruple-precision floating-point value 3346 | corresponding to the abstract input. Ordinarily, the abstract value is 3347 | simply rounded and packed into the quadruple-precision format, with the 3348 | inexact exception raised if the abstract input cannot be represented 3349 | exactly. However, if the abstract value is too large, the overflow and 3350 | inexact exceptions are raised and an infinity or maximal finite value is 3351 | returned. If the abstract value is too small, the input value is rounded to 3352 | a subnormal number, and the underflow and inexact exceptions are raised if 3353 | the abstract input cannot be represented exactly as a subnormal quadruple- 3354 | precision floating-point number. 3355 | The input significand must be normalized or smaller. If the input 3356 | significand is not normalized, `zExp' must be 0; in that case, the result 3357 | returned is a subnormal number, and it must not require rounding. In the 3358 | usual case that the input significand is normalized, `zExp' must be 1 less 3359 | than the ``true'' floating-point exponent. The handling of underflow and 3360 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * zSig2 holds the bits below the result.  `increment' says whether the
     * 128-bit significand zSig0:zSig1 must be bumped by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Round up when the top bit of zSig2 is set (at least one half). */
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        /*
         * Inexact with an even low bit: adding one just sets that bit,
         * making the result odd.
         */
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /*
     * The unsigned comparison is true both for negative zExp (which
     * converts to a large unsigned value) and for zExp >= 0x7FFD.
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            /*
             * Overflow: the exponent is too large, or the significand is
             * all ones and is about to be rounded up.
             */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Round-to-zero, round-to-odd, and the directed mode that points
             * toward zero for this sign return the largest finite magnitude;
             * every other mode returns infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /*
             * Tiny if tininess is detected before rounding, if the exponent
             * is below -1, if no round-up is pending, or if the significand
             * is below all ones (a round-up cannot carry out).
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /* Denormalize the 192-bit significand by -zExp positions. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* zSig1/zSig2 changed with the shift: decide increment again. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /*
         * zSig2 + zSig2 == 0 with a pending nearest-even round-up means
         * zSig2 is exactly one half, so the low result bit is cleared.
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}

/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and significand formed by the concatenation of `zSig0' and `zSig1', and
| returns the proper quadruple-precision floating-point value corresponding
| to the abstract input.  This routine is just like `roundAndPackFloat128'
| except that the input significand has fewer bits and does not have to be
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
| point exponent.
3486 *----------------------------------------------------------------------------*/ 3487 3488 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 3489 uint64_t zSig0, uint64_t zSig1, 3490 float_status *status) 3491 { 3492 int8_t shiftCount; 3493 uint64_t zSig2; 3494 3495 if ( zSig0 == 0 ) { 3496 zSig0 = zSig1; 3497 zSig1 = 0; 3498 zExp -= 64; 3499 } 3500 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3501 if ( 0 <= shiftCount ) { 3502 zSig2 = 0; 3503 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3504 } 3505 else { 3506 shift128ExtraRightJamming( 3507 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3508 } 3509 zExp -= shiftCount; 3510 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3511 3512 } 3513 3514 3515 /*---------------------------------------------------------------------------- 3516 | Returns the result of converting the 32-bit two's complement integer `a' 3517 | to the extended double-precision floating-point format. The conversion 3518 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3519 | Arithmetic. 3520 *----------------------------------------------------------------------------*/ 3521 3522 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3523 { 3524 flag zSign; 3525 uint32_t absA; 3526 int8_t shiftCount; 3527 uint64_t zSig; 3528 3529 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3530 zSign = ( a < 0 ); 3531 absA = zSign ? - a : a; 3532 shiftCount = countLeadingZeros32( absA ) + 32; 3533 zSig = absA; 3534 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3535 3536 } 3537 3538 /*---------------------------------------------------------------------------- 3539 | Returns the result of converting the 32-bit two's complement integer `a' to 3540 | the quadruple-precision floating-point format. The conversion is performed 3541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3542 *----------------------------------------------------------------------------*/ 3543 3544 float128 int32_to_float128(int32_t a, float_status *status) 3545 { 3546 flag zSign; 3547 uint32_t absA; 3548 int8_t shiftCount; 3549 uint64_t zSig0; 3550 3551 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3552 zSign = ( a < 0 ); 3553 absA = zSign ? - a : a; 3554 shiftCount = countLeadingZeros32( absA ) + 17; 3555 zSig0 = absA; 3556 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3557 3558 } 3559 3560 /*---------------------------------------------------------------------------- 3561 | Returns the result of converting the 64-bit two's complement integer `a' 3562 | to the extended double-precision floating-point format. The conversion 3563 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3564 | Arithmetic. 3565 *----------------------------------------------------------------------------*/ 3566 3567 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3568 { 3569 flag zSign; 3570 uint64_t absA; 3571 int8_t shiftCount; 3572 3573 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3574 zSign = ( a < 0 ); 3575 absA = zSign ? - a : a; 3576 shiftCount = countLeadingZeros64( absA ); 3577 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3578 3579 } 3580 3581 /*---------------------------------------------------------------------------- 3582 | Returns the result of converting the 64-bit two's complement integer `a' to 3583 | the quadruple-precision floating-point format. The conversion is performed 3584 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3585 *----------------------------------------------------------------------------*/ 3586 3587 float128 int64_to_float128(int64_t a, float_status *status) 3588 { 3589 flag zSign; 3590 uint64_t absA; 3591 int8_t shiftCount; 3592 int32_t zExp; 3593 uint64_t zSig0, zSig1; 3594 3595 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3596 zSign = ( a < 0 ); 3597 absA = zSign ? - a : a; 3598 shiftCount = countLeadingZeros64( absA ) + 49; 3599 zExp = 0x406E - shiftCount; 3600 if ( 64 <= shiftCount ) { 3601 zSig1 = 0; 3602 zSig0 = absA; 3603 shiftCount -= 64; 3604 } 3605 else { 3606 zSig1 = absA; 3607 zSig0 = 0; 3608 } 3609 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3610 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3611 3612 } 3613 3614 /*---------------------------------------------------------------------------- 3615 | Returns the result of converting the 64-bit unsigned integer `a' 3616 | to the quadruple-precision floating-point format. The conversion is performed 3617 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3618 *----------------------------------------------------------------------------*/ 3619 3620 float128 uint64_to_float128(uint64_t a, float_status *status) 3621 { 3622 if (a == 0) { 3623 return float128_zero; 3624 } 3625 return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status); 3626 } 3627 3628 /*---------------------------------------------------------------------------- 3629 | Returns the result of converting the single-precision floating-point value 3630 | `a' to the extended double-precision floating-point format. The conversion 3631 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3632 | Arithmetic. 
3633 *----------------------------------------------------------------------------*/ 3634 3635 floatx80 float32_to_floatx80(float32 a, float_status *status) 3636 { 3637 flag aSign; 3638 int aExp; 3639 uint32_t aSig; 3640 3641 a = float32_squash_input_denormal(a, status); 3642 aSig = extractFloat32Frac( a ); 3643 aExp = extractFloat32Exp( a ); 3644 aSign = extractFloat32Sign( a ); 3645 if ( aExp == 0xFF ) { 3646 if (aSig) { 3647 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3648 } 3649 return packFloatx80(aSign, 3650 floatx80_infinity_high, 3651 floatx80_infinity_low); 3652 } 3653 if ( aExp == 0 ) { 3654 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3655 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3656 } 3657 aSig |= 0x00800000; 3658 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3659 3660 } 3661 3662 /*---------------------------------------------------------------------------- 3663 | Returns the result of converting the single-precision floating-point value 3664 | `a' to the double-precision floating-point format. The conversion is 3665 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3666 | Arithmetic. 
3667 *----------------------------------------------------------------------------*/ 3668 3669 float128 float32_to_float128(float32 a, float_status *status) 3670 { 3671 flag aSign; 3672 int aExp; 3673 uint32_t aSig; 3674 3675 a = float32_squash_input_denormal(a, status); 3676 aSig = extractFloat32Frac( a ); 3677 aExp = extractFloat32Exp( a ); 3678 aSign = extractFloat32Sign( a ); 3679 if ( aExp == 0xFF ) { 3680 if (aSig) { 3681 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3682 } 3683 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3684 } 3685 if ( aExp == 0 ) { 3686 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3687 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3688 --aExp; 3689 } 3690 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3691 3692 } 3693 3694 /*---------------------------------------------------------------------------- 3695 | Returns the remainder of the single-precision floating-point value `a' 3696 | with respect to the corresponding value `b'. The operation is performed 3697 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is NaN or infinity: propagate a NaN, else rem(inf, b) is invalid. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is NaN or infinity: propagate the NaN, else rem(a, inf) is `a'. */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* rem(a, 0) is invalid; a subnormal `b' is normalized first. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        /* rem(0, b) is `a' itself; a subnormal `a' is normalized first. */
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set bit 23 in both significands (the implicit leading 1). */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one 64-by-32-bit division gives the quotient. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* a's exponent is at least 2 below b's: `a' is the result. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: reduce in steps of 62 exponent units using
         * 64-bit quotient estimates.  Each estimate is lowered by 2 (and
         * clamped at 0), presumably so it never exceeds the true quotient
         * -- confirm against estimateDiv128To64.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Subtract bSig until the remainder goes negative.  alternateASig keeps
     * the last non-negative remainder and q counts the subtractions.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * The sign of the sum tells which candidate has the smaller magnitude.
     * Take the non-negative one if it is closer, or on a tie when q is odd
     * (so the quotient that is kept is even).
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the result sign relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}



/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
3814 *----------------------------------------------------------------------------*/ 3815 3816 static const float64 float32_exp2_coefficients[15] = 3817 { 3818 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3819 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3820 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3821 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3822 const_float64( 0x3f81111111111111ll ), /* 5 */ 3823 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3824 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3825 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3826 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3827 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3828 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3829 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3830 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3831 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3832 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3833 }; 3834 3835 float32 float32_exp2(float32 a, float_status *status) 3836 { 3837 flag aSign; 3838 int aExp; 3839 uint32_t aSig; 3840 float64 r, x, xn; 3841 int i; 3842 a = float32_squash_input_denormal(a, status); 3843 3844 aSig = extractFloat32Frac( a ); 3845 aExp = extractFloat32Exp( a ); 3846 aSign = extractFloat32Sign( a ); 3847 3848 if ( aExp == 0xFF) { 3849 if (aSig) { 3850 return propagateFloat32NaN(a, float32_zero, status); 3851 } 3852 return (aSign) ? 
float32_zero : a; 3853 } 3854 if (aExp == 0) { 3855 if (aSig == 0) return float32_one; 3856 } 3857 3858 float_raise(float_flag_inexact, status); 3859 3860 /* ******************************* */ 3861 /* using float64 for approximation */ 3862 /* ******************************* */ 3863 x = float32_to_float64(a, status); 3864 x = float64_mul(x, float64_ln2, status); 3865 3866 xn = x; 3867 r = float64_one; 3868 for (i = 0 ; i < 15 ; i++) { 3869 float64 f; 3870 3871 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3872 r = float64_add(r, f, status); 3873 3874 xn = float64_mul(xn, x, status); 3875 } 3876 3877 return float64_to_float32(r, status); 3878 } 3879 3880 /*---------------------------------------------------------------------------- 3881 | Returns the binary log of the single-precision floating-point value `a'. 3882 | The operation is performed according to the IEC/IEEE Standard for Binary 3883 | Floating-Point Arithmetic. 3884 *----------------------------------------------------------------------------*/ 3885 float32 float32_log2(float32 a, float_status *status) 3886 { 3887 flag aSign, zSign; 3888 int aExp; 3889 uint32_t aSig, zSig, i; 3890 3891 a = float32_squash_input_denormal(a, status); 3892 aSig = extractFloat32Frac( a ); 3893 aExp = extractFloat32Exp( a ); 3894 aSign = extractFloat32Sign( a ); 3895 3896 if ( aExp == 0 ) { 3897 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3898 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3899 } 3900 if ( aSign ) { 3901 float_raise(float_flag_invalid, status); 3902 return float32_default_nan(status); 3903 } 3904 if ( aExp == 0xFF ) { 3905 if (aSig) { 3906 return propagateFloat32NaN(a, float32_zero, status); 3907 } 3908 return a; 3909 } 3910 3911 aExp -= 0x7F; 3912 aSig |= 0x00800000; 3913 zSign = aExp < 0; 3914 zSig = aExp << 23; 3915 3916 for (i = 1 << 22; i > 0; i >>= 1) { 3917 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3918 if ( aSig & 0x01000000 ) { 3919 aSig >>= 1; 3920 zSig |= i; 3921 } 3922 } 3923 
3924 if ( zSign ) 3925 zSig = -zSig; 3926 3927 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3928 } 3929 3930 /*---------------------------------------------------------------------------- 3931 | Returns 1 if the single-precision floating-point value `a' is equal to 3932 | the corresponding value `b', and 0 otherwise. The invalid exception is 3933 | raised if either operand is a NaN. Otherwise, the comparison is performed 3934 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3935 *----------------------------------------------------------------------------*/ 3936 3937 int float32_eq(float32 a, float32 b, float_status *status) 3938 { 3939 uint32_t av, bv; 3940 a = float32_squash_input_denormal(a, status); 3941 b = float32_squash_input_denormal(b, status); 3942 3943 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3944 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3945 ) { 3946 float_raise(float_flag_invalid, status); 3947 return 0; 3948 } 3949 av = float32_val(a); 3950 bv = float32_val(b); 3951 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3952 } 3953 3954 /*---------------------------------------------------------------------------- 3955 | Returns 1 if the single-precision floating-point value `a' is less than 3956 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3957 | exception is raised if either operand is a NaN. The comparison is performed 3958 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3959 *----------------------------------------------------------------------------*/ 3960 3961 int float32_le(float32 a, float32 b, float_status *status) 3962 { 3963 flag aSign, bSign; 3964 uint32_t av, bv; 3965 a = float32_squash_input_denormal(a, status); 3966 b = float32_squash_input_denormal(b, status); 3967 3968 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3969 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3970 ) { 3971 float_raise(float_flag_invalid, status); 3972 return 0; 3973 } 3974 aSign = extractFloat32Sign( a ); 3975 bSign = extractFloat32Sign( b ); 3976 av = float32_val(a); 3977 bv = float32_val(b); 3978 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3979 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3980 3981 } 3982 3983 /*---------------------------------------------------------------------------- 3984 | Returns 1 if the single-precision floating-point value `a' is less than 3985 | the corresponding value `b', and 0 otherwise. The invalid exception is 3986 | raised if either operand is a NaN. The comparison is performed according 3987 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
3988 *----------------------------------------------------------------------------*/ 3989 3990 int float32_lt(float32 a, float32 b, float_status *status) 3991 { 3992 flag aSign, bSign; 3993 uint32_t av, bv; 3994 a = float32_squash_input_denormal(a, status); 3995 b = float32_squash_input_denormal(b, status); 3996 3997 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3998 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3999 ) { 4000 float_raise(float_flag_invalid, status); 4001 return 0; 4002 } 4003 aSign = extractFloat32Sign( a ); 4004 bSign = extractFloat32Sign( b ); 4005 av = float32_val(a); 4006 bv = float32_val(b); 4007 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4008 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4009 4010 } 4011 4012 /*---------------------------------------------------------------------------- 4013 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4014 | be compared, and 0 otherwise. The invalid exception is raised if either 4015 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4016 | Standard for Binary Floating-Point Arithmetic. 4017 *----------------------------------------------------------------------------*/ 4018 4019 int float32_unordered(float32 a, float32 b, float_status *status) 4020 { 4021 a = float32_squash_input_denormal(a, status); 4022 b = float32_squash_input_denormal(b, status); 4023 4024 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4025 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4026 ) { 4027 float_raise(float_flag_invalid, status); 4028 return 1; 4029 } 4030 return 0; 4031 } 4032 4033 /*---------------------------------------------------------------------------- 4034 | Returns 1 if the single-precision floating-point value `a' is equal to 4035 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4036 | exception. 
The comparison is performed according to the IEC/IEEE Standard 4037 | for Binary Floating-Point Arithmetic. 4038 *----------------------------------------------------------------------------*/ 4039 4040 int float32_eq_quiet(float32 a, float32 b, float_status *status) 4041 { 4042 a = float32_squash_input_denormal(a, status); 4043 b = float32_squash_input_denormal(b, status); 4044 4045 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4046 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4047 ) { 4048 if (float32_is_signaling_nan(a, status) 4049 || float32_is_signaling_nan(b, status)) { 4050 float_raise(float_flag_invalid, status); 4051 } 4052 return 0; 4053 } 4054 return ( float32_val(a) == float32_val(b) ) || 4055 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 4056 } 4057 4058 /*---------------------------------------------------------------------------- 4059 | Returns 1 if the single-precision floating-point value `a' is less than or 4060 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4061 | cause an exception. Otherwise, the comparison is performed according to the 4062 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4063 *----------------------------------------------------------------------------*/ 4064 4065 int float32_le_quiet(float32 a, float32 b, float_status *status) 4066 { 4067 flag aSign, bSign; 4068 uint32_t av, bv; 4069 a = float32_squash_input_denormal(a, status); 4070 b = float32_squash_input_denormal(b, status); 4071 4072 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4073 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4074 ) { 4075 if (float32_is_signaling_nan(a, status) 4076 || float32_is_signaling_nan(b, status)) { 4077 float_raise(float_flag_invalid, status); 4078 } 4079 return 0; 4080 } 4081 aSign = extractFloat32Sign( a ); 4082 bSign = extractFloat32Sign( b ); 4083 av = float32_val(a); 4084 bv = float32_val(b); 4085 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 4086 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4087 4088 } 4089 4090 /*---------------------------------------------------------------------------- 4091 | Returns 1 if the single-precision floating-point value `a' is less than 4092 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4093 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4094 | Standard for Binary Floating-Point Arithmetic. 
4095 *----------------------------------------------------------------------------*/ 4096 4097 int float32_lt_quiet(float32 a, float32 b, float_status *status) 4098 { 4099 flag aSign, bSign; 4100 uint32_t av, bv; 4101 a = float32_squash_input_denormal(a, status); 4102 b = float32_squash_input_denormal(b, status); 4103 4104 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4105 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4106 ) { 4107 if (float32_is_signaling_nan(a, status) 4108 || float32_is_signaling_nan(b, status)) { 4109 float_raise(float_flag_invalid, status); 4110 } 4111 return 0; 4112 } 4113 aSign = extractFloat32Sign( a ); 4114 bSign = extractFloat32Sign( b ); 4115 av = float32_val(a); 4116 bv = float32_val(b); 4117 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 4118 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4119 4120 } 4121 4122 /*---------------------------------------------------------------------------- 4123 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 4124 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4125 | comparison is performed according to the IEC/IEEE Standard for Binary 4126 | Floating-Point Arithmetic. 
4127 *----------------------------------------------------------------------------*/ 4128 4129 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 4130 { 4131 a = float32_squash_input_denormal(a, status); 4132 b = float32_squash_input_denormal(b, status); 4133 4134 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 4135 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 4136 ) { 4137 if (float32_is_signaling_nan(a, status) 4138 || float32_is_signaling_nan(b, status)) { 4139 float_raise(float_flag_invalid, status); 4140 } 4141 return 1; 4142 } 4143 return 0; 4144 } 4145 4146 /*---------------------------------------------------------------------------- 4147 | If `a' is denormal and we are in flush-to-zero mode then set the 4148 | input-denormal exception and return zero. Otherwise just return the value. 4149 *----------------------------------------------------------------------------*/ 4150 float16 float16_squash_input_denormal(float16 a, float_status *status) 4151 { 4152 if (status->flush_inputs_to_zero) { 4153 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 4154 float_raise(float_flag_input_denormal, status); 4155 return make_float16(float16_val(a) & 0x8000); 4156 } 4157 } 4158 return a; 4159 } 4160 4161 /*---------------------------------------------------------------------------- 4162 | Returns the result of converting the double-precision floating-point value 4163 | `a' to the extended double-precision floating-point format. The conversion 4164 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4165 | Arithmetic. 
4166 *----------------------------------------------------------------------------*/ 4167 4168 floatx80 float64_to_floatx80(float64 a, float_status *status) 4169 { 4170 flag aSign; 4171 int aExp; 4172 uint64_t aSig; 4173 4174 a = float64_squash_input_denormal(a, status); 4175 aSig = extractFloat64Frac( a ); 4176 aExp = extractFloat64Exp( a ); 4177 aSign = extractFloat64Sign( a ); 4178 if ( aExp == 0x7FF ) { 4179 if (aSig) { 4180 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4181 } 4182 return packFloatx80(aSign, 4183 floatx80_infinity_high, 4184 floatx80_infinity_low); 4185 } 4186 if ( aExp == 0 ) { 4187 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4188 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4189 } 4190 return 4191 packFloatx80( 4192 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4193 4194 } 4195 4196 /*---------------------------------------------------------------------------- 4197 | Returns the result of converting the double-precision floating-point value 4198 | `a' to the quadruple-precision floating-point format. The conversion is 4199 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4200 | Arithmetic. 
4201 *----------------------------------------------------------------------------*/ 4202 4203 float128 float64_to_float128(float64 a, float_status *status) 4204 { 4205 flag aSign; 4206 int aExp; 4207 uint64_t aSig, zSig0, zSig1; 4208 4209 a = float64_squash_input_denormal(a, status); 4210 aSig = extractFloat64Frac( a ); 4211 aExp = extractFloat64Exp( a ); 4212 aSign = extractFloat64Sign( a ); 4213 if ( aExp == 0x7FF ) { 4214 if (aSig) { 4215 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4216 } 4217 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4218 } 4219 if ( aExp == 0 ) { 4220 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4221 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4222 --aExp; 4223 } 4224 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4225 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4226 4227 } 4228 4229 4230 /*---------------------------------------------------------------------------- 4231 | Returns the remainder of the double-precision floating-point value `a' 4232 | with respect to the corresponding value `b'. The operation is performed 4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4234 *----------------------------------------------------------------------------*/ 4235 4236 float64 float64_rem(float64 a, float64 b, float_status *status) 4237 { 4238 flag aSign, zSign; 4239 int aExp, bExp, expDiff; 4240 uint64_t aSig, bSig; 4241 uint64_t q, alternateASig; 4242 int64_t sigMean; 4243 4244 a = float64_squash_input_denormal(a, status); 4245 b = float64_squash_input_denormal(b, status); 4246 aSig = extractFloat64Frac( a ); 4247 aExp = extractFloat64Exp( a ); 4248 aSign = extractFloat64Sign( a ); 4249 bSig = extractFloat64Frac( b ); 4250 bExp = extractFloat64Exp( b ); 4251 if ( aExp == 0x7FF ) { 4252 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4253 return propagateFloat64NaN(a, b, status); 4254 } 4255 float_raise(float_flag_invalid, status); 4256 return float64_default_nan(status); 4257 } 4258 if ( bExp == 0x7FF ) { 4259 if (bSig) { 4260 return propagateFloat64NaN(a, b, status); 4261 } 4262 return a; 4263 } 4264 if ( bExp == 0 ) { 4265 if ( bSig == 0 ) { 4266 float_raise(float_flag_invalid, status); 4267 return float64_default_nan(status); 4268 } 4269 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4270 } 4271 if ( aExp == 0 ) { 4272 if ( aSig == 0 ) return a; 4273 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4274 } 4275 expDiff = aExp - bExp; 4276 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4277 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4278 if ( expDiff < 0 ) { 4279 if ( expDiff < -1 ) return a; 4280 aSig >>= 1; 4281 } 4282 q = ( bSig <= aSig ); 4283 if ( q ) aSig -= bSig; 4284 expDiff -= 64; 4285 while ( 0 < expDiff ) { 4286 q = estimateDiv128To64( aSig, 0, bSig ); 4287 q = ( 2 < q ) ? q - 2 : 0; 4288 aSig = - ( ( bSig>>2 ) * q ); 4289 expDiff -= 62; 4290 } 4291 expDiff += 64; 4292 if ( 0 < expDiff ) { 4293 q = estimateDiv128To64( aSig, 0, bSig ); 4294 q = ( 2 < q ) ? 
q - 2 : 0; 4295 q >>= 64 - expDiff; 4296 bSig >>= 2; 4297 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4298 } 4299 else { 4300 aSig >>= 2; 4301 bSig >>= 2; 4302 } 4303 do { 4304 alternateASig = aSig; 4305 ++q; 4306 aSig -= bSig; 4307 } while ( 0 <= (int64_t) aSig ); 4308 sigMean = aSig + alternateASig; 4309 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4310 aSig = alternateASig; 4311 } 4312 zSign = ( (int64_t) aSig < 0 ); 4313 if ( zSign ) aSig = - aSig; 4314 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4315 4316 } 4317 4318 /*---------------------------------------------------------------------------- 4319 | Returns the binary log of the double-precision floating-point value `a'. 4320 | The operation is performed according to the IEC/IEEE Standard for Binary 4321 | Floating-Point Arithmetic. 4322 *----------------------------------------------------------------------------*/ 4323 float64 float64_log2(float64 a, float_status *status) 4324 { 4325 flag aSign, zSign; 4326 int aExp; 4327 uint64_t aSig, aSig0, aSig1, zSig, i; 4328 a = float64_squash_input_denormal(a, status); 4329 4330 aSig = extractFloat64Frac( a ); 4331 aExp = extractFloat64Exp( a ); 4332 aSign = extractFloat64Sign( a ); 4333 4334 if ( aExp == 0 ) { 4335 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4336 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4337 } 4338 if ( aSign ) { 4339 float_raise(float_flag_invalid, status); 4340 return float64_default_nan(status); 4341 } 4342 if ( aExp == 0x7FF ) { 4343 if (aSig) { 4344 return propagateFloat64NaN(a, float64_zero, status); 4345 } 4346 return a; 4347 } 4348 4349 aExp -= 0x3FF; 4350 aSig |= LIT64( 0x0010000000000000 ); 4351 zSign = aExp < 0; 4352 zSig = (uint64_t)aExp << 52; 4353 for (i = 1LL << 51; i > 0; i >>= 1) { 4354 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4355 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4356 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4357 aSig >>= 1; 4358 zSig |= i; 4359 } 
4360 } 4361 4362 if ( zSign ) 4363 zSig = -zSig; 4364 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4365 } 4366 4367 /*---------------------------------------------------------------------------- 4368 | Returns 1 if the double-precision floating-point value `a' is equal to the 4369 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4370 | if either operand is a NaN. Otherwise, the comparison is performed 4371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4372 *----------------------------------------------------------------------------*/ 4373 4374 int float64_eq(float64 a, float64 b, float_status *status) 4375 { 4376 uint64_t av, bv; 4377 a = float64_squash_input_denormal(a, status); 4378 b = float64_squash_input_denormal(b, status); 4379 4380 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4381 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4382 ) { 4383 float_raise(float_flag_invalid, status); 4384 return 0; 4385 } 4386 av = float64_val(a); 4387 bv = float64_val(b); 4388 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4389 4390 } 4391 4392 /*---------------------------------------------------------------------------- 4393 | Returns 1 if the double-precision floating-point value `a' is less than or 4394 | equal to the corresponding value `b', and 0 otherwise. The invalid 4395 | exception is raised if either operand is a NaN. The comparison is performed 4396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4397 *----------------------------------------------------------------------------*/ 4398 4399 int float64_le(float64 a, float64 b, float_status *status) 4400 { 4401 flag aSign, bSign; 4402 uint64_t av, bv; 4403 a = float64_squash_input_denormal(a, status); 4404 b = float64_squash_input_denormal(b, status); 4405 4406 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4407 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4408 ) { 4409 float_raise(float_flag_invalid, status); 4410 return 0; 4411 } 4412 aSign = extractFloat64Sign( a ); 4413 bSign = extractFloat64Sign( b ); 4414 av = float64_val(a); 4415 bv = float64_val(b); 4416 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4417 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4418 4419 } 4420 4421 /*---------------------------------------------------------------------------- 4422 | Returns 1 if the double-precision floating-point value `a' is less than 4423 | the corresponding value `b', and 0 otherwise. The invalid exception is 4424 | raised if either operand is a NaN. The comparison is performed according 4425 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4426 *----------------------------------------------------------------------------*/ 4427 4428 int float64_lt(float64 a, float64 b, float_status *status) 4429 { 4430 flag aSign, bSign; 4431 uint64_t av, bv; 4432 4433 a = float64_squash_input_denormal(a, status); 4434 b = float64_squash_input_denormal(b, status); 4435 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4436 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4437 ) { 4438 float_raise(float_flag_invalid, status); 4439 return 0; 4440 } 4441 aSign = extractFloat64Sign( a ); 4442 bSign = extractFloat64Sign( b ); 4443 av = float64_val(a); 4444 bv = float64_val(b); 4445 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4446 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4447 4448 } 4449 4450 /*---------------------------------------------------------------------------- 4451 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4452 | be compared, and 0 otherwise. The invalid exception is raised if either 4453 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4454 | Standard for Binary Floating-Point Arithmetic. 4455 *----------------------------------------------------------------------------*/ 4456 4457 int float64_unordered(float64 a, float64 b, float_status *status) 4458 { 4459 a = float64_squash_input_denormal(a, status); 4460 b = float64_squash_input_denormal(b, status); 4461 4462 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4463 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4464 ) { 4465 float_raise(float_flag_invalid, status); 4466 return 1; 4467 } 4468 return 0; 4469 } 4470 4471 /*---------------------------------------------------------------------------- 4472 | Returns 1 if the double-precision floating-point value `a' is equal to the 4473 | corresponding value `b', and 0 otherwise. 
Quiet NaNs do not cause an 4474 | exception.The comparison is performed according to the IEC/IEEE Standard 4475 | for Binary Floating-Point Arithmetic. 4476 *----------------------------------------------------------------------------*/ 4477 4478 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4479 { 4480 uint64_t av, bv; 4481 a = float64_squash_input_denormal(a, status); 4482 b = float64_squash_input_denormal(b, status); 4483 4484 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4485 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4486 ) { 4487 if (float64_is_signaling_nan(a, status) 4488 || float64_is_signaling_nan(b, status)) { 4489 float_raise(float_flag_invalid, status); 4490 } 4491 return 0; 4492 } 4493 av = float64_val(a); 4494 bv = float64_val(b); 4495 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4496 4497 } 4498 4499 /*---------------------------------------------------------------------------- 4500 | Returns 1 if the double-precision floating-point value `a' is less than or 4501 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4502 | cause an exception. Otherwise, the comparison is performed according to the 4503 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
4504 *----------------------------------------------------------------------------*/ 4505 4506 int float64_le_quiet(float64 a, float64 b, float_status *status) 4507 { 4508 flag aSign, bSign; 4509 uint64_t av, bv; 4510 a = float64_squash_input_denormal(a, status); 4511 b = float64_squash_input_denormal(b, status); 4512 4513 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4514 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4515 ) { 4516 if (float64_is_signaling_nan(a, status) 4517 || float64_is_signaling_nan(b, status)) { 4518 float_raise(float_flag_invalid, status); 4519 } 4520 return 0; 4521 } 4522 aSign = extractFloat64Sign( a ); 4523 bSign = extractFloat64Sign( b ); 4524 av = float64_val(a); 4525 bv = float64_val(b); 4526 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4527 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4528 4529 } 4530 4531 /*---------------------------------------------------------------------------- 4532 | Returns 1 if the double-precision floating-point value `a' is less than 4533 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4534 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4535 | Standard for Binary Floating-Point Arithmetic. 
4536 *----------------------------------------------------------------------------*/ 4537 4538 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4539 { 4540 flag aSign, bSign; 4541 uint64_t av, bv; 4542 a = float64_squash_input_denormal(a, status); 4543 b = float64_squash_input_denormal(b, status); 4544 4545 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4546 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4547 ) { 4548 if (float64_is_signaling_nan(a, status) 4549 || float64_is_signaling_nan(b, status)) { 4550 float_raise(float_flag_invalid, status); 4551 } 4552 return 0; 4553 } 4554 aSign = extractFloat64Sign( a ); 4555 bSign = extractFloat64Sign( b ); 4556 av = float64_val(a); 4557 bv = float64_val(b); 4558 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4559 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4560 4561 } 4562 4563 /*---------------------------------------------------------------------------- 4564 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4565 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4566 | comparison is performed according to the IEC/IEEE Standard for Binary 4567 | Floating-Point Arithmetic. 
4568 *----------------------------------------------------------------------------*/ 4569 4570 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4571 { 4572 a = float64_squash_input_denormal(a, status); 4573 b = float64_squash_input_denormal(b, status); 4574 4575 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4576 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4577 ) { 4578 if (float64_is_signaling_nan(a, status) 4579 || float64_is_signaling_nan(b, status)) { 4580 float_raise(float_flag_invalid, status); 4581 } 4582 return 1; 4583 } 4584 return 0; 4585 } 4586 4587 /*---------------------------------------------------------------------------- 4588 | Returns the result of converting the extended double-precision floating- 4589 | point value `a' to the 32-bit two's complement integer format. The 4590 | conversion is performed according to the IEC/IEEE Standard for Binary 4591 | Floating-Point Arithmetic---which means in particular that the conversion 4592 | is rounded according to the current rounding mode. If `a' is a NaN, the 4593 | largest positive integer is returned. Otherwise, if the conversion 4594 | overflows, the largest integer with the same sign as `a' is returned. 
4595 *----------------------------------------------------------------------------*/ 4596 4597 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4598 { 4599 flag aSign; 4600 int32_t aExp, shiftCount; 4601 uint64_t aSig; 4602 4603 if (floatx80_invalid_encoding(a)) { 4604 float_raise(float_flag_invalid, status); 4605 return 1 << 31; 4606 } 4607 aSig = extractFloatx80Frac( a ); 4608 aExp = extractFloatx80Exp( a ); 4609 aSign = extractFloatx80Sign( a ); 4610 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4611 shiftCount = 0x4037 - aExp; 4612 if ( shiftCount <= 0 ) shiftCount = 1; 4613 shift64RightJamming( aSig, shiftCount, &aSig ); 4614 return roundAndPackInt32(aSign, aSig, status); 4615 4616 } 4617 4618 /*---------------------------------------------------------------------------- 4619 | Returns the result of converting the extended double-precision floating- 4620 | point value `a' to the 32-bit two's complement integer format. The 4621 | conversion is performed according to the IEC/IEEE Standard for Binary 4622 | Floating-Point Arithmetic, except that the conversion is always rounded 4623 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4624 | Otherwise, if the conversion overflows, the largest integer with the same 4625 | sign as `a' is returned. 
4626 *----------------------------------------------------------------------------*/ 4627 4628 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4629 { 4630 flag aSign; 4631 int32_t aExp, shiftCount; 4632 uint64_t aSig, savedASig; 4633 int32_t z; 4634 4635 if (floatx80_invalid_encoding(a)) { 4636 float_raise(float_flag_invalid, status); 4637 return 1 << 31; 4638 } 4639 aSig = extractFloatx80Frac( a ); 4640 aExp = extractFloatx80Exp( a ); 4641 aSign = extractFloatx80Sign( a ); 4642 if ( 0x401E < aExp ) { 4643 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4644 goto invalid; 4645 } 4646 else if ( aExp < 0x3FFF ) { 4647 if (aExp || aSig) { 4648 status->float_exception_flags |= float_flag_inexact; 4649 } 4650 return 0; 4651 } 4652 shiftCount = 0x403E - aExp; 4653 savedASig = aSig; 4654 aSig >>= shiftCount; 4655 z = aSig; 4656 if ( aSign ) z = - z; 4657 if ( ( z < 0 ) ^ aSign ) { 4658 invalid: 4659 float_raise(float_flag_invalid, status); 4660 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4661 } 4662 if ( ( aSig<<shiftCount ) != savedASig ) { 4663 status->float_exception_flags |= float_flag_inexact; 4664 } 4665 return z; 4666 4667 } 4668 4669 /*---------------------------------------------------------------------------- 4670 | Returns the result of converting the extended double-precision floating- 4671 | point value `a' to the 64-bit two's complement integer format. The 4672 | conversion is performed according to the IEC/IEEE Standard for Binary 4673 | Floating-Point Arithmetic---which means in particular that the conversion 4674 | is rounded according to the current rounding mode. If `a' is a NaN, 4675 | the largest positive integer is returned. Otherwise, if the conversion 4676 | overflows, the largest integer with the same sign as `a' is returned. 
4677 *----------------------------------------------------------------------------*/ 4678 4679 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4680 { 4681 flag aSign; 4682 int32_t aExp, shiftCount; 4683 uint64_t aSig, aSigExtra; 4684 4685 if (floatx80_invalid_encoding(a)) { 4686 float_raise(float_flag_invalid, status); 4687 return 1ULL << 63; 4688 } 4689 aSig = extractFloatx80Frac( a ); 4690 aExp = extractFloatx80Exp( a ); 4691 aSign = extractFloatx80Sign( a ); 4692 shiftCount = 0x403E - aExp; 4693 if ( shiftCount <= 0 ) { 4694 if ( shiftCount ) { 4695 float_raise(float_flag_invalid, status); 4696 if (!aSign || floatx80_is_any_nan(a)) { 4697 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4698 } 4699 return (int64_t) LIT64( 0x8000000000000000 ); 4700 } 4701 aSigExtra = 0; 4702 } 4703 else { 4704 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4705 } 4706 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4707 4708 } 4709 4710 /*---------------------------------------------------------------------------- 4711 | Returns the result of converting the extended double-precision floating- 4712 | point value `a' to the 64-bit two's complement integer format. The 4713 | conversion is performed according to the IEC/IEEE Standard for Binary 4714 | Floating-Point Arithmetic, except that the conversion is always rounded 4715 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4716 | Otherwise, if the conversion overflows, the largest integer with the same 4717 | sign as `a' is returned. 
4718 *----------------------------------------------------------------------------*/ 4719 4720 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4721 { 4722 flag aSign; 4723 int32_t aExp, shiftCount; 4724 uint64_t aSig; 4725 int64_t z; 4726 4727 if (floatx80_invalid_encoding(a)) { 4728 float_raise(float_flag_invalid, status); 4729 return 1ULL << 63; 4730 } 4731 aSig = extractFloatx80Frac( a ); 4732 aExp = extractFloatx80Exp( a ); 4733 aSign = extractFloatx80Sign( a ); 4734 shiftCount = aExp - 0x403E; 4735 if ( 0 <= shiftCount ) { 4736 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4737 if ( ( a.high != 0xC03E ) || aSig ) { 4738 float_raise(float_flag_invalid, status); 4739 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4740 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4741 } 4742 } 4743 return (int64_t) LIT64( 0x8000000000000000 ); 4744 } 4745 else if ( aExp < 0x3FFF ) { 4746 if (aExp | aSig) { 4747 status->float_exception_flags |= float_flag_inexact; 4748 } 4749 return 0; 4750 } 4751 z = aSig>>( - shiftCount ); 4752 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4753 status->float_exception_flags |= float_flag_inexact; 4754 } 4755 if ( aSign ) z = - z; 4756 return z; 4757 4758 } 4759 4760 /*---------------------------------------------------------------------------- 4761 | Returns the result of converting the extended double-precision floating- 4762 | point value `a' to the single-precision floating-point format. The 4763 | conversion is performed according to the IEC/IEEE Standard for Binary 4764 | Floating-Point Arithmetic. 
4765 *----------------------------------------------------------------------------*/ 4766 4767 float32 floatx80_to_float32(floatx80 a, float_status *status) 4768 { 4769 flag aSign; 4770 int32_t aExp; 4771 uint64_t aSig; 4772 4773 if (floatx80_invalid_encoding(a)) { 4774 float_raise(float_flag_invalid, status); 4775 return float32_default_nan(status); 4776 } 4777 aSig = extractFloatx80Frac( a ); 4778 aExp = extractFloatx80Exp( a ); 4779 aSign = extractFloatx80Sign( a ); 4780 if ( aExp == 0x7FFF ) { 4781 if ( (uint64_t) ( aSig<<1 ) ) { 4782 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4783 } 4784 return packFloat32( aSign, 0xFF, 0 ); 4785 } 4786 shift64RightJamming( aSig, 33, &aSig ); 4787 if ( aExp || aSig ) aExp -= 0x3F81; 4788 return roundAndPackFloat32(aSign, aExp, aSig, status); 4789 4790 } 4791 4792 /*---------------------------------------------------------------------------- 4793 | Returns the result of converting the extended double-precision floating- 4794 | point value `a' to the double-precision floating-point format. The 4795 | conversion is performed according to the IEC/IEEE Standard for Binary 4796 | Floating-Point Arithmetic. 
4797 *----------------------------------------------------------------------------*/ 4798 4799 float64 floatx80_to_float64(floatx80 a, float_status *status) 4800 { 4801 flag aSign; 4802 int32_t aExp; 4803 uint64_t aSig, zSig; 4804 4805 if (floatx80_invalid_encoding(a)) { 4806 float_raise(float_flag_invalid, status); 4807 return float64_default_nan(status); 4808 } 4809 aSig = extractFloatx80Frac( a ); 4810 aExp = extractFloatx80Exp( a ); 4811 aSign = extractFloatx80Sign( a ); 4812 if ( aExp == 0x7FFF ) { 4813 if ( (uint64_t) ( aSig<<1 ) ) { 4814 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4815 } 4816 return packFloat64( aSign, 0x7FF, 0 ); 4817 } 4818 shift64RightJamming( aSig, 1, &zSig ); 4819 if ( aExp || aSig ) aExp -= 0x3C01; 4820 return roundAndPackFloat64(aSign, aExp, zSig, status); 4821 4822 } 4823 4824 /*---------------------------------------------------------------------------- 4825 | Returns the result of converting the extended double-precision floating- 4826 | point value `a' to the quadruple-precision floating-point format. The 4827 | conversion is performed according to the IEC/IEEE Standard for Binary 4828 | Floating-Point Arithmetic. 
4829 *----------------------------------------------------------------------------*/ 4830 4831 float128 floatx80_to_float128(floatx80 a, float_status *status) 4832 { 4833 flag aSign; 4834 int aExp; 4835 uint64_t aSig, zSig0, zSig1; 4836 4837 if (floatx80_invalid_encoding(a)) { 4838 float_raise(float_flag_invalid, status); 4839 return float128_default_nan(status); 4840 } 4841 aSig = extractFloatx80Frac( a ); 4842 aExp = extractFloatx80Exp( a ); 4843 aSign = extractFloatx80Sign( a ); 4844 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4845 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4846 } 4847 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4848 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4849 4850 } 4851 4852 /*---------------------------------------------------------------------------- 4853 | Rounds the extended double-precision floating-point value `a' 4854 | to the precision provided by floatx80_rounding_precision and returns the 4855 | result as an extended double-precision floating-point value. 4856 | The operation is performed according to the IEC/IEEE Standard for Binary 4857 | Floating-Point Arithmetic. 4858 *----------------------------------------------------------------------------*/ 4859 4860 floatx80 floatx80_round(floatx80 a, float_status *status) 4861 { 4862 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4863 extractFloatx80Sign(a), 4864 extractFloatx80Exp(a), 4865 extractFloatx80Frac(a), 0, status); 4866 } 4867 4868 /*---------------------------------------------------------------------------- 4869 | Rounds the extended double-precision floating-point value `a' to an integer, 4870 | and returns the result as an extended quadruple-precision floating-point 4871 | value. The operation is performed according to the IEC/IEEE Standard for 4872 | Binary Floating-Point Arithmetic. 
4873 *----------------------------------------------------------------------------*/ 4874 4875 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4876 { 4877 flag aSign; 4878 int32_t aExp; 4879 uint64_t lastBitMask, roundBitsMask; 4880 floatx80 z; 4881 4882 if (floatx80_invalid_encoding(a)) { 4883 float_raise(float_flag_invalid, status); 4884 return floatx80_default_nan(status); 4885 } 4886 aExp = extractFloatx80Exp( a ); 4887 if ( 0x403E <= aExp ) { 4888 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4889 return propagateFloatx80NaN(a, a, status); 4890 } 4891 return a; 4892 } 4893 if ( aExp < 0x3FFF ) { 4894 if ( ( aExp == 0 ) 4895 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4896 return a; 4897 } 4898 status->float_exception_flags |= float_flag_inexact; 4899 aSign = extractFloatx80Sign( a ); 4900 switch (status->float_rounding_mode) { 4901 case float_round_nearest_even: 4902 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4903 ) { 4904 return 4905 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4906 } 4907 break; 4908 case float_round_ties_away: 4909 if (aExp == 0x3FFE) { 4910 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4911 } 4912 break; 4913 case float_round_down: 4914 return 4915 aSign ? 4916 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4917 : packFloatx80( 0, 0, 0 ); 4918 case float_round_up: 4919 return 4920 aSign ? 
packFloatx80( 1, 0, 0 ) 4921 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4922 } 4923 return packFloatx80( aSign, 0, 0 ); 4924 } 4925 lastBitMask = 1; 4926 lastBitMask <<= 0x403E - aExp; 4927 roundBitsMask = lastBitMask - 1; 4928 z = a; 4929 switch (status->float_rounding_mode) { 4930 case float_round_nearest_even: 4931 z.low += lastBitMask>>1; 4932 if ((z.low & roundBitsMask) == 0) { 4933 z.low &= ~lastBitMask; 4934 } 4935 break; 4936 case float_round_ties_away: 4937 z.low += lastBitMask >> 1; 4938 break; 4939 case float_round_to_zero: 4940 break; 4941 case float_round_up: 4942 if (!extractFloatx80Sign(z)) { 4943 z.low += roundBitsMask; 4944 } 4945 break; 4946 case float_round_down: 4947 if (extractFloatx80Sign(z)) { 4948 z.low += roundBitsMask; 4949 } 4950 break; 4951 default: 4952 abort(); 4953 } 4954 z.low &= ~ roundBitsMask; 4955 if ( z.low == 0 ) { 4956 ++z.high; 4957 z.low = LIT64( 0x8000000000000000 ); 4958 } 4959 if (z.low != a.low) { 4960 status->float_exception_flags |= float_flag_inexact; 4961 } 4962 return z; 4963 4964 } 4965 4966 /*---------------------------------------------------------------------------- 4967 | Returns the result of adding the absolute values of the extended double- 4968 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4969 | negated before being returned. `zSign' is ignored if the result is a NaN. 4970 | The addition is performed according to the IEC/IEEE Standard for Binary 4971 | Floating-Point Arithmetic. 
4972 *----------------------------------------------------------------------------*/ 4973 4974 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4975 float_status *status) 4976 { 4977 int32_t aExp, bExp, zExp; 4978 uint64_t aSig, bSig, zSig0, zSig1; 4979 int32_t expDiff; 4980 4981 aSig = extractFloatx80Frac( a ); 4982 aExp = extractFloatx80Exp( a ); 4983 bSig = extractFloatx80Frac( b ); 4984 bExp = extractFloatx80Exp( b ); 4985 expDiff = aExp - bExp; 4986 if ( 0 < expDiff ) { 4987 if ( aExp == 0x7FFF ) { 4988 if ((uint64_t)(aSig << 1)) { 4989 return propagateFloatx80NaN(a, b, status); 4990 } 4991 return a; 4992 } 4993 if ( bExp == 0 ) --expDiff; 4994 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4995 zExp = aExp; 4996 } 4997 else if ( expDiff < 0 ) { 4998 if ( bExp == 0x7FFF ) { 4999 if ((uint64_t)(bSig << 1)) { 5000 return propagateFloatx80NaN(a, b, status); 5001 } 5002 return packFloatx80(zSign, 5003 floatx80_infinity_high, 5004 floatx80_infinity_low); 5005 } 5006 if ( aExp == 0 ) ++expDiff; 5007 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5008 zExp = bExp; 5009 } 5010 else { 5011 if ( aExp == 0x7FFF ) { 5012 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5013 return propagateFloatx80NaN(a, b, status); 5014 } 5015 return a; 5016 } 5017 zSig1 = 0; 5018 zSig0 = aSig + bSig; 5019 if ( aExp == 0 ) { 5020 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 5021 goto roundAndPack; 5022 } 5023 zExp = aExp; 5024 goto shiftRight1; 5025 } 5026 zSig0 = aSig + bSig; 5027 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 5028 shiftRight1: 5029 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5030 zSig0 |= LIT64( 0x8000000000000000 ); 5031 ++zExp; 5032 roundAndPack: 5033 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5034 zSign, zExp, zSig0, zSig1, status); 5035 } 5036 5037 /*---------------------------------------------------------------------------- 5038 | Returns the result of subtracting the absolute 
values of the extended 5039 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 5040 | difference is negated before being returned. `zSign' is ignored if the 5041 | result is a NaN. The subtraction is performed according to the IEC/IEEE 5042 | Standard for Binary Floating-Point Arithmetic. 5043 *----------------------------------------------------------------------------*/ 5044 5045 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 5046 float_status *status) 5047 { 5048 int32_t aExp, bExp, zExp; 5049 uint64_t aSig, bSig, zSig0, zSig1; 5050 int32_t expDiff; 5051 5052 aSig = extractFloatx80Frac( a ); 5053 aExp = extractFloatx80Exp( a ); 5054 bSig = extractFloatx80Frac( b ); 5055 bExp = extractFloatx80Exp( b ); 5056 expDiff = aExp - bExp; 5057 if ( 0 < expDiff ) goto aExpBigger; 5058 if ( expDiff < 0 ) goto bExpBigger; 5059 if ( aExp == 0x7FFF ) { 5060 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5061 return propagateFloatx80NaN(a, b, status); 5062 } 5063 float_raise(float_flag_invalid, status); 5064 return floatx80_default_nan(status); 5065 } 5066 if ( aExp == 0 ) { 5067 aExp = 1; 5068 bExp = 1; 5069 } 5070 zSig1 = 0; 5071 if ( bSig < aSig ) goto aBigger; 5072 if ( aSig < bSig ) goto bBigger; 5073 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5074 bExpBigger: 5075 if ( bExp == 0x7FFF ) { 5076 if ((uint64_t)(bSig << 1)) { 5077 return propagateFloatx80NaN(a, b, status); 5078 } 5079 return packFloatx80(zSign ^ 1, floatx80_infinity_high, 5080 floatx80_infinity_low); 5081 } 5082 if ( aExp == 0 ) ++expDiff; 5083 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5084 bBigger: 5085 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5086 zExp = bExp; 5087 zSign ^= 1; 5088 goto normalizeRoundAndPack; 5089 aExpBigger: 5090 if ( aExp == 0x7FFF ) { 5091 if ((uint64_t)(aSig << 1)) { 5092 return propagateFloatx80NaN(a, b, status); 5093 } 5094 return a; 5095 } 5096 if ( bExp == 0 ) --expDiff; 5097 
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5098 aBigger: 5099 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5100 zExp = aExp; 5101 normalizeRoundAndPack: 5102 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5103 zSign, zExp, zSig0, zSig1, status); 5104 } 5105 5106 /*---------------------------------------------------------------------------- 5107 | Returns the result of adding the extended double-precision floating-point 5108 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5109 | Standard for Binary Floating-Point Arithmetic. 5110 *----------------------------------------------------------------------------*/ 5111 5112 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5113 { 5114 flag aSign, bSign; 5115 5116 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5117 float_raise(float_flag_invalid, status); 5118 return floatx80_default_nan(status); 5119 } 5120 aSign = extractFloatx80Sign( a ); 5121 bSign = extractFloatx80Sign( b ); 5122 if ( aSign == bSign ) { 5123 return addFloatx80Sigs(a, b, aSign, status); 5124 } 5125 else { 5126 return subFloatx80Sigs(a, b, aSign, status); 5127 } 5128 5129 } 5130 5131 /*---------------------------------------------------------------------------- 5132 | Returns the result of subtracting the extended double-precision floating- 5133 | point values `a' and `b'. The operation is performed according to the 5134 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5135 *----------------------------------------------------------------------------*/ 5136 5137 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5138 { 5139 flag aSign, bSign; 5140 5141 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5142 float_raise(float_flag_invalid, status); 5143 return floatx80_default_nan(status); 5144 } 5145 aSign = extractFloatx80Sign( a ); 5146 bSign = extractFloatx80Sign( b ); 5147 if ( aSign == bSign ) { 5148 return subFloatx80Sigs(a, b, aSign, status); 5149 } 5150 else { 5151 return addFloatx80Sigs(a, b, aSign, status); 5152 } 5153 5154 } 5155 5156 /*---------------------------------------------------------------------------- 5157 | Returns the result of multiplying the extended double-precision floating- 5158 | point values `a' and `b'. The operation is performed according to the 5159 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5160 *----------------------------------------------------------------------------*/ 5161 5162 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5163 { 5164 flag aSign, bSign, zSign; 5165 int32_t aExp, bExp, zExp; 5166 uint64_t aSig, bSig, zSig0, zSig1; 5167 5168 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5169 float_raise(float_flag_invalid, status); 5170 return floatx80_default_nan(status); 5171 } 5172 aSig = extractFloatx80Frac( a ); 5173 aExp = extractFloatx80Exp( a ); 5174 aSign = extractFloatx80Sign( a ); 5175 bSig = extractFloatx80Frac( b ); 5176 bExp = extractFloatx80Exp( b ); 5177 bSign = extractFloatx80Sign( b ); 5178 zSign = aSign ^ bSign; 5179 if ( aExp == 0x7FFF ) { 5180 if ( (uint64_t) ( aSig<<1 ) 5181 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5182 return propagateFloatx80NaN(a, b, status); 5183 } 5184 if ( ( bExp | bSig ) == 0 ) goto invalid; 5185 return packFloatx80(zSign, floatx80_infinity_high, 5186 floatx80_infinity_low); 5187 } 5188 if ( bExp == 0x7FFF ) { 5189 if ((uint64_t)(bSig << 
1)) { 5190 return propagateFloatx80NaN(a, b, status); 5191 } 5192 if ( ( aExp | aSig ) == 0 ) { 5193 invalid: 5194 float_raise(float_flag_invalid, status); 5195 return floatx80_default_nan(status); 5196 } 5197 return packFloatx80(zSign, floatx80_infinity_high, 5198 floatx80_infinity_low); 5199 } 5200 if ( aExp == 0 ) { 5201 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5202 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5203 } 5204 if ( bExp == 0 ) { 5205 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5206 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5207 } 5208 zExp = aExp + bExp - 0x3FFE; 5209 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5210 if ( 0 < (int64_t) zSig0 ) { 5211 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5212 --zExp; 5213 } 5214 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5215 zSign, zExp, zSig0, zSig1, status); 5216 } 5217 5218 /*---------------------------------------------------------------------------- 5219 | Returns the result of dividing the extended double-precision floating-point 5220 | value `a' by the corresponding value `b'. The operation is performed 5221 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5222 *----------------------------------------------------------------------------*/ 5223 5224 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5225 { 5226 flag aSign, bSign, zSign; 5227 int32_t aExp, bExp, zExp; 5228 uint64_t aSig, bSig, zSig0, zSig1; 5229 uint64_t rem0, rem1, rem2, term0, term1, term2; 5230 5231 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5232 float_raise(float_flag_invalid, status); 5233 return floatx80_default_nan(status); 5234 } 5235 aSig = extractFloatx80Frac( a ); 5236 aExp = extractFloatx80Exp( a ); 5237 aSign = extractFloatx80Sign( a ); 5238 bSig = extractFloatx80Frac( b ); 5239 bExp = extractFloatx80Exp( b ); 5240 bSign = extractFloatx80Sign( b ); 5241 zSign = aSign ^ bSign; 5242 if ( aExp == 0x7FFF ) { 5243 if ((uint64_t)(aSig << 1)) { 5244 return propagateFloatx80NaN(a, b, status); 5245 } 5246 if ( bExp == 0x7FFF ) { 5247 if ((uint64_t)(bSig << 1)) { 5248 return propagateFloatx80NaN(a, b, status); 5249 } 5250 goto invalid; 5251 } 5252 return packFloatx80(zSign, floatx80_infinity_high, 5253 floatx80_infinity_low); 5254 } 5255 if ( bExp == 0x7FFF ) { 5256 if ((uint64_t)(bSig << 1)) { 5257 return propagateFloatx80NaN(a, b, status); 5258 } 5259 return packFloatx80( zSign, 0, 0 ); 5260 } 5261 if ( bExp == 0 ) { 5262 if ( bSig == 0 ) { 5263 if ( ( aExp | aSig ) == 0 ) { 5264 invalid: 5265 float_raise(float_flag_invalid, status); 5266 return floatx80_default_nan(status); 5267 } 5268 float_raise(float_flag_divbyzero, status); 5269 return packFloatx80(zSign, floatx80_infinity_high, 5270 floatx80_infinity_low); 5271 } 5272 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5273 } 5274 if ( aExp == 0 ) { 5275 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5276 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5277 } 5278 zExp = aExp - bExp + 0x3FFE; 5279 rem1 = 0; 5280 if ( bSig <= aSig ) { 5281 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5282 ++zExp; 5283 } 5284 zSig0 = estimateDiv128To64( aSig, 
rem1, bSig ); 5285 mul64To128( bSig, zSig0, &term0, &term1 ); 5286 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5287 while ( (int64_t) rem0 < 0 ) { 5288 --zSig0; 5289 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5290 } 5291 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5292 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5293 mul64To128( bSig, zSig1, &term1, &term2 ); 5294 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5295 while ( (int64_t) rem1 < 0 ) { 5296 --zSig1; 5297 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5298 } 5299 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5300 } 5301 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5302 zSign, zExp, zSig0, zSig1, status); 5303 } 5304 5305 /*---------------------------------------------------------------------------- 5306 | Returns the remainder of the extended double-precision floating-point value 5307 | `a' with respect to the corresponding value `b'. The operation is performed 5308 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5309 *----------------------------------------------------------------------------*/ 5310 5311 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5312 { 5313 flag aSign, zSign; 5314 int32_t aExp, bExp, expDiff; 5315 uint64_t aSig0, aSig1, bSig; 5316 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5317 5318 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5319 float_raise(float_flag_invalid, status); 5320 return floatx80_default_nan(status); 5321 } 5322 aSig0 = extractFloatx80Frac( a ); 5323 aExp = extractFloatx80Exp( a ); 5324 aSign = extractFloatx80Sign( a ); 5325 bSig = extractFloatx80Frac( b ); 5326 bExp = extractFloatx80Exp( b ); 5327 if ( aExp == 0x7FFF ) { 5328 if ( (uint64_t) ( aSig0<<1 ) 5329 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5330 return propagateFloatx80NaN(a, b, status); 5331 } 5332 goto invalid; 5333 } 5334 if ( bExp == 0x7FFF ) { 5335 if ((uint64_t)(bSig << 1)) { 5336 return propagateFloatx80NaN(a, b, status); 5337 } 5338 return a; 5339 } 5340 if ( bExp == 0 ) { 5341 if ( bSig == 0 ) { 5342 invalid: 5343 float_raise(float_flag_invalid, status); 5344 return floatx80_default_nan(status); 5345 } 5346 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5347 } 5348 if ( aExp == 0 ) { 5349 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5350 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5351 } 5352 bSig |= LIT64( 0x8000000000000000 ); 5353 zSign = aSign; 5354 expDiff = aExp - bExp; 5355 aSig1 = 0; 5356 if ( expDiff < 0 ) { 5357 if ( expDiff < -1 ) return a; 5358 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5359 expDiff = 0; 5360 } 5361 q = ( bSig <= aSig0 ); 5362 if ( q ) aSig0 -= bSig; 5363 expDiff -= 64; 5364 while ( 0 < expDiff ) { 5365 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5366 q = ( 2 < q ) ? 
q - 2 : 0; 5367 mul64To128( bSig, q, &term0, &term1 ); 5368 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5369 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5370 expDiff -= 62; 5371 } 5372 expDiff += 64; 5373 if ( 0 < expDiff ) { 5374 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5375 q = ( 2 < q ) ? q - 2 : 0; 5376 q >>= 64 - expDiff; 5377 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5378 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5379 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5380 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5381 ++q; 5382 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5383 } 5384 } 5385 else { 5386 term1 = 0; 5387 term0 = bSig; 5388 } 5389 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5390 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5391 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5392 && ( q & 1 ) ) 5393 ) { 5394 aSig0 = alternateASig0; 5395 aSig1 = alternateASig1; 5396 zSign = ! zSign; 5397 } 5398 return 5399 normalizeRoundAndPackFloatx80( 5400 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5401 5402 } 5403 5404 /*---------------------------------------------------------------------------- 5405 | Returns the square root of the extended double-precision floating-point 5406 | value `a'. The operation is performed according to the IEC/IEEE Standard 5407 | for Binary Floating-Point Arithmetic. 
5408 *----------------------------------------------------------------------------*/ 5409 5410 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5411 { 5412 flag aSign; 5413 int32_t aExp, zExp; 5414 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5415 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5416 5417 if (floatx80_invalid_encoding(a)) { 5418 float_raise(float_flag_invalid, status); 5419 return floatx80_default_nan(status); 5420 } 5421 aSig0 = extractFloatx80Frac( a ); 5422 aExp = extractFloatx80Exp( a ); 5423 aSign = extractFloatx80Sign( a ); 5424 if ( aExp == 0x7FFF ) { 5425 if ((uint64_t)(aSig0 << 1)) { 5426 return propagateFloatx80NaN(a, a, status); 5427 } 5428 if ( ! aSign ) return a; 5429 goto invalid; 5430 } 5431 if ( aSign ) { 5432 if ( ( aExp | aSig0 ) == 0 ) return a; 5433 invalid: 5434 float_raise(float_flag_invalid, status); 5435 return floatx80_default_nan(status); 5436 } 5437 if ( aExp == 0 ) { 5438 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5439 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5440 } 5441 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5442 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5443 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5444 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5445 doubleZSig0 = zSig0<<1; 5446 mul64To128( zSig0, zSig0, &term0, &term1 ); 5447 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5448 while ( (int64_t) rem0 < 0 ) { 5449 --zSig0; 5450 doubleZSig0 -= 2; 5451 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5452 } 5453 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5454 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5455 if ( zSig1 == 0 ) zSig1 = 1; 5456 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5457 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5458 mul64To128( zSig1, zSig1, &term2, &term3 ); 5459 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5460 while ( (int64_t) rem1 < 0 ) { 
5461 --zSig1; 5462 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5463 term3 |= 1; 5464 term2 |= doubleZSig0; 5465 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5466 } 5467 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5468 } 5469 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5470 zSig0 |= doubleZSig0; 5471 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5472 0, zExp, zSig0, zSig1, status); 5473 } 5474 5475 /*---------------------------------------------------------------------------- 5476 | Returns 1 if the extended double-precision floating-point value `a' is equal 5477 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5478 | raised if either operand is a NaN. Otherwise, the comparison is performed 5479 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5480 *----------------------------------------------------------------------------*/ 5481 5482 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5483 { 5484 5485 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5486 || (extractFloatx80Exp(a) == 0x7FFF 5487 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5488 || (extractFloatx80Exp(b) == 0x7FFF 5489 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5490 ) { 5491 float_raise(float_flag_invalid, status); 5492 return 0; 5493 } 5494 return 5495 ( a.low == b.low ) 5496 && ( ( a.high == b.high ) 5497 || ( ( a.low == 0 ) 5498 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5499 ); 5500 5501 } 5502 5503 /*---------------------------------------------------------------------------- 5504 | Returns 1 if the extended double-precision floating-point value `a' is 5505 | less than or equal to the corresponding value `b', and 0 otherwise. The 5506 | invalid exception is raised if either operand is a NaN. The comparison is 5507 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5508 | Arithmetic. 
5509 *----------------------------------------------------------------------------*/ 5510 5511 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5512 { 5513 flag aSign, bSign; 5514 5515 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5516 || (extractFloatx80Exp(a) == 0x7FFF 5517 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5518 || (extractFloatx80Exp(b) == 0x7FFF 5519 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5520 ) { 5521 float_raise(float_flag_invalid, status); 5522 return 0; 5523 } 5524 aSign = extractFloatx80Sign( a ); 5525 bSign = extractFloatx80Sign( b ); 5526 if ( aSign != bSign ) { 5527 return 5528 aSign 5529 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5530 == 0 ); 5531 } 5532 return 5533 aSign ? le128( b.high, b.low, a.high, a.low ) 5534 : le128( a.high, a.low, b.high, b.low ); 5535 5536 } 5537 5538 /*---------------------------------------------------------------------------- 5539 | Returns 1 if the extended double-precision floating-point value `a' is 5540 | less than the corresponding value `b', and 0 otherwise. The invalid 5541 | exception is raised if either operand is a NaN. The comparison is performed 5542 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5543 *----------------------------------------------------------------------------*/ 5544 5545 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5546 { 5547 flag aSign, bSign; 5548 5549 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5550 || (extractFloatx80Exp(a) == 0x7FFF 5551 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5552 || (extractFloatx80Exp(b) == 0x7FFF 5553 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5554 ) { 5555 float_raise(float_flag_invalid, status); 5556 return 0; 5557 } 5558 aSign = extractFloatx80Sign( a ); 5559 bSign = extractFloatx80Sign( b ); 5560 if ( aSign != bSign ) { 5561 return 5562 aSign 5563 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5564 != 0 ); 5565 } 5566 return 5567 aSign ? lt128( b.high, b.low, a.high, a.low ) 5568 : lt128( a.high, a.low, b.high, b.low ); 5569 5570 } 5571 5572 /*---------------------------------------------------------------------------- 5573 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5574 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5575 | either operand is a NaN. The comparison is performed according to the 5576 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
5577 *----------------------------------------------------------------------------*/ 5578 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5579 { 5580 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5581 || (extractFloatx80Exp(a) == 0x7FFF 5582 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5583 || (extractFloatx80Exp(b) == 0x7FFF 5584 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5585 ) { 5586 float_raise(float_flag_invalid, status); 5587 return 1; 5588 } 5589 return 0; 5590 } 5591 5592 /*---------------------------------------------------------------------------- 5593 | Returns 1 if the extended double-precision floating-point value `a' is 5594 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5595 | cause an exception. The comparison is performed according to the IEC/IEEE 5596 | Standard for Binary Floating-Point Arithmetic. 5597 *----------------------------------------------------------------------------*/ 5598 5599 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5600 { 5601 5602 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5603 float_raise(float_flag_invalid, status); 5604 return 0; 5605 } 5606 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5607 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5608 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5609 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5610 ) { 5611 if (floatx80_is_signaling_nan(a, status) 5612 || floatx80_is_signaling_nan(b, status)) { 5613 float_raise(float_flag_invalid, status); 5614 } 5615 return 0; 5616 } 5617 return 5618 ( a.low == b.low ) 5619 && ( ( a.high == b.high ) 5620 || ( ( a.low == 0 ) 5621 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5622 ); 5623 5624 } 5625 5626 /*---------------------------------------------------------------------------- 5627 | Returns 1 if the extended double-precision floating-point value `a' is less 5628 | than or equal to the corresponding value `b', and 0 
otherwise. Quiet NaNs
| do not cause an exception. Otherwise, the comparison is performed according
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/

int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
{
    flag signA, signB;
    int aIsNaN, bIsNaN;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return 0;
    }

    aIsNaN = (extractFloatx80Exp(a) == 0x7FFF)
             && (uint64_t) (extractFloatx80Frac(a) << 1);
    bIsNaN = (extractFloatx80Exp(b) == 0x7FFF)
             && (uint64_t) (extractFloatx80Frac(b) << 1);
    if (aIsNaN || bIsNaN) {
        /* Only a signaling NaN raises invalid here. */
        if (floatx80_is_signaling_nan(a, status)
            || floatx80_is_signaling_nan(b, status)) {
            float_raise(float_flag_invalid, status);
        }
        return 0;
    }

    signA = extractFloatx80Sign(a);
    signB = extractFloatx80Sign(b);
    if (signA != signB) {
        if (signA) {
            /* negative <= non-negative */
            return 1;
        }
        /* Otherwise true only when both operands are zero. */
        return (((uint16_t) ((a.high | b.high) << 1)) | a.low | b.low) == 0;
    }

    /* Same sign: compare the raw encodings, swapped for negatives. */
    if (signA) {
        return le128(b.high, b.low, a.high, a.low);
    }
    return le128(a.high, a.low, b.high, b.low);
}

/*----------------------------------------------------------------------------
| Returns 1 if the extended double-precision floating-point value `a' is less
| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
| an exception. Otherwise, the comparison is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5671 *----------------------------------------------------------------------------*/ 5672 5673 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5674 { 5675 flag aSign, bSign; 5676 5677 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5678 float_raise(float_flag_invalid, status); 5679 return 0; 5680 } 5681 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5682 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5683 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5684 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5685 ) { 5686 if (floatx80_is_signaling_nan(a, status) 5687 || floatx80_is_signaling_nan(b, status)) { 5688 float_raise(float_flag_invalid, status); 5689 } 5690 return 0; 5691 } 5692 aSign = extractFloatx80Sign( a ); 5693 bSign = extractFloatx80Sign( b ); 5694 if ( aSign != bSign ) { 5695 return 5696 aSign 5697 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5698 != 0 ); 5699 } 5700 return 5701 aSign ? lt128( b.high, b.low, a.high, a.low ) 5702 : lt128( a.high, a.low, b.high, b.low ); 5703 5704 } 5705 5706 /*---------------------------------------------------------------------------- 5707 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5708 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5709 | The comparison is performed according to the IEC/IEEE Standard for Binary 5710 | Floating-Point Arithmetic. 
5711 *----------------------------------------------------------------------------*/ 5712 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5713 { 5714 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5715 float_raise(float_flag_invalid, status); 5716 return 1; 5717 } 5718 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5719 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5720 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5721 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5722 ) { 5723 if (floatx80_is_signaling_nan(a, status) 5724 || floatx80_is_signaling_nan(b, status)) { 5725 float_raise(float_flag_invalid, status); 5726 } 5727 return 1; 5728 } 5729 return 0; 5730 } 5731 5732 /*---------------------------------------------------------------------------- 5733 | Returns the result of converting the quadruple-precision floating-point 5734 | value `a' to the 32-bit two's complement integer format. The conversion 5735 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5736 | Arithmetic---which means in particular that the conversion is rounded 5737 | according to the current rounding mode. If `a' is a NaN, the largest 5738 | positive integer is returned. Otherwise, if the conversion overflows, the 5739 | largest integer with the same sign as `a' is returned. 
5740 *----------------------------------------------------------------------------*/ 5741 5742 int32_t float128_to_int32(float128 a, float_status *status) 5743 { 5744 flag aSign; 5745 int32_t aExp, shiftCount; 5746 uint64_t aSig0, aSig1; 5747 5748 aSig1 = extractFloat128Frac1( a ); 5749 aSig0 = extractFloat128Frac0( a ); 5750 aExp = extractFloat128Exp( a ); 5751 aSign = extractFloat128Sign( a ); 5752 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5753 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5754 aSig0 |= ( aSig1 != 0 ); 5755 shiftCount = 0x4028 - aExp; 5756 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5757 return roundAndPackInt32(aSign, aSig0, status); 5758 5759 } 5760 5761 /*---------------------------------------------------------------------------- 5762 | Returns the result of converting the quadruple-precision floating-point 5763 | value `a' to the 32-bit two's complement integer format. The conversion 5764 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5765 | Arithmetic, except that the conversion is always rounded toward zero. If 5766 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5767 | conversion overflows, the largest integer with the same sign as `a' is 5768 | returned. 
5769 *----------------------------------------------------------------------------*/ 5770 5771 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5772 { 5773 flag aSign; 5774 int32_t aExp, shiftCount; 5775 uint64_t aSig0, aSig1, savedASig; 5776 int32_t z; 5777 5778 aSig1 = extractFloat128Frac1( a ); 5779 aSig0 = extractFloat128Frac0( a ); 5780 aExp = extractFloat128Exp( a ); 5781 aSign = extractFloat128Sign( a ); 5782 aSig0 |= ( aSig1 != 0 ); 5783 if ( 0x401E < aExp ) { 5784 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5785 goto invalid; 5786 } 5787 else if ( aExp < 0x3FFF ) { 5788 if (aExp || aSig0) { 5789 status->float_exception_flags |= float_flag_inexact; 5790 } 5791 return 0; 5792 } 5793 aSig0 |= LIT64( 0x0001000000000000 ); 5794 shiftCount = 0x402F - aExp; 5795 savedASig = aSig0; 5796 aSig0 >>= shiftCount; 5797 z = aSig0; 5798 if ( aSign ) z = - z; 5799 if ( ( z < 0 ) ^ aSign ) { 5800 invalid: 5801 float_raise(float_flag_invalid, status); 5802 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5803 } 5804 if ( ( aSig0<<shiftCount ) != savedASig ) { 5805 status->float_exception_flags |= float_flag_inexact; 5806 } 5807 return z; 5808 5809 } 5810 5811 /*---------------------------------------------------------------------------- 5812 | Returns the result of converting the quadruple-precision floating-point 5813 | value `a' to the 64-bit two's complement integer format. The conversion 5814 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5815 | Arithmetic---which means in particular that the conversion is rounded 5816 | according to the current rounding mode. If `a' is a NaN, the largest 5817 | positive integer is returned. Otherwise, if the conversion overflows, the 5818 | largest integer with the same sign as `a' is returned. 
5819 *----------------------------------------------------------------------------*/ 5820 5821 int64_t float128_to_int64(float128 a, float_status *status) 5822 { 5823 flag aSign; 5824 int32_t aExp, shiftCount; 5825 uint64_t aSig0, aSig1; 5826 5827 aSig1 = extractFloat128Frac1( a ); 5828 aSig0 = extractFloat128Frac0( a ); 5829 aExp = extractFloat128Exp( a ); 5830 aSign = extractFloat128Sign( a ); 5831 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5832 shiftCount = 0x402F - aExp; 5833 if ( shiftCount <= 0 ) { 5834 if ( 0x403E < aExp ) { 5835 float_raise(float_flag_invalid, status); 5836 if ( ! aSign 5837 || ( ( aExp == 0x7FFF ) 5838 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5839 ) 5840 ) { 5841 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5842 } 5843 return (int64_t) LIT64( 0x8000000000000000 ); 5844 } 5845 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5846 } 5847 else { 5848 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5849 } 5850 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5851 5852 } 5853 5854 /*---------------------------------------------------------------------------- 5855 | Returns the result of converting the quadruple-precision floating-point 5856 | value `a' to the 64-bit two's complement integer format. The conversion 5857 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5858 | Arithmetic, except that the conversion is always rounded toward zero. 5859 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5860 | the conversion overflows, the largest integer with the same sign as `a' is 5861 | returned. 
5862 *----------------------------------------------------------------------------*/ 5863 5864 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5865 { 5866 flag aSign; 5867 int32_t aExp, shiftCount; 5868 uint64_t aSig0, aSig1; 5869 int64_t z; 5870 5871 aSig1 = extractFloat128Frac1( a ); 5872 aSig0 = extractFloat128Frac0( a ); 5873 aExp = extractFloat128Exp( a ); 5874 aSign = extractFloat128Sign( a ); 5875 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5876 shiftCount = aExp - 0x402F; 5877 if ( 0 < shiftCount ) { 5878 if ( 0x403E <= aExp ) { 5879 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5880 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5881 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5882 if (aSig1) { 5883 status->float_exception_flags |= float_flag_inexact; 5884 } 5885 } 5886 else { 5887 float_raise(float_flag_invalid, status); 5888 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5889 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5890 } 5891 } 5892 return (int64_t) LIT64( 0x8000000000000000 ); 5893 } 5894 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5895 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5896 status->float_exception_flags |= float_flag_inexact; 5897 } 5898 } 5899 else { 5900 if ( aExp < 0x3FFF ) { 5901 if ( aExp | aSig0 | aSig1 ) { 5902 status->float_exception_flags |= float_flag_inexact; 5903 } 5904 return 0; 5905 } 5906 z = aSig0>>( - shiftCount ); 5907 if ( aSig1 5908 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5909 status->float_exception_flags |= float_flag_inexact; 5910 } 5911 } 5912 if ( aSign ) z = - z; 5913 return z; 5914 5915 } 5916 5917 /*---------------------------------------------------------------------------- 5918 | Returns the result of converting the quadruple-precision floating-point value 5919 | `a' to the 64-bit unsigned integer format. 
The conversion is 5920 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5921 | Arithmetic---which means in particular that the conversion is rounded 5922 | according to the current rounding mode. If `a' is a NaN, the largest 5923 | positive integer is returned. If the conversion overflows, the 5924 | largest unsigned integer is returned. If 'a' is negative, the value is 5925 | rounded and zero is returned; negative values that do not round to zero 5926 | will raise the inexact exception. 5927 *----------------------------------------------------------------------------*/ 5928 5929 uint64_t float128_to_uint64(float128 a, float_status *status) 5930 { 5931 flag aSign; 5932 int aExp; 5933 int shiftCount; 5934 uint64_t aSig0, aSig1; 5935 5936 aSig0 = extractFloat128Frac0(a); 5937 aSig1 = extractFloat128Frac1(a); 5938 aExp = extractFloat128Exp(a); 5939 aSign = extractFloat128Sign(a); 5940 if (aSign && (aExp > 0x3FFE)) { 5941 float_raise(float_flag_invalid, status); 5942 if (float128_is_any_nan(a)) { 5943 return LIT64(0xFFFFFFFFFFFFFFFF); 5944 } else { 5945 return 0; 5946 } 5947 } 5948 if (aExp) { 5949 aSig0 |= LIT64(0x0001000000000000); 5950 } 5951 shiftCount = 0x402F - aExp; 5952 if (shiftCount <= 0) { 5953 if (0x403E < aExp) { 5954 float_raise(float_flag_invalid, status); 5955 return LIT64(0xFFFFFFFFFFFFFFFF); 5956 } 5957 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5958 } else { 5959 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5960 } 5961 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5962 } 5963 5964 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5965 { 5966 uint64_t v; 5967 signed char current_rounding_mode = status->float_rounding_mode; 5968 5969 set_float_rounding_mode(float_round_to_zero, status); 5970 v = float128_to_uint64(a, status); 5971 set_float_rounding_mode(current_rounding_mode, status); 5972 5973 return v; 5974 } 5975 5976 
/*---------------------------------------------------------------------------- 5977 | Returns the result of converting the quadruple-precision floating-point 5978 | value `a' to the 32-bit unsigned integer format. The conversion 5979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5980 | Arithmetic except that the conversion is always rounded toward zero. 5981 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5982 | if the conversion overflows, the largest unsigned integer is returned. 5983 | If 'a' is negative, the value is rounded and zero is returned; negative 5984 | values that do not round to zero will raise the inexact exception. 5985 *----------------------------------------------------------------------------*/ 5986 5987 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5988 { 5989 uint64_t v; 5990 uint32_t res; 5991 int old_exc_flags = get_float_exception_flags(status); 5992 5993 v = float128_to_uint64_round_to_zero(a, status); 5994 if (v > 0xffffffff) { 5995 res = 0xffffffff; 5996 } else { 5997 return v; 5998 } 5999 set_float_exception_flags(old_exc_flags, status); 6000 float_raise(float_flag_invalid, status); 6001 return res; 6002 } 6003 6004 /*---------------------------------------------------------------------------- 6005 | Returns the result of converting the quadruple-precision floating-point 6006 | value `a' to the single-precision floating-point format. The conversion 6007 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6008 | Arithmetic. 
6009 *----------------------------------------------------------------------------*/ 6010 6011 float32 float128_to_float32(float128 a, float_status *status) 6012 { 6013 flag aSign; 6014 int32_t aExp; 6015 uint64_t aSig0, aSig1; 6016 uint32_t zSig; 6017 6018 aSig1 = extractFloat128Frac1( a ); 6019 aSig0 = extractFloat128Frac0( a ); 6020 aExp = extractFloat128Exp( a ); 6021 aSign = extractFloat128Sign( a ); 6022 if ( aExp == 0x7FFF ) { 6023 if ( aSig0 | aSig1 ) { 6024 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 6025 } 6026 return packFloat32( aSign, 0xFF, 0 ); 6027 } 6028 aSig0 |= ( aSig1 != 0 ); 6029 shift64RightJamming( aSig0, 18, &aSig0 ); 6030 zSig = aSig0; 6031 if ( aExp || zSig ) { 6032 zSig |= 0x40000000; 6033 aExp -= 0x3F81; 6034 } 6035 return roundAndPackFloat32(aSign, aExp, zSig, status); 6036 6037 } 6038 6039 /*---------------------------------------------------------------------------- 6040 | Returns the result of converting the quadruple-precision floating-point 6041 | value `a' to the double-precision floating-point format. The conversion 6042 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 6043 | Arithmetic. 
6044 *----------------------------------------------------------------------------*/ 6045 6046 float64 float128_to_float64(float128 a, float_status *status) 6047 { 6048 flag aSign; 6049 int32_t aExp; 6050 uint64_t aSig0, aSig1; 6051 6052 aSig1 = extractFloat128Frac1( a ); 6053 aSig0 = extractFloat128Frac0( a ); 6054 aExp = extractFloat128Exp( a ); 6055 aSign = extractFloat128Sign( a ); 6056 if ( aExp == 0x7FFF ) { 6057 if ( aSig0 | aSig1 ) { 6058 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 6059 } 6060 return packFloat64( aSign, 0x7FF, 0 ); 6061 } 6062 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6063 aSig0 |= ( aSig1 != 0 ); 6064 if ( aExp || aSig0 ) { 6065 aSig0 |= LIT64( 0x4000000000000000 ); 6066 aExp -= 0x3C01; 6067 } 6068 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6069 6070 } 6071 6072 /*---------------------------------------------------------------------------- 6073 | Returns the result of converting the quadruple-precision floating-point 6074 | value `a' to the extended double-precision floating-point format. The 6075 | conversion is performed according to the IEC/IEEE Standard for Binary 6076 | Floating-Point Arithmetic. 
6077 *----------------------------------------------------------------------------*/ 6078 6079 floatx80 float128_to_floatx80(float128 a, float_status *status) 6080 { 6081 flag aSign; 6082 int32_t aExp; 6083 uint64_t aSig0, aSig1; 6084 6085 aSig1 = extractFloat128Frac1( a ); 6086 aSig0 = extractFloat128Frac0( a ); 6087 aExp = extractFloat128Exp( a ); 6088 aSign = extractFloat128Sign( a ); 6089 if ( aExp == 0x7FFF ) { 6090 if ( aSig0 | aSig1 ) { 6091 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6092 } 6093 return packFloatx80(aSign, floatx80_infinity_high, 6094 floatx80_infinity_low); 6095 } 6096 if ( aExp == 0 ) { 6097 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6098 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6099 } 6100 else { 6101 aSig0 |= LIT64( 0x0001000000000000 ); 6102 } 6103 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6104 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6105 6106 } 6107 6108 /*---------------------------------------------------------------------------- 6109 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6110 | returns the result as a quadruple-precision floating-point value. The 6111 | operation is performed according to the IEC/IEEE Standard for Binary 6112 | Floating-Point Arithmetic. 
6113 *----------------------------------------------------------------------------*/ 6114 6115 float128 float128_round_to_int(float128 a, float_status *status) 6116 { 6117 flag aSign; 6118 int32_t aExp; 6119 uint64_t lastBitMask, roundBitsMask; 6120 float128 z; 6121 6122 aExp = extractFloat128Exp( a ); 6123 if ( 0x402F <= aExp ) { 6124 if ( 0x406F <= aExp ) { 6125 if ( ( aExp == 0x7FFF ) 6126 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6127 ) { 6128 return propagateFloat128NaN(a, a, status); 6129 } 6130 return a; 6131 } 6132 lastBitMask = 1; 6133 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6134 roundBitsMask = lastBitMask - 1; 6135 z = a; 6136 switch (status->float_rounding_mode) { 6137 case float_round_nearest_even: 6138 if ( lastBitMask ) { 6139 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6140 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6141 } 6142 else { 6143 if ( (int64_t) z.low < 0 ) { 6144 ++z.high; 6145 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6146 } 6147 } 6148 break; 6149 case float_round_ties_away: 6150 if (lastBitMask) { 6151 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6152 } else { 6153 if ((int64_t) z.low < 0) { 6154 ++z.high; 6155 } 6156 } 6157 break; 6158 case float_round_to_zero: 6159 break; 6160 case float_round_up: 6161 if (!extractFloat128Sign(z)) { 6162 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6163 } 6164 break; 6165 case float_round_down: 6166 if (extractFloat128Sign(z)) { 6167 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6168 } 6169 break; 6170 default: 6171 abort(); 6172 } 6173 z.low &= ~ roundBitsMask; 6174 } 6175 else { 6176 if ( aExp < 0x3FFF ) { 6177 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6178 status->float_exception_flags |= float_flag_inexact; 6179 aSign = extractFloat128Sign( a ); 6180 switch (status->float_rounding_mode) { 6181 case float_round_nearest_even: 6182 if ( ( aExp == 0x3FFE ) 6183 && ( 
extractFloat128Frac0( a ) 6184 | extractFloat128Frac1( a ) ) 6185 ) { 6186 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6187 } 6188 break; 6189 case float_round_ties_away: 6190 if (aExp == 0x3FFE) { 6191 return packFloat128(aSign, 0x3FFF, 0, 0); 6192 } 6193 break; 6194 case float_round_down: 6195 return 6196 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6197 : packFloat128( 0, 0, 0, 0 ); 6198 case float_round_up: 6199 return 6200 aSign ? packFloat128( 1, 0, 0, 0 ) 6201 : packFloat128( 0, 0x3FFF, 0, 0 ); 6202 } 6203 return packFloat128( aSign, 0, 0, 0 ); 6204 } 6205 lastBitMask = 1; 6206 lastBitMask <<= 0x402F - aExp; 6207 roundBitsMask = lastBitMask - 1; 6208 z.low = 0; 6209 z.high = a.high; 6210 switch (status->float_rounding_mode) { 6211 case float_round_nearest_even: 6212 z.high += lastBitMask>>1; 6213 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6214 z.high &= ~ lastBitMask; 6215 } 6216 break; 6217 case float_round_ties_away: 6218 z.high += lastBitMask>>1; 6219 break; 6220 case float_round_to_zero: 6221 break; 6222 case float_round_up: 6223 if (!extractFloat128Sign(z)) { 6224 z.high |= ( a.low != 0 ); 6225 z.high += roundBitsMask; 6226 } 6227 break; 6228 case float_round_down: 6229 if (extractFloat128Sign(z)) { 6230 z.high |= (a.low != 0); 6231 z.high += roundBitsMask; 6232 } 6233 break; 6234 default: 6235 abort(); 6236 } 6237 z.high &= ~ roundBitsMask; 6238 } 6239 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6240 status->float_exception_flags |= float_flag_inexact; 6241 } 6242 return z; 6243 6244 } 6245 6246 /*---------------------------------------------------------------------------- 6247 | Returns the result of adding the absolute values of the quadruple-precision 6248 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6249 | before being returned. `zSign' is ignored if the result is a NaN. 6250 | The addition is performed according to the IEC/IEEE Standard for Binary 6251 | Floating-Point Arithmetic. 
6252 *----------------------------------------------------------------------------*/ 6253 6254 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6255 float_status *status) 6256 { 6257 int32_t aExp, bExp, zExp; 6258 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6259 int32_t expDiff; 6260 6261 aSig1 = extractFloat128Frac1( a ); 6262 aSig0 = extractFloat128Frac0( a ); 6263 aExp = extractFloat128Exp( a ); 6264 bSig1 = extractFloat128Frac1( b ); 6265 bSig0 = extractFloat128Frac0( b ); 6266 bExp = extractFloat128Exp( b ); 6267 expDiff = aExp - bExp; 6268 if ( 0 < expDiff ) { 6269 if ( aExp == 0x7FFF ) { 6270 if (aSig0 | aSig1) { 6271 return propagateFloat128NaN(a, b, status); 6272 } 6273 return a; 6274 } 6275 if ( bExp == 0 ) { 6276 --expDiff; 6277 } 6278 else { 6279 bSig0 |= LIT64( 0x0001000000000000 ); 6280 } 6281 shift128ExtraRightJamming( 6282 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6283 zExp = aExp; 6284 } 6285 else if ( expDiff < 0 ) { 6286 if ( bExp == 0x7FFF ) { 6287 if (bSig0 | bSig1) { 6288 return propagateFloat128NaN(a, b, status); 6289 } 6290 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6291 } 6292 if ( aExp == 0 ) { 6293 ++expDiff; 6294 } 6295 else { 6296 aSig0 |= LIT64( 0x0001000000000000 ); 6297 } 6298 shift128ExtraRightJamming( 6299 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6300 zExp = bExp; 6301 } 6302 else { 6303 if ( aExp == 0x7FFF ) { 6304 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6305 return propagateFloat128NaN(a, b, status); 6306 } 6307 return a; 6308 } 6309 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6310 if ( aExp == 0 ) { 6311 if (status->flush_to_zero) { 6312 if (zSig0 | zSig1) { 6313 float_raise(float_flag_output_denormal, status); 6314 } 6315 return packFloat128(zSign, 0, 0, 0); 6316 } 6317 return packFloat128( zSign, 0, zSig0, zSig1 ); 6318 } 6319 zSig2 = 0; 6320 zSig0 |= LIT64( 0x0002000000000000 ); 6321 zExp = aExp; 6322 goto shiftRight1; 6323 } 6324 aSig0 |= LIT64( 
0x0001000000000000 ); 6325 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6326 --zExp; 6327 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6328 ++zExp; 6329 shiftRight1: 6330 shift128ExtraRightJamming( 6331 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6332 roundAndPack: 6333 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6334 6335 } 6336 6337 /*---------------------------------------------------------------------------- 6338 | Returns the result of subtracting the absolute values of the quadruple- 6339 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6340 | difference is negated before being returned. `zSign' is ignored if the 6341 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6342 | Standard for Binary Floating-Point Arithmetic. 6343 *----------------------------------------------------------------------------*/ 6344 6345 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6346 float_status *status) 6347 { 6348 int32_t aExp, bExp, zExp; 6349 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6350 int32_t expDiff; 6351 6352 aSig1 = extractFloat128Frac1( a ); 6353 aSig0 = extractFloat128Frac0( a ); 6354 aExp = extractFloat128Exp( a ); 6355 bSig1 = extractFloat128Frac1( b ); 6356 bSig0 = extractFloat128Frac0( b ); 6357 bExp = extractFloat128Exp( b ); 6358 expDiff = aExp - bExp; 6359 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6360 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6361 if ( 0 < expDiff ) goto aExpBigger; 6362 if ( expDiff < 0 ) goto bExpBigger; 6363 if ( aExp == 0x7FFF ) { 6364 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6365 return propagateFloat128NaN(a, b, status); 6366 } 6367 float_raise(float_flag_invalid, status); 6368 return float128_default_nan(status); 6369 } 6370 if ( aExp == 0 ) { 6371 aExp = 1; 6372 bExp = 1; 6373 } 6374 if ( bSig0 < aSig0 ) goto aBigger; 6375 if ( aSig0 < bSig0 ) goto bBigger; 6376 if ( bSig1 < 
aSig1 ) goto aBigger; 6377 if ( aSig1 < bSig1 ) goto bBigger; 6378 return packFloat128(status->float_rounding_mode == float_round_down, 6379 0, 0, 0); 6380 bExpBigger: 6381 if ( bExp == 0x7FFF ) { 6382 if (bSig0 | bSig1) { 6383 return propagateFloat128NaN(a, b, status); 6384 } 6385 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6386 } 6387 if ( aExp == 0 ) { 6388 ++expDiff; 6389 } 6390 else { 6391 aSig0 |= LIT64( 0x4000000000000000 ); 6392 } 6393 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6394 bSig0 |= LIT64( 0x4000000000000000 ); 6395 bBigger: 6396 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6397 zExp = bExp; 6398 zSign ^= 1; 6399 goto normalizeRoundAndPack; 6400 aExpBigger: 6401 if ( aExp == 0x7FFF ) { 6402 if (aSig0 | aSig1) { 6403 return propagateFloat128NaN(a, b, status); 6404 } 6405 return a; 6406 } 6407 if ( bExp == 0 ) { 6408 --expDiff; 6409 } 6410 else { 6411 bSig0 |= LIT64( 0x4000000000000000 ); 6412 } 6413 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6414 aSig0 |= LIT64( 0x4000000000000000 ); 6415 aBigger: 6416 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6417 zExp = aExp; 6418 normalizeRoundAndPack: 6419 --zExp; 6420 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6421 status); 6422 6423 } 6424 6425 /*---------------------------------------------------------------------------- 6426 | Returns the result of adding the quadruple-precision floating-point values 6427 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6428 | for Binary Floating-Point Arithmetic. 
6429 *----------------------------------------------------------------------------*/ 6430 6431 float128 float128_add(float128 a, float128 b, float_status *status) 6432 { 6433 flag aSign, bSign; 6434 6435 aSign = extractFloat128Sign( a ); 6436 bSign = extractFloat128Sign( b ); 6437 if ( aSign == bSign ) { 6438 return addFloat128Sigs(a, b, aSign, status); 6439 } 6440 else { 6441 return subFloat128Sigs(a, b, aSign, status); 6442 } 6443 6444 } 6445 6446 /*---------------------------------------------------------------------------- 6447 | Returns the result of subtracting the quadruple-precision floating-point 6448 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6449 | Standard for Binary Floating-Point Arithmetic. 6450 *----------------------------------------------------------------------------*/ 6451 6452 float128 float128_sub(float128 a, float128 b, float_status *status) 6453 { 6454 flag aSign, bSign; 6455 6456 aSign = extractFloat128Sign( a ); 6457 bSign = extractFloat128Sign( b ); 6458 if ( aSign == bSign ) { 6459 return subFloat128Sigs(a, b, aSign, status); 6460 } 6461 else { 6462 return addFloat128Sigs(a, b, aSign, status); 6463 } 6464 6465 } 6466 6467 /*---------------------------------------------------------------------------- 6468 | Returns the result of multiplying the quadruple-precision floating-point 6469 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6470 | Standard for Binary Floating-Point Arithmetic. 
6471 *----------------------------------------------------------------------------*/ 6472 6473 float128 float128_mul(float128 a, float128 b, float_status *status) 6474 { 6475 flag aSign, bSign, zSign; 6476 int32_t aExp, bExp, zExp; 6477 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6478 6479 aSig1 = extractFloat128Frac1( a ); 6480 aSig0 = extractFloat128Frac0( a ); 6481 aExp = extractFloat128Exp( a ); 6482 aSign = extractFloat128Sign( a ); 6483 bSig1 = extractFloat128Frac1( b ); 6484 bSig0 = extractFloat128Frac0( b ); 6485 bExp = extractFloat128Exp( b ); 6486 bSign = extractFloat128Sign( b ); 6487 zSign = aSign ^ bSign; 6488 if ( aExp == 0x7FFF ) { 6489 if ( ( aSig0 | aSig1 ) 6490 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6491 return propagateFloat128NaN(a, b, status); 6492 } 6493 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6494 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6495 } 6496 if ( bExp == 0x7FFF ) { 6497 if (bSig0 | bSig1) { 6498 return propagateFloat128NaN(a, b, status); 6499 } 6500 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6501 invalid: 6502 float_raise(float_flag_invalid, status); 6503 return float128_default_nan(status); 6504 } 6505 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6506 } 6507 if ( aExp == 0 ) { 6508 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6509 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6510 } 6511 if ( bExp == 0 ) { 6512 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6513 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6514 } 6515 zExp = aExp + bExp - 0x4000; 6516 aSig0 |= LIT64( 0x0001000000000000 ); 6517 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6518 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6519 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6520 zSig2 |= ( zSig3 != 0 ); 6521 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6522 shift128ExtraRightJamming( 6523 
zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6524 ++zExp; 6525 } 6526 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6527 6528 } 6529 6530 /*---------------------------------------------------------------------------- 6531 | Returns the result of dividing the quadruple-precision floating-point value 6532 | `a' by the corresponding value `b'. The operation is performed according to 6533 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6534 *----------------------------------------------------------------------------*/ 6535 6536 float128 float128_div(float128 a, float128 b, float_status *status) 6537 { 6538 flag aSign, bSign, zSign; 6539 int32_t aExp, bExp, zExp; 6540 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6541 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6542 6543 aSig1 = extractFloat128Frac1( a ); 6544 aSig0 = extractFloat128Frac0( a ); 6545 aExp = extractFloat128Exp( a ); 6546 aSign = extractFloat128Sign( a ); 6547 bSig1 = extractFloat128Frac1( b ); 6548 bSig0 = extractFloat128Frac0( b ); 6549 bExp = extractFloat128Exp( b ); 6550 bSign = extractFloat128Sign( b ); 6551 zSign = aSign ^ bSign; 6552 if ( aExp == 0x7FFF ) { 6553 if (aSig0 | aSig1) { 6554 return propagateFloat128NaN(a, b, status); 6555 } 6556 if ( bExp == 0x7FFF ) { 6557 if (bSig0 | bSig1) { 6558 return propagateFloat128NaN(a, b, status); 6559 } 6560 goto invalid; 6561 } 6562 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6563 } 6564 if ( bExp == 0x7FFF ) { 6565 if (bSig0 | bSig1) { 6566 return propagateFloat128NaN(a, b, status); 6567 } 6568 return packFloat128( zSign, 0, 0, 0 ); 6569 } 6570 if ( bExp == 0 ) { 6571 if ( ( bSig0 | bSig1 ) == 0 ) { 6572 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6573 invalid: 6574 float_raise(float_flag_invalid, status); 6575 return float128_default_nan(status); 6576 } 6577 float_raise(float_flag_divbyzero, status); 6578 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6579 } 6580 normalizeFloat128Subnormal( 
bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6581 } 6582 if ( aExp == 0 ) { 6583 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6584 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6585 } 6586 zExp = aExp - bExp + 0x3FFD; 6587 shortShift128Left( 6588 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6589 shortShift128Left( 6590 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6591 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6592 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6593 ++zExp; 6594 } 6595 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6596 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6597 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6598 while ( (int64_t) rem0 < 0 ) { 6599 --zSig0; 6600 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6601 } 6602 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6603 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6604 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6605 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6606 while ( (int64_t) rem1 < 0 ) { 6607 --zSig1; 6608 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6609 } 6610 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6611 } 6612 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6613 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6614 6615 } 6616 6617 /*---------------------------------------------------------------------------- 6618 | Returns the remainder of the quadruple-precision floating-point value `a' 6619 | with respect to the corresponding value `b'. The operation is performed 6620 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6621 *----------------------------------------------------------------------------*/ 6622 6623 float128 float128_rem(float128 a, float128 b, float_status *status) 6624 { 6625 flag aSign, zSign; 6626 int32_t aExp, bExp, expDiff; 6627 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6628 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6629 int64_t sigMean0; 6630 6631 aSig1 = extractFloat128Frac1( a ); 6632 aSig0 = extractFloat128Frac0( a ); 6633 aExp = extractFloat128Exp( a ); 6634 aSign = extractFloat128Sign( a ); 6635 bSig1 = extractFloat128Frac1( b ); 6636 bSig0 = extractFloat128Frac0( b ); 6637 bExp = extractFloat128Exp( b ); 6638 if ( aExp == 0x7FFF ) { 6639 if ( ( aSig0 | aSig1 ) 6640 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6641 return propagateFloat128NaN(a, b, status); 6642 } 6643 goto invalid; 6644 } 6645 if ( bExp == 0x7FFF ) { 6646 if (bSig0 | bSig1) { 6647 return propagateFloat128NaN(a, b, status); 6648 } 6649 return a; 6650 } 6651 if ( bExp == 0 ) { 6652 if ( ( bSig0 | bSig1 ) == 0 ) { 6653 invalid: 6654 float_raise(float_flag_invalid, status); 6655 return float128_default_nan(status); 6656 } 6657 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6658 } 6659 if ( aExp == 0 ) { 6660 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6661 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6662 } 6663 expDiff = aExp - bExp; 6664 if ( expDiff < -1 ) return a; 6665 shortShift128Left( 6666 aSig0 | LIT64( 0x0001000000000000 ), 6667 aSig1, 6668 15 - ( expDiff < 0 ), 6669 &aSig0, 6670 &aSig1 6671 ); 6672 shortShift128Left( 6673 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6674 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6675 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6676 expDiff -= 64; 6677 while ( 0 < expDiff ) { 6678 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6679 q = ( 4 < q ) ? 
q - 4 : 0; 6680 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6681 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6682 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6683 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6684 expDiff -= 61; 6685 } 6686 if ( -64 < expDiff ) { 6687 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6688 q = ( 4 < q ) ? q - 4 : 0; 6689 q >>= - expDiff; 6690 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6691 expDiff += 52; 6692 if ( expDiff < 0 ) { 6693 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6694 } 6695 else { 6696 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6697 } 6698 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6699 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6700 } 6701 else { 6702 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6703 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6704 } 6705 do { 6706 alternateASig0 = aSig0; 6707 alternateASig1 = aSig1; 6708 ++q; 6709 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6710 } while ( 0 <= (int64_t) aSig0 ); 6711 add128( 6712 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6713 if ( ( sigMean0 < 0 ) 6714 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6715 aSig0 = alternateASig0; 6716 aSig1 = alternateASig1; 6717 } 6718 zSign = ( (int64_t) aSig0 < 0 ); 6719 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6720 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6721 status); 6722 } 6723 6724 /*---------------------------------------------------------------------------- 6725 | Returns the square root of the quadruple-precision floating-point value `a'. 6726 | The operation is performed according to the IEC/IEEE Standard for Binary 6727 | Floating-Point Arithmetic. 
6728 *----------------------------------------------------------------------------*/ 6729 6730 float128 float128_sqrt(float128 a, float_status *status) 6731 { 6732 flag aSign; 6733 int32_t aExp, zExp; 6734 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6735 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6736 6737 aSig1 = extractFloat128Frac1( a ); 6738 aSig0 = extractFloat128Frac0( a ); 6739 aExp = extractFloat128Exp( a ); 6740 aSign = extractFloat128Sign( a ); 6741 if ( aExp == 0x7FFF ) { 6742 if (aSig0 | aSig1) { 6743 return propagateFloat128NaN(a, a, status); 6744 } 6745 if ( ! aSign ) return a; 6746 goto invalid; 6747 } 6748 if ( aSign ) { 6749 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6750 invalid: 6751 float_raise(float_flag_invalid, status); 6752 return float128_default_nan(status); 6753 } 6754 if ( aExp == 0 ) { 6755 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6756 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6757 } 6758 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6759 aSig0 |= LIT64( 0x0001000000000000 ); 6760 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6761 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6762 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6763 doubleZSig0 = zSig0<<1; 6764 mul64To128( zSig0, zSig0, &term0, &term1 ); 6765 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6766 while ( (int64_t) rem0 < 0 ) { 6767 --zSig0; 6768 doubleZSig0 -= 2; 6769 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6770 } 6771 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6772 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6773 if ( zSig1 == 0 ) zSig1 = 1; 6774 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6775 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6776 mul64To128( zSig1, zSig1, &term2, &term3 ); 6777 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6778 while ( (int64_t) rem1 < 0 ) { 6779 --zSig1; 6780 
shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6781 term3 |= 1; 6782 term2 |= doubleZSig0; 6783 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6784 } 6785 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6786 } 6787 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6788 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6789 6790 } 6791 6792 /*---------------------------------------------------------------------------- 6793 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6794 | the corresponding value `b', and 0 otherwise. The invalid exception is 6795 | raised if either operand is a NaN. Otherwise, the comparison is performed 6796 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6797 *----------------------------------------------------------------------------*/ 6798 6799 int float128_eq(float128 a, float128 b, float_status *status) 6800 { 6801 6802 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6803 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6804 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6805 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6806 ) { 6807 float_raise(float_flag_invalid, status); 6808 return 0; 6809 } 6810 return 6811 ( a.low == b.low ) 6812 && ( ( a.high == b.high ) 6813 || ( ( a.low == 0 ) 6814 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6815 ); 6816 6817 } 6818 6819 /*---------------------------------------------------------------------------- 6820 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6821 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6822 | exception is raised if either operand is a NaN. The comparison is performed 6823 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6824 *----------------------------------------------------------------------------*/ 6825 6826 int float128_le(float128 a, float128 b, float_status *status) 6827 { 6828 flag aSign, bSign; 6829 6830 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6831 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6832 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6833 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6834 ) { 6835 float_raise(float_flag_invalid, status); 6836 return 0; 6837 } 6838 aSign = extractFloat128Sign( a ); 6839 bSign = extractFloat128Sign( b ); 6840 if ( aSign != bSign ) { 6841 return 6842 aSign 6843 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6844 == 0 ); 6845 } 6846 return 6847 aSign ? le128( b.high, b.low, a.high, a.low ) 6848 : le128( a.high, a.low, b.high, b.low ); 6849 6850 } 6851 6852 /*---------------------------------------------------------------------------- 6853 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6854 | the corresponding value `b', and 0 otherwise. The invalid exception is 6855 | raised if either operand is a NaN. The comparison is performed according 6856 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6857 *----------------------------------------------------------------------------*/ 6858 6859 int float128_lt(float128 a, float128 b, float_status *status) 6860 { 6861 flag aSign, bSign; 6862 6863 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6864 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6865 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6866 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6867 ) { 6868 float_raise(float_flag_invalid, status); 6869 return 0; 6870 } 6871 aSign = extractFloat128Sign( a ); 6872 bSign = extractFloat128Sign( b ); 6873 if ( aSign != bSign ) { 6874 return 6875 aSign 6876 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6877 != 0 ); 6878 } 6879 return 6880 aSign ? 
lt128( b.high, b.low, a.high, a.low ) 6881 : lt128( a.high, a.low, b.high, b.low ); 6882 6883 } 6884 6885 /*---------------------------------------------------------------------------- 6886 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6887 | be compared, and 0 otherwise. The invalid exception is raised if either 6888 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6889 | Standard for Binary Floating-Point Arithmetic. 6890 *----------------------------------------------------------------------------*/ 6891 6892 int float128_unordered(float128 a, float128 b, float_status *status) 6893 { 6894 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6895 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6896 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6897 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6898 ) { 6899 float_raise(float_flag_invalid, status); 6900 return 1; 6901 } 6902 return 0; 6903 } 6904 6905 /*---------------------------------------------------------------------------- 6906 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6907 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6908 | exception. The comparison is performed according to the IEC/IEEE Standard 6909 | for Binary Floating-Point Arithmetic. 
6910 *----------------------------------------------------------------------------*/ 6911 6912 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6913 { 6914 6915 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6916 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6917 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6918 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6919 ) { 6920 if (float128_is_signaling_nan(a, status) 6921 || float128_is_signaling_nan(b, status)) { 6922 float_raise(float_flag_invalid, status); 6923 } 6924 return 0; 6925 } 6926 return 6927 ( a.low == b.low ) 6928 && ( ( a.high == b.high ) 6929 || ( ( a.low == 0 ) 6930 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6931 ); 6932 6933 } 6934 6935 /*---------------------------------------------------------------------------- 6936 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6937 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6938 | cause an exception. Otherwise, the comparison is performed according to the 6939 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 
6940 *----------------------------------------------------------------------------*/ 6941 6942 int float128_le_quiet(float128 a, float128 b, float_status *status) 6943 { 6944 flag aSign, bSign; 6945 6946 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6947 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6948 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6949 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6950 ) { 6951 if (float128_is_signaling_nan(a, status) 6952 || float128_is_signaling_nan(b, status)) { 6953 float_raise(float_flag_invalid, status); 6954 } 6955 return 0; 6956 } 6957 aSign = extractFloat128Sign( a ); 6958 bSign = extractFloat128Sign( b ); 6959 if ( aSign != bSign ) { 6960 return 6961 aSign 6962 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6963 == 0 ); 6964 } 6965 return 6966 aSign ? le128( b.high, b.low, a.high, a.low ) 6967 : le128( a.high, a.low, b.high, b.low ); 6968 6969 } 6970 6971 /*---------------------------------------------------------------------------- 6972 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6973 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6974 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6975 | Standard for Binary Floating-Point Arithmetic. 
6976 *----------------------------------------------------------------------------*/ 6977 6978 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6979 { 6980 flag aSign, bSign; 6981 6982 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6983 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6984 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6985 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6986 ) { 6987 if (float128_is_signaling_nan(a, status) 6988 || float128_is_signaling_nan(b, status)) { 6989 float_raise(float_flag_invalid, status); 6990 } 6991 return 0; 6992 } 6993 aSign = extractFloat128Sign( a ); 6994 bSign = extractFloat128Sign( b ); 6995 if ( aSign != bSign ) { 6996 return 6997 aSign 6998 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6999 != 0 ); 7000 } 7001 return 7002 aSign ? lt128( b.high, b.low, a.high, a.low ) 7003 : lt128( a.high, a.low, b.high, b.low ); 7004 7005 } 7006 7007 /*---------------------------------------------------------------------------- 7008 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 7009 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 7010 | comparison is performed according to the IEC/IEEE Standard for Binary 7011 | Floating-Point Arithmetic. 
7012 *----------------------------------------------------------------------------*/ 7013 7014 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 7015 { 7016 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 7017 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 7018 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 7019 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 7020 ) { 7021 if (float128_is_signaling_nan(a, status) 7022 || float128_is_signaling_nan(b, status)) { 7023 float_raise(float_flag_invalid, status); 7024 } 7025 return 1; 7026 } 7027 return 0; 7028 } 7029 7030 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 7031 int is_quiet, float_status *status) 7032 { 7033 flag aSign, bSign; 7034 7035 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 7036 float_raise(float_flag_invalid, status); 7037 return float_relation_unordered; 7038 } 7039 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 7040 ( extractFloatx80Frac( a )<<1 ) ) || 7041 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 7042 ( extractFloatx80Frac( b )<<1 ) )) { 7043 if (!is_quiet || 7044 floatx80_is_signaling_nan(a, status) || 7045 floatx80_is_signaling_nan(b, status)) { 7046 float_raise(float_flag_invalid, status); 7047 } 7048 return float_relation_unordered; 7049 } 7050 aSign = extractFloatx80Sign( a ); 7051 bSign = extractFloatx80Sign( b ); 7052 if ( aSign != bSign ) { 7053 7054 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 7055 ( ( a.low | b.low ) == 0 ) ) { 7056 /* zero case */ 7057 return float_relation_equal; 7058 } else { 7059 return 1 - (2 * aSign); 7060 } 7061 } else { 7062 if (a.low == b.low && a.high == b.high) { 7063 return float_relation_equal; 7064 } else { 7065 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7066 } 7067 } 7068 } 7069 7070 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7071 { 7072 return floatx80_compare_internal(a, b, 0, status); 7073 } 7074 
7075 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7076 { 7077 return floatx80_compare_internal(a, b, 1, status); 7078 } 7079 7080 static inline int float128_compare_internal(float128 a, float128 b, 7081 int is_quiet, float_status *status) 7082 { 7083 flag aSign, bSign; 7084 7085 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7086 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7087 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7088 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7089 if (!is_quiet || 7090 float128_is_signaling_nan(a, status) || 7091 float128_is_signaling_nan(b, status)) { 7092 float_raise(float_flag_invalid, status); 7093 } 7094 return float_relation_unordered; 7095 } 7096 aSign = extractFloat128Sign( a ); 7097 bSign = extractFloat128Sign( b ); 7098 if ( aSign != bSign ) { 7099 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7100 /* zero case */ 7101 return float_relation_equal; 7102 } else { 7103 return 1 - (2 * aSign); 7104 } 7105 } else { 7106 if (a.low == b.low && a.high == b.high) { 7107 return float_relation_equal; 7108 } else { 7109 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7110 } 7111 } 7112 } 7113 7114 int float128_compare(float128 a, float128 b, float_status *status) 7115 { 7116 return float128_compare_internal(a, b, 0, status); 7117 } 7118 7119 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7120 { 7121 return float128_compare_internal(a, b, 1, status); 7122 } 7123 7124 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7125 { 7126 flag aSign; 7127 int32_t aExp; 7128 uint64_t aSig; 7129 7130 if (floatx80_invalid_encoding(a)) { 7131 float_raise(float_flag_invalid, status); 7132 return floatx80_default_nan(status); 7133 } 7134 aSig = extractFloatx80Frac( a ); 7135 aExp = extractFloatx80Exp( a ); 7136 aSign = extractFloatx80Sign( a ); 7137 7138 if ( aExp == 0x7FFF ) { 7139 if ( aSig<<1 ) { 7140 return 
propagateFloatx80NaN(a, a, status); 7141 } 7142 return a; 7143 } 7144 7145 if (aExp == 0) { 7146 if (aSig == 0) { 7147 return a; 7148 } 7149 aExp++; 7150 } 7151 7152 if (n > 0x10000) { 7153 n = 0x10000; 7154 } else if (n < -0x10000) { 7155 n = -0x10000; 7156 } 7157 7158 aExp += n; 7159 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7160 aSign, aExp, aSig, 0, status); 7161 } 7162 7163 float128 float128_scalbn(float128 a, int n, float_status *status) 7164 { 7165 flag aSign; 7166 int32_t aExp; 7167 uint64_t aSig0, aSig1; 7168 7169 aSig1 = extractFloat128Frac1( a ); 7170 aSig0 = extractFloat128Frac0( a ); 7171 aExp = extractFloat128Exp( a ); 7172 aSign = extractFloat128Sign( a ); 7173 if ( aExp == 0x7FFF ) { 7174 if ( aSig0 | aSig1 ) { 7175 return propagateFloat128NaN(a, a, status); 7176 } 7177 return a; 7178 } 7179 if (aExp != 0) { 7180 aSig0 |= LIT64( 0x0001000000000000 ); 7181 } else if (aSig0 == 0 && aSig1 == 0) { 7182 return a; 7183 } else { 7184 aExp++; 7185 } 7186 7187 if (n > 0x10000) { 7188 n = 0x10000; 7189 } else if (n < -0x10000) { 7190 n = -0x10000; 7191 } 7192 7193 aExp += n - 1; 7194 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7195 , status); 7196 7197 } 7198