1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * Returns the result of converting the unsigned integer `a' to the 1593 * floating-point format. The conversion is performed according to the 1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1595 */ 1596 1597 static FloatParts uint_to_float(uint64_t a, float_status *status) 1598 { 1599 FloatParts r = { .sign = false}; 1600 1601 if (a == 0) { 1602 r.cls = float_class_zero; 1603 } else { 1604 int spare_bits = clz64(a) - 1; 1605 r.cls = float_class_normal; 1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1607 if (spare_bits < 0) { 1608 shift64RightJamming(a, -spare_bits, &a); 1609 r.frac = a; 1610 } else { 1611 r.frac = a << spare_bits; 1612 } 1613 } 1614 1615 return r; 1616 } 1617 1618 float16 uint64_to_float16(uint64_t a, float_status *status) 1619 { 1620 FloatParts pa = uint_to_float(a, status); 1621 return float16_round_pack_canonical(pa, status); 1622 } 1623 1624 float16 uint32_to_float16(uint32_t a, float_status *status) 1625 { 1626 return uint64_to_float16(a, status); 1627 } 1628 1629 float16 uint16_to_float16(uint16_t a, float_status *status) 1630 { 1631 return uint64_to_float16(a, status); 1632 } 1633 1634 float32 uint64_to_float32(uint64_t a, float_status *status) 1635 { 1636 FloatParts pa = uint_to_float(a, status); 1637 return float32_round_pack_canonical(pa, status); 1638 } 1639 1640 float32 uint32_to_float32(uint32_t a, float_status *status) 1641 { 1642 return uint64_to_float32(a, status); 1643 } 1644 1645 float32 uint16_to_float32(uint16_t a, float_status *status) 1646 { 1647 return uint64_to_float32(a, status); 1648 } 1649 1650 float64 uint64_to_float64(uint64_t a, float_status *status) 1651 { 1652 FloatParts pa = uint_to_float(a, status); 1653 return float64_round_pack_canonical(pa, status); 1654 } 1655 1656 float64 uint32_to_float64(uint32_t a, float_status *status) 1657 { 1658 return uint64_to_float64(a, status); 1659 } 1660 1661 float64 uint16_to_float64(uint16_t a, float_status *status) 1662 { 1663 return uint64_to_float64(a, status); 1664 } 1665 1666 /* Float Min/Max */ 1667 /* min() and max() functions. These can't be implemented as 1668 * 'compare and pick one input' because that would mishandle 1669 * NaNs and +0 vs -0. 1670 * 1671 * minnum() and maxnum() functions. These are similar to the min() 1672 * and max() functions but if one of the arguments is a QNaN and 1673 * the other is numerical then the numerical argument is returned. 1674 * SNaNs will get quietened before being returned. 1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1676 * and maxNum() operations. min() and max() are the typical min/max 1677 * semantics provided by many CPUs which predate that specification. 1678 * 1679 * minnummag() and maxnummag() functions correspond to minNumMag() 1680 * and minNumMag() from the IEEE-754 2008. 1681 */ 1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1683 bool ieee, bool ismag, float_status *s) 1684 { 1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1686 if (ieee) { 1687 /* Takes two floating-point values `a' and `b', one of 1688 * which is a NaN, and returns the appropriate NaN 1689 * result. If either `a' or `b' is a signaling NaN, 1690 * the invalid exception is raised. 1691 */ 1692 if (is_snan(a.cls) || is_snan(b.cls)) { 1693 return pick_nan(a, b, s); 1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1695 return b; 1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1697 return a; 1698 } 1699 } 1700 return pick_nan(a, b, s); 1701 } else { 1702 int a_exp, b_exp; 1703 bool a_sign, b_sign; 1704 1705 switch (a.cls) { 1706 case float_class_normal: 1707 a_exp = a.exp; 1708 break; 1709 case float_class_inf: 1710 a_exp = INT_MAX; 1711 break; 1712 case float_class_zero: 1713 a_exp = INT_MIN; 1714 break; 1715 default: 1716 g_assert_not_reached(); 1717 break; 1718 } 1719 switch (b.cls) { 1720 case float_class_normal: 1721 b_exp = b.exp; 1722 break; 1723 case float_class_inf: 1724 b_exp = INT_MAX; 1725 break; 1726 case float_class_zero: 1727 b_exp = INT_MIN; 1728 break; 1729 default: 1730 g_assert_not_reached(); 1731 break; 1732 } 1733 1734 a_sign = a.sign; 1735 b_sign = b.sign; 1736 if (ismag) { 1737 a_sign = b_sign = 0; 1738 } 1739 1740 if (a_sign == b_sign) { 1741 bool a_less = a_exp < b_exp; 1742 if (a_exp == b_exp) { 1743 a_less = a.frac < b.frac; 1744 } 1745 return a_sign ^ a_less ^ ismin ? b : a; 1746 } else { 1747 return a_sign ^ ismin ? b : a; 1748 } 1749 } 1750 } 1751 1752 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1754 float_status *s) \ 1755 { \ 1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1759 \ 1760 return float ## sz ## _round_pack_canonical(pr, s); \ 1761 } 1762 1763 MINMAX(16, min, true, false, false) 1764 MINMAX(16, minnum, true, true, false) 1765 MINMAX(16, minnummag, true, true, true) 1766 MINMAX(16, max, false, false, false) 1767 MINMAX(16, maxnum, false, true, false) 1768 MINMAX(16, maxnummag, false, true, true) 1769 1770 MINMAX(32, min, true, false, false) 1771 MINMAX(32, minnum, true, true, false) 1772 MINMAX(32, minnummag, true, true, true) 1773 MINMAX(32, max, false, false, false) 1774 MINMAX(32, maxnum, false, true, false) 1775 MINMAX(32, maxnummag, false, true, true) 1776 1777 MINMAX(64, min, true, false, false) 1778 MINMAX(64, minnum, true, true, false) 1779 MINMAX(64, minnummag, true, true, true) 1780 MINMAX(64, max, false, false, false) 1781 MINMAX(64, maxnum, false, true, false) 1782 MINMAX(64, maxnummag, false, true, true) 1783 1784 #undef MINMAX 1785 1786 /* Floating point compare */ 1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1788 float_status *s) 1789 { 1790 if (is_nan(a.cls) || is_nan(b.cls)) { 1791 if (!is_quiet || 1792 a.cls == float_class_snan || 1793 b.cls == float_class_snan) { 1794 s->float_exception_flags |= float_flag_invalid; 1795 } 1796 return float_relation_unordered; 1797 } 1798 1799 if (a.cls == float_class_zero) { 1800 if (b.cls == float_class_zero) { 1801 return float_relation_equal; 1802 } 1803 return b.sign ? float_relation_greater : float_relation_less; 1804 } else if (b.cls == float_class_zero) { 1805 return a.sign ? float_relation_less : float_relation_greater; 1806 } 1807 1808 /* The only really important thing about infinity is its sign. If 1809 * both are infinities the sign marks the smallest of the two. 1810 */ 1811 if (a.cls == float_class_inf) { 1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1813 return float_relation_equal; 1814 } 1815 return a.sign ? float_relation_less : float_relation_greater; 1816 } else if (b.cls == float_class_inf) { 1817 return b.sign ? float_relation_greater : float_relation_less; 1818 } 1819 1820 if (a.sign != b.sign) { 1821 return a.sign ? float_relation_less : float_relation_greater; 1822 } 1823 1824 if (a.exp == b.exp) { 1825 if (a.frac == b.frac) { 1826 return float_relation_equal; 1827 } 1828 if (a.sign) { 1829 return a.frac > b.frac ? 1830 float_relation_less : float_relation_greater; 1831 } else { 1832 return a.frac > b.frac ? 1833 float_relation_greater : float_relation_less; 1834 } 1835 } else { 1836 if (a.sign) { 1837 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1838 } else { 1839 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1840 } 1841 } 1842 } 1843 1844 #define COMPARE(sz) \ 1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1846 float_status *s) \ 1847 { \ 1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1850 return compare_floats(pa, pb, false, s); \ 1851 } \ 1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1853 float_status *s) \ 1854 { \ 1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1857 return compare_floats(pa, pb, true, s); \ 1858 } 1859 1860 COMPARE(16) 1861 COMPARE(32) 1862 COMPARE(64) 1863 1864 #undef COMPARE 1865 1866 /* Multiply A by 2 raised to the power N. */ 1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1868 { 1869 if (unlikely(is_nan(a.cls))) { 1870 return return_nan(a, s); 1871 } 1872 if (a.cls == float_class_normal) { 1873 a.exp += n; 1874 } 1875 return a; 1876 } 1877 1878 float16 float16_scalbn(float16 a, int n, float_status *status) 1879 { 1880 FloatParts pa = float16_unpack_canonical(a, status); 1881 FloatParts pr = scalbn_decomposed(pa, n, status); 1882 return float16_round_pack_canonical(pr, status); 1883 } 1884 1885 float32 float32_scalbn(float32 a, int n, float_status *status) 1886 { 1887 FloatParts pa = float32_unpack_canonical(a, status); 1888 FloatParts pr = scalbn_decomposed(pa, n, status); 1889 return float32_round_pack_canonical(pr, status); 1890 } 1891 1892 float64 float64_scalbn(float64 a, int n, float_status *status) 1893 { 1894 FloatParts pa = float64_unpack_canonical(a, status); 1895 FloatParts pr = scalbn_decomposed(pa, n, status); 1896 return float64_round_pack_canonical(pr, status); 1897 } 1898 1899 /*---------------------------------------------------------------------------- 1900 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1901 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1902 | input. If `zSign' is 1, the input is negated before being converted to an 1903 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 1904 | is simply rounded to an integer, with the inexact exception raised if the 1905 | input cannot be represented exactly as an integer. However, if the fixed- 1906 | point input is too large, the invalid exception is raised and the largest 1907 | positive or negative integer is returned. 1908 *----------------------------------------------------------------------------*/ 1909 1910 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 1911 { 1912 int8_t roundingMode; 1913 flag roundNearestEven; 1914 int8_t roundIncrement, roundBits; 1915 int32_t z; 1916 1917 roundingMode = status->float_rounding_mode; 1918 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1919 switch (roundingMode) { 1920 case float_round_nearest_even: 1921 case float_round_ties_away: 1922 roundIncrement = 0x40; 1923 break; 1924 case float_round_to_zero: 1925 roundIncrement = 0; 1926 break; 1927 case float_round_up: 1928 roundIncrement = zSign ? 0 : 0x7f; 1929 break; 1930 case float_round_down: 1931 roundIncrement = zSign ? 0x7f : 0; 1932 break; 1933 default: 1934 abort(); 1935 } 1936 roundBits = absZ & 0x7F; 1937 absZ = ( absZ + roundIncrement )>>7; 1938 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 1939 z = absZ; 1940 if ( zSign ) z = - z; 1941 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 1942 float_raise(float_flag_invalid, status); 1943 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 1944 } 1945 if (roundBits) { 1946 status->float_exception_flags |= float_flag_inexact; 1947 } 1948 return z; 1949 1950 } 1951 1952 /*---------------------------------------------------------------------------- 1953 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 1954 | `absZ1', with binary point between bits 63 and 64 (between the input words), 1955 | and returns the properly rounded 64-bit integer corresponding to the input. 1956 | If `zSign' is 1, the input is negated before being converted to an integer. 1957 | Ordinarily, the fixed-point input is simply rounded to an integer, with 1958 | the inexact exception raised if the input cannot be represented exactly as 1959 | an integer. However, if the fixed-point input is too large, the invalid 1960 | exception is raised and the largest positive or negative integer is 1961 | returned. 1962 *----------------------------------------------------------------------------*/ 1963 1964 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 1965 float_status *status) 1966 { 1967 int8_t roundingMode; 1968 flag roundNearestEven, increment; 1969 int64_t z; 1970 1971 roundingMode = status->float_rounding_mode; 1972 roundNearestEven = ( roundingMode == float_round_nearest_even ); 1973 switch (roundingMode) { 1974 case float_round_nearest_even: 1975 case float_round_ties_away: 1976 increment = ((int64_t) absZ1 < 0); 1977 break; 1978 case float_round_to_zero: 1979 increment = 0; 1980 break; 1981 case float_round_up: 1982 increment = !zSign && absZ1; 1983 break; 1984 case float_round_down: 1985 increment = zSign && absZ1; 1986 break; 1987 default: 1988 abort(); 1989 } 1990 if ( increment ) { 1991 ++absZ0; 1992 if ( absZ0 == 0 ) goto overflow; 1993 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 1994 } 1995 z = absZ0; 1996 if ( zSign ) z = - z; 1997 if ( z && ( ( z < 0 ) ^ zSign ) ) { 1998 overflow: 1999 float_raise(float_flag_invalid, status); 2000 return 2001 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2002 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2003 } 2004 if (absZ1) { 2005 status->float_exception_flags |= float_flag_inexact; 2006 } 2007 return z; 2008 2009 } 2010 2011 /*---------------------------------------------------------------------------- 2012 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2013 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2014 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2015 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2016 | with the inexact exception raised if the input cannot be represented exactly 2017 | as an integer. However, if the fixed-point input is too large, the invalid 2018 | exception is raised and the largest unsigned integer is returned. 2019 *----------------------------------------------------------------------------*/ 2020 2021 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2022 uint64_t absZ1, float_status *status) 2023 { 2024 int8_t roundingMode; 2025 flag roundNearestEven, increment; 2026 2027 roundingMode = status->float_rounding_mode; 2028 roundNearestEven = (roundingMode == float_round_nearest_even); 2029 switch (roundingMode) { 2030 case float_round_nearest_even: 2031 case float_round_ties_away: 2032 increment = ((int64_t)absZ1 < 0); 2033 break; 2034 case float_round_to_zero: 2035 increment = 0; 2036 break; 2037 case float_round_up: 2038 increment = !zSign && absZ1; 2039 break; 2040 case float_round_down: 2041 increment = zSign && absZ1; 2042 break; 2043 default: 2044 abort(); 2045 } 2046 if (increment) { 2047 ++absZ0; 2048 if (absZ0 == 0) { 2049 float_raise(float_flag_invalid, status); 2050 return LIT64(0xFFFFFFFFFFFFFFFF); 2051 } 2052 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2053 } 2054 2055 if (zSign && absZ0) { 2056 float_raise(float_flag_invalid, status); 2057 return 0; 2058 } 2059 2060 if (absZ1) { 2061 status->float_exception_flags |= float_flag_inexact; 2062 } 2063 return absZ0; 2064 } 2065 2066 /*---------------------------------------------------------------------------- 2067 | If `a' is denormal and we are in flush-to-zero mode then set the 2068 | input-denormal exception and return zero. Otherwise just return the value. 2069 *----------------------------------------------------------------------------*/ 2070 float32 float32_squash_input_denormal(float32 a, float_status *status) 2071 { 2072 if (status->flush_inputs_to_zero) { 2073 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2074 float_raise(float_flag_input_denormal, status); 2075 return make_float32(float32_val(a) & 0x80000000); 2076 } 2077 } 2078 return a; 2079 } 2080 2081 /*---------------------------------------------------------------------------- 2082 | Normalizes the subnormal single-precision floating-point value represented 2083 | by the denormalized significand `aSig'. The normalized exponent and 2084 | significand are stored at the locations pointed to by `zExpPtr' and 2085 | `zSigPtr', respectively. 2086 *----------------------------------------------------------------------------*/ 2087 2088 static void 2089 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2090 { 2091 int8_t shiftCount; 2092 2093 shiftCount = countLeadingZeros32( aSig ) - 8; 2094 *zSigPtr = aSig<<shiftCount; 2095 *zExpPtr = 1 - shiftCount; 2096 2097 } 2098 2099 /*---------------------------------------------------------------------------- 2100 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2101 | single-precision floating-point value, returning the result. After being 2102 | shifted into the proper positions, the three fields are simply added 2103 | together to form the result. This means that any integer portion of `zSig' 2104 | will be added into the exponent. Since a properly normalized significand 2105 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2106 | than the desired result exponent whenever `zSig' is a complete, normalized 2107 | significand. 2108 *----------------------------------------------------------------------------*/ 2109 2110 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig) 2111 { 2112 2113 return make_float32( 2114 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig); 2115 2116 } 2117 2118 /*---------------------------------------------------------------------------- 2119 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2120 | and significand `zSig', and returns the proper single-precision floating- 2121 | point value corresponding to the abstract input. Ordinarily, the abstract 2122 | value is simply rounded and packed into the single-precision format, with 2123 | the inexact exception raised if the abstract input cannot be represented 2124 | exactly. However, if the abstract value is too large, the overflow and 2125 | inexact exceptions are raised and an infinity or maximal finite value is 2126 | returned. If the abstract value is too small, the input value is rounded to 2127 | a subnormal number, and the underflow and inexact exceptions are raised if 2128 | the abstract input cannot be represented exactly as a subnormal single- 2129 | precision floating-point number. 2130 | The input significand `zSig' has its binary point between bits 30 2131 | and 29, which is 7 bits to the left of the usual location. This shifted 2132 | significand must be normalized or smaller. If `zSig' is not normalized, 2133 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2134 | and it must not require rounding. In the usual case that `zSig' is 2135 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2136 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2137 | Binary Floating-Point Arithmetic. 2138 *----------------------------------------------------------------------------*/ 2139 2140 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2141 float_status *status) 2142 { 2143 int8_t roundingMode; 2144 flag roundNearestEven; 2145 int8_t roundIncrement, roundBits; 2146 flag isTiny; 2147 2148 roundingMode = status->float_rounding_mode; 2149 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2150 switch (roundingMode) { 2151 case float_round_nearest_even: 2152 case float_round_ties_away: 2153 roundIncrement = 0x40; 2154 break; 2155 case float_round_to_zero: 2156 roundIncrement = 0; 2157 break; 2158 case float_round_up: 2159 roundIncrement = zSign ? 0 : 0x7f; 2160 break; 2161 case float_round_down: 2162 roundIncrement = zSign ? 0x7f : 0; 2163 break; 2164 default: 2165 abort(); 2166 break; 2167 } 2168 roundBits = zSig & 0x7F; 2169 if ( 0xFD <= (uint16_t) zExp ) { 2170 if ( ( 0xFD < zExp ) 2171 || ( ( zExp == 0xFD ) 2172 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2173 ) { 2174 float_raise(float_flag_overflow | float_flag_inexact, status); 2175 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2176 } 2177 if ( zExp < 0 ) { 2178 if (status->flush_to_zero) { 2179 float_raise(float_flag_output_denormal, status); 2180 return packFloat32(zSign, 0, 0); 2181 } 2182 isTiny = 2183 (status->float_detect_tininess 2184 == float_tininess_before_rounding) 2185 || ( zExp < -1 ) 2186 || ( zSig + roundIncrement < 0x80000000 ); 2187 shift32RightJamming( zSig, - zExp, &zSig ); 2188 zExp = 0; 2189 roundBits = zSig & 0x7F; 2190 if (isTiny && roundBits) { 2191 float_raise(float_flag_underflow, status); 2192 } 2193 } 2194 } 2195 if (roundBits) { 2196 status->float_exception_flags |= float_flag_inexact; 2197 } 2198 zSig = ( zSig + roundIncrement )>>7; 2199 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2200 if ( zSig == 0 ) zExp = 0; 2201 return packFloat32( zSign, zExp, zSig ); 2202 2203 } 2204 2205 /*---------------------------------------------------------------------------- 2206 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2207 | and significand `zSig', and returns the proper single-precision floating- 2208 | point value corresponding to the abstract input. This routine is just like 2209 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2210 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2211 | floating-point exponent. 2212 *----------------------------------------------------------------------------*/ 2213 2214 static float32 2215 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2216 float_status *status) 2217 { 2218 int8_t shiftCount; 2219 2220 shiftCount = countLeadingZeros32( zSig ) - 1; 2221 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2222 status); 2223 2224 } 2225 2226 /*---------------------------------------------------------------------------- 2227 | If `a' is denormal and we are in flush-to-zero mode then set the 2228 | input-denormal exception and return zero. Otherwise just return the value. 2229 *----------------------------------------------------------------------------*/ 2230 float64 float64_squash_input_denormal(float64 a, float_status *status) 2231 { 2232 if (status->flush_inputs_to_zero) { 2233 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2234 float_raise(float_flag_input_denormal, status); 2235 return make_float64(float64_val(a) & (1ULL << 63)); 2236 } 2237 } 2238 return a; 2239 } 2240 2241 /*---------------------------------------------------------------------------- 2242 | Normalizes the subnormal double-precision floating-point value represented 2243 | by the denormalized significand `aSig'. The normalized exponent and 2244 | significand are stored at the locations pointed to by `zExpPtr' and 2245 | `zSigPtr', respectively. 2246 *----------------------------------------------------------------------------*/ 2247 2248 static void 2249 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2250 { 2251 int8_t shiftCount; 2252 2253 shiftCount = countLeadingZeros64( aSig ) - 11; 2254 *zSigPtr = aSig<<shiftCount; 2255 *zExpPtr = 1 - shiftCount; 2256 2257 } 2258 2259 /*---------------------------------------------------------------------------- 2260 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2261 | double-precision floating-point value, returning the result. After being 2262 | shifted into the proper positions, the three fields are simply added 2263 | together to form the result. This means that any integer portion of `zSig' 2264 | will be added into the exponent. Since a properly normalized significand 2265 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2266 | than the desired result exponent whenever `zSig' is a complete, normalized 2267 | significand. 2268 *----------------------------------------------------------------------------*/ 2269 2270 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2271 { 2272 2273 return make_float64( 2274 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2275 2276 } 2277 2278 /*---------------------------------------------------------------------------- 2279 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2280 | and significand `zSig', and returns the proper double-precision floating- 2281 | point value corresponding to the abstract input. Ordinarily, the abstract 2282 | value is simply rounded and packed into the double-precision format, with 2283 | the inexact exception raised if the abstract input cannot be represented 2284 | exactly. However, if the abstract value is too large, the overflow and 2285 | inexact exceptions are raised and an infinity or maximal finite value is 2286 | returned. If the abstract value is too small, the input value is rounded to 2287 | a subnormal number, and the underflow and inexact exceptions are raised if 2288 | the abstract input cannot be represented exactly as a subnormal double- 2289 | precision floating-point number. 2290 | The input significand `zSig' has its binary point between bits 62 2291 | and 61, which is 10 bits to the left of the usual location. This shifted 2292 | significand must be normalized or smaller. If `zSig' is not normalized, 2293 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2294 | and it must not require rounding. In the usual case that `zSig' is 2295 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2296 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2297 | Binary Floating-Point Arithmetic. 2298 *----------------------------------------------------------------------------*/ 2299 2300 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2301 float_status *status) 2302 { 2303 int8_t roundingMode; 2304 flag roundNearestEven; 2305 int roundIncrement, roundBits; 2306 flag isTiny; 2307 2308 roundingMode = status->float_rounding_mode; 2309 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2310 switch (roundingMode) { 2311 case float_round_nearest_even: 2312 case float_round_ties_away: 2313 roundIncrement = 0x200; 2314 break; 2315 case float_round_to_zero: 2316 roundIncrement = 0; 2317 break; 2318 case float_round_up: 2319 roundIncrement = zSign ? 0 : 0x3ff; 2320 break; 2321 case float_round_down: 2322 roundIncrement = zSign ? 0x3ff : 0; 2323 break; 2324 case float_round_to_odd: 2325 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2326 break; 2327 default: 2328 abort(); 2329 } 2330 roundBits = zSig & 0x3FF; 2331 if ( 0x7FD <= (uint16_t) zExp ) { 2332 if ( ( 0x7FD < zExp ) 2333 || ( ( zExp == 0x7FD ) 2334 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2335 ) { 2336 bool overflow_to_inf = roundingMode != float_round_to_odd && 2337 roundIncrement != 0; 2338 float_raise(float_flag_overflow | float_flag_inexact, status); 2339 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2340 } 2341 if ( zExp < 0 ) { 2342 if (status->flush_to_zero) { 2343 float_raise(float_flag_output_denormal, status); 2344 return packFloat64(zSign, 0, 0); 2345 } 2346 isTiny = 2347 (status->float_detect_tininess 2348 == float_tininess_before_rounding) 2349 || ( zExp < -1 ) 2350 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2351 shift64RightJamming( zSig, - zExp, &zSig ); 2352 zExp = 0; 2353 roundBits = zSig & 0x3FF; 2354 if (isTiny && roundBits) { 2355 float_raise(float_flag_underflow, status); 2356 } 2357 if (roundingMode == float_round_to_odd) { 2358 /* 2359 * For round-to-odd case, the roundIncrement depends on 2360 * zSig which just changed. 2361 */ 2362 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2363 } 2364 } 2365 } 2366 if (roundBits) { 2367 status->float_exception_flags |= float_flag_inexact; 2368 } 2369 zSig = ( zSig + roundIncrement )>>10; 2370 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2371 if ( zSig == 0 ) zExp = 0; 2372 return packFloat64( zSign, zExp, zSig ); 2373 2374 } 2375 2376 /*---------------------------------------------------------------------------- 2377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2378 | and significand `zSig', and returns the proper double-precision floating- 2379 | point value corresponding to the abstract input. This routine is just like 2380 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2381 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2382 | floating-point exponent. 2383 *----------------------------------------------------------------------------*/ 2384 2385 static float64 2386 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2387 float_status *status) 2388 { 2389 int8_t shiftCount; 2390 2391 shiftCount = countLeadingZeros64( zSig ) - 1; 2392 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2393 status); 2394 2395 } 2396 2397 /*---------------------------------------------------------------------------- 2398 | Returns the fraction bits of the extended double-precision floating-point 2399 | value `a'. 2400 *----------------------------------------------------------------------------*/ 2401 2402 static inline uint64_t extractFloatx80Frac( floatx80 a ) 2403 { 2404 2405 return a.low; 2406 2407 } 2408 2409 /*---------------------------------------------------------------------------- 2410 | Returns the exponent bits of the extended double-precision floating-point 2411 | value `a'. 2412 *----------------------------------------------------------------------------*/ 2413 2414 static inline int32_t extractFloatx80Exp( floatx80 a ) 2415 { 2416 2417 return a.high & 0x7FFF; 2418 2419 } 2420 2421 /*---------------------------------------------------------------------------- 2422 | Returns the sign bit of the extended double-precision floating-point value 2423 | `a'. 2424 *----------------------------------------------------------------------------*/ 2425 2426 static inline flag extractFloatx80Sign( floatx80 a ) 2427 { 2428 2429 return a.high>>15; 2430 2431 } 2432 2433 /*---------------------------------------------------------------------------- 2434 | Normalizes the subnormal extended double-precision floating-point value 2435 | represented by the denormalized significand `aSig'. The normalized exponent 2436 | and significand are stored at the locations pointed to by `zExpPtr' and 2437 | `zSigPtr', respectively. 2438 *----------------------------------------------------------------------------*/ 2439 2440 static void 2441 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr ) 2442 { 2443 int8_t shiftCount; 2444 2445 shiftCount = countLeadingZeros64( aSig ); 2446 *zSigPtr = aSig<<shiftCount; 2447 *zExpPtr = 1 - shiftCount; 2448 2449 } 2450 2451 /*---------------------------------------------------------------------------- 2452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an 2453 | extended double-precision floating-point value, returning the result. 2454 *----------------------------------------------------------------------------*/ 2455 2456 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig ) 2457 { 2458 floatx80 z; 2459 2460 z.low = zSig; 2461 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp; 2462 return z; 2463 2464 } 2465 2466 /*---------------------------------------------------------------------------- 2467 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2468 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2469 | and returns the proper extended double-precision floating-point value 2470 | corresponding to the abstract input. Ordinarily, the abstract value is 2471 | rounded and packed into the extended double-precision format, with the 2472 | inexact exception raised if the abstract input cannot be represented 2473 | exactly. However, if the abstract value is too large, the overflow and 2474 | inexact exceptions are raised and an infinity or maximal finite value is 2475 | returned. If the abstract value is too small, the input value is rounded to 2476 | a subnormal number, and the underflow and inexact exceptions are raised if 2477 | the abstract input cannot be represented exactly as a subnormal extended 2478 | double-precision floating-point number. 2479 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2480 | number of bits as single or double precision, respectively. Otherwise, the 2481 | result is rounded to the full precision of the extended double-precision 2482 | format. 2483 | The input significand must be normalized or smaller. If the input 2484 | significand is not normalized, `zExp' must be 0; in that case, the result 2485 | returned is a subnormal number, and it must not require rounding. The 2486 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2487 | Floating-Point Arithmetic. 2488 *----------------------------------------------------------------------------*/ 2489 2490 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2491 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2492 float_status *status) 2493 { 2494 int8_t roundingMode; 2495 flag roundNearestEven, increment, isTiny; 2496 int64_t roundIncrement, roundMask, roundBits; 2497 2498 roundingMode = status->float_rounding_mode; 2499 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2500 if ( roundingPrecision == 80 ) goto precision80; 2501 if ( roundingPrecision == 64 ) { 2502 roundIncrement = LIT64( 0x0000000000000400 ); 2503 roundMask = LIT64( 0x00000000000007FF ); 2504 } 2505 else if ( roundingPrecision == 32 ) { 2506 roundIncrement = LIT64( 0x0000008000000000 ); 2507 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2508 } 2509 else { 2510 goto precision80; 2511 } 2512 zSig0 |= ( zSig1 != 0 ); 2513 switch (roundingMode) { 2514 case float_round_nearest_even: 2515 case float_round_ties_away: 2516 break; 2517 case float_round_to_zero: 2518 roundIncrement = 0; 2519 break; 2520 case float_round_up: 2521 roundIncrement = zSign ? 0 : roundMask; 2522 break; 2523 case float_round_down: 2524 roundIncrement = zSign ? roundMask : 0; 2525 break; 2526 default: 2527 abort(); 2528 } 2529 roundBits = zSig0 & roundMask; 2530 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2531 if ( ( 0x7FFE < zExp ) 2532 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2533 ) { 2534 goto overflow; 2535 } 2536 if ( zExp <= 0 ) { 2537 if (status->flush_to_zero) { 2538 float_raise(float_flag_output_denormal, status); 2539 return packFloatx80(zSign, 0, 0); 2540 } 2541 isTiny = 2542 (status->float_detect_tininess 2543 == float_tininess_before_rounding) 2544 || ( zExp < 0 ) 2545 || ( zSig0 <= zSig0 + roundIncrement ); 2546 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2547 zExp = 0; 2548 roundBits = zSig0 & roundMask; 2549 if (isTiny && roundBits) { 2550 float_raise(float_flag_underflow, status); 2551 } 2552 if (roundBits) { 2553 status->float_exception_flags |= float_flag_inexact; 2554 } 2555 zSig0 += roundIncrement; 2556 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2557 roundIncrement = roundMask + 1; 2558 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2559 roundMask |= roundIncrement; 2560 } 2561 zSig0 &= ~ roundMask; 2562 return packFloatx80( zSign, zExp, zSig0 ); 2563 } 2564 } 2565 if (roundBits) { 2566 status->float_exception_flags |= float_flag_inexact; 2567 } 2568 zSig0 += roundIncrement; 2569 if ( zSig0 < roundIncrement ) { 2570 ++zExp; 2571 zSig0 = LIT64( 0x8000000000000000 ); 2572 } 2573 roundIncrement = roundMask + 1; 2574 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2575 roundMask |= roundIncrement; 2576 } 2577 zSig0 &= ~ roundMask; 2578 if ( zSig0 == 0 ) zExp = 0; 2579 return packFloatx80( zSign, zExp, zSig0 ); 2580 precision80: 2581 switch (roundingMode) { 2582 case float_round_nearest_even: 2583 case float_round_ties_away: 2584 increment = ((int64_t)zSig1 < 0); 2585 break; 2586 case float_round_to_zero: 2587 increment = 0; 2588 break; 2589 case float_round_up: 2590 increment = !zSign && zSig1; 2591 break; 2592 case float_round_down: 2593 increment = zSign && zSig1; 2594 break; 2595 default: 2596 abort(); 2597 } 2598 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2599 if ( ( 0x7FFE < zExp ) 2600 || ( ( zExp == 0x7FFE ) 2601 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2602 && increment 2603 ) 2604 ) { 2605 roundMask = 0; 2606 overflow: 2607 float_raise(float_flag_overflow | float_flag_inexact, status); 2608 if ( ( roundingMode == float_round_to_zero ) 2609 || ( zSign && ( roundingMode == float_round_up ) ) 2610 || ( ! zSign && ( roundingMode == float_round_down ) ) 2611 ) { 2612 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2613 } 2614 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2615 } 2616 if ( zExp <= 0 ) { 2617 isTiny = 2618 (status->float_detect_tininess 2619 == float_tininess_before_rounding) 2620 || ( zExp < 0 ) 2621 || ! increment 2622 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2623 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2624 zExp = 0; 2625 if (isTiny && zSig1) { 2626 float_raise(float_flag_underflow, status); 2627 } 2628 if (zSig1) { 2629 status->float_exception_flags |= float_flag_inexact; 2630 } 2631 switch (roundingMode) { 2632 case float_round_nearest_even: 2633 case float_round_ties_away: 2634 increment = ((int64_t)zSig1 < 0); 2635 break; 2636 case float_round_to_zero: 2637 increment = 0; 2638 break; 2639 case float_round_up: 2640 increment = !zSign && zSig1; 2641 break; 2642 case float_round_down: 2643 increment = zSign && zSig1; 2644 break; 2645 default: 2646 abort(); 2647 } 2648 if ( increment ) { 2649 ++zSig0; 2650 zSig0 &= 2651 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2652 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2653 } 2654 return packFloatx80( zSign, zExp, zSig0 ); 2655 } 2656 } 2657 if (zSig1) { 2658 status->float_exception_flags |= float_flag_inexact; 2659 } 2660 if ( increment ) { 2661 ++zSig0; 2662 if ( zSig0 == 0 ) { 2663 ++zExp; 2664 zSig0 = LIT64( 0x8000000000000000 ); 2665 } 2666 else { 2667 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2668 } 2669 } 2670 else { 2671 if ( zSig0 == 0 ) zExp = 0; 2672 } 2673 return packFloatx80( zSign, zExp, zSig0 ); 2674 2675 } 2676 2677 /*---------------------------------------------------------------------------- 2678 | Takes an abstract floating-point value having sign `zSign', exponent 2679 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2680 | and returns the proper extended double-precision floating-point value 2681 | corresponding to the abstract input. This routine is just like 2682 | `roundAndPackFloatx80' except that the input significand does not have to be 2683 | normalized. 2684 *----------------------------------------------------------------------------*/ 2685 2686 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2687 flag zSign, int32_t zExp, 2688 uint64_t zSig0, uint64_t zSig1, 2689 float_status *status) 2690 { 2691 int8_t shiftCount; 2692 2693 if ( zSig0 == 0 ) { 2694 zSig0 = zSig1; 2695 zSig1 = 0; 2696 zExp -= 64; 2697 } 2698 shiftCount = countLeadingZeros64( zSig0 ); 2699 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2700 zExp -= shiftCount; 2701 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2702 zSig0, zSig1, status); 2703 2704 } 2705 2706 /*---------------------------------------------------------------------------- 2707 | Returns the least-significant 64 fraction bits of the quadruple-precision 2708 | floating-point value `a'. 2709 *----------------------------------------------------------------------------*/ 2710 2711 static inline uint64_t extractFloat128Frac1( float128 a ) 2712 { 2713 2714 return a.low; 2715 2716 } 2717 2718 /*---------------------------------------------------------------------------- 2719 | Returns the most-significant 48 fraction bits of the quadruple-precision 2720 | floating-point value `a'. 2721 *----------------------------------------------------------------------------*/ 2722 2723 static inline uint64_t extractFloat128Frac0( float128 a ) 2724 { 2725 2726 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2727 2728 } 2729 2730 /*---------------------------------------------------------------------------- 2731 | Returns the exponent bits of the quadruple-precision floating-point value 2732 | `a'. 2733 *----------------------------------------------------------------------------*/ 2734 2735 static inline int32_t extractFloat128Exp( float128 a ) 2736 { 2737 2738 return ( a.high>>48 ) & 0x7FFF; 2739 2740 } 2741 2742 /*---------------------------------------------------------------------------- 2743 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2744 *----------------------------------------------------------------------------*/ 2745 2746 static inline flag extractFloat128Sign( float128 a ) 2747 { 2748 2749 return a.high>>63; 2750 2751 } 2752 2753 /*---------------------------------------------------------------------------- 2754 | Normalizes the subnormal quadruple-precision floating-point value 2755 | represented by the denormalized significand formed by the concatenation of 2756 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2757 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2758 | significand are stored at the location pointed to by `zSig0Ptr', and the 2759 | least significant 64 bits of the normalized significand are stored at the 2760 | location pointed to by `zSig1Ptr'. 2761 *----------------------------------------------------------------------------*/ 2762 2763 static void 2764 normalizeFloat128Subnormal( 2765 uint64_t aSig0, 2766 uint64_t aSig1, 2767 int32_t *zExpPtr, 2768 uint64_t *zSig0Ptr, 2769 uint64_t *zSig1Ptr 2770 ) 2771 { 2772 int8_t shiftCount; 2773 2774 if ( aSig0 == 0 ) { 2775 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2776 if ( shiftCount < 0 ) { 2777 *zSig0Ptr = aSig1>>( - shiftCount ); 2778 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2779 } 2780 else { 2781 *zSig0Ptr = aSig1<<shiftCount; 2782 *zSig1Ptr = 0; 2783 } 2784 *zExpPtr = - shiftCount - 63; 2785 } 2786 else { 2787 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2788 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2789 *zExpPtr = 1 - shiftCount; 2790 } 2791 2792 } 2793 2794 /*---------------------------------------------------------------------------- 2795 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2796 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2797 | floating-point value, returning the result. After being shifted into the 2798 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2799 | added together to form the most significant 32 bits of the result. This 2800 | means that any integer portion of `zSig0' will be added into the exponent. 2801 | Since a properly normalized significand will have an integer portion equal 2802 | to 1, the `zExp' input should be 1 less than the desired result exponent 2803 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2804 | significand. 2805 *----------------------------------------------------------------------------*/ 2806 2807 static inline float128 2808 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2809 { 2810 float128 z; 2811 2812 z.low = zSig1; 2813 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2814 return z; 2815 2816 } 2817 2818 /*---------------------------------------------------------------------------- 2819 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2820 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2821 | and `zSig2', and returns the proper quadruple-precision floating-point value 2822 | corresponding to the abstract input. Ordinarily, the abstract value is 2823 | simply rounded and packed into the quadruple-precision format, with the 2824 | inexact exception raised if the abstract input cannot be represented 2825 | exactly. However, if the abstract value is too large, the overflow and 2826 | inexact exceptions are raised and an infinity or maximal finite value is 2827 | returned. If the abstract value is too small, the input value is rounded to 2828 | a subnormal number, and the underflow and inexact exceptions are raised if 2829 | the abstract input cannot be represented exactly as a subnormal quadruple- 2830 | precision floating-point number. 2831 | The input significand must be normalized or smaller. If the input 2832 | significand is not normalized, `zExp' must be 0; in that case, the result 2833 | returned is a subnormal number, and it must not require rounding. In the 2834 | usual case that the input significand is normalized, `zExp' must be 1 less 2835 | than the ``true'' floating-point exponent. The handling of underflow and 2836 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2837 *----------------------------------------------------------------------------*/ 2838 2839 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2840 uint64_t zSig0, uint64_t zSig1, 2841 uint64_t zSig2, float_status *status) 2842 { 2843 int8_t roundingMode; 2844 flag roundNearestEven, increment, isTiny; 2845 2846 roundingMode = status->float_rounding_mode; 2847 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2848 switch (roundingMode) { 2849 case float_round_nearest_even: 2850 case float_round_ties_away: 2851 increment = ((int64_t)zSig2 < 0); 2852 break; 2853 case float_round_to_zero: 2854 increment = 0; 2855 break; 2856 case float_round_up: 2857 increment = !zSign && zSig2; 2858 break; 2859 case float_round_down: 2860 increment = zSign && zSig2; 2861 break; 2862 case float_round_to_odd: 2863 increment = !(zSig1 & 0x1) && zSig2; 2864 break; 2865 default: 2866 abort(); 2867 } 2868 if ( 0x7FFD <= (uint32_t) zExp ) { 2869 if ( ( 0x7FFD < zExp ) 2870 || ( ( zExp == 0x7FFD ) 2871 && eq128( 2872 LIT64( 0x0001FFFFFFFFFFFF ), 2873 LIT64( 0xFFFFFFFFFFFFFFFF ), 2874 zSig0, 2875 zSig1 2876 ) 2877 && increment 2878 ) 2879 ) { 2880 float_raise(float_flag_overflow | float_flag_inexact, status); 2881 if ( ( roundingMode == float_round_to_zero ) 2882 || ( zSign && ( roundingMode == float_round_up ) ) 2883 || ( ! zSign && ( roundingMode == float_round_down ) ) 2884 || (roundingMode == float_round_to_odd) 2885 ) { 2886 return 2887 packFloat128( 2888 zSign, 2889 0x7FFE, 2890 LIT64( 0x0000FFFFFFFFFFFF ), 2891 LIT64( 0xFFFFFFFFFFFFFFFF ) 2892 ); 2893 } 2894 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2895 } 2896 if ( zExp < 0 ) { 2897 if (status->flush_to_zero) { 2898 float_raise(float_flag_output_denormal, status); 2899 return packFloat128(zSign, 0, 0, 0); 2900 } 2901 isTiny = 2902 (status->float_detect_tininess 2903 == float_tininess_before_rounding) 2904 || ( zExp < -1 ) 2905 || ! increment 2906 || lt128( 2907 zSig0, 2908 zSig1, 2909 LIT64( 0x0001FFFFFFFFFFFF ), 2910 LIT64( 0xFFFFFFFFFFFFFFFF ) 2911 ); 2912 shift128ExtraRightJamming( 2913 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2914 zExp = 0; 2915 if (isTiny && zSig2) { 2916 float_raise(float_flag_underflow, status); 2917 } 2918 switch (roundingMode) { 2919 case float_round_nearest_even: 2920 case float_round_ties_away: 2921 increment = ((int64_t)zSig2 < 0); 2922 break; 2923 case float_round_to_zero: 2924 increment = 0; 2925 break; 2926 case float_round_up: 2927 increment = !zSign && zSig2; 2928 break; 2929 case float_round_down: 2930 increment = zSign && zSig2; 2931 break; 2932 case float_round_to_odd: 2933 increment = !(zSig1 & 0x1) && zSig2; 2934 break; 2935 default: 2936 abort(); 2937 } 2938 } 2939 } 2940 if (zSig2) { 2941 status->float_exception_flags |= float_flag_inexact; 2942 } 2943 if ( increment ) { 2944 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2945 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2946 } 2947 else { 2948 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2949 } 2950 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2951 2952 } 2953 2954 /*---------------------------------------------------------------------------- 2955 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2956 | and significand formed by the concatenation of `zSig0' and `zSig1', and 2957 | returns the proper quadruple-precision floating-point value corresponding 2958 | to the abstract input. This routine is just like `roundAndPackFloat128' 2959 | except that the input significand has fewer bits and does not have to be 2960 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 2961 | point exponent. 2962 *----------------------------------------------------------------------------*/ 2963 2964 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2965 uint64_t zSig0, uint64_t zSig1, 2966 float_status *status) 2967 { 2968 int8_t shiftCount; 2969 uint64_t zSig2; 2970 2971 if ( zSig0 == 0 ) { 2972 zSig0 = zSig1; 2973 zSig1 = 0; 2974 zExp -= 64; 2975 } 2976 shiftCount = countLeadingZeros64( zSig0 ) - 15; 2977 if ( 0 <= shiftCount ) { 2978 zSig2 = 0; 2979 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2980 } 2981 else { 2982 shift128ExtraRightJamming( 2983 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 2984 } 2985 zExp -= shiftCount; 2986 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 2987 2988 } 2989 2990 2991 /*---------------------------------------------------------------------------- 2992 | Returns the result of converting the 32-bit two's complement integer `a' 2993 | to the extended double-precision floating-point format. The conversion 2994 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 2995 | Arithmetic. 2996 *----------------------------------------------------------------------------*/ 2997 2998 floatx80 int32_to_floatx80(int32_t a, float_status *status) 2999 { 3000 flag zSign; 3001 uint32_t absA; 3002 int8_t shiftCount; 3003 uint64_t zSig; 3004 3005 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3006 zSign = ( a < 0 ); 3007 absA = zSign ? - a : a; 3008 shiftCount = countLeadingZeros32( absA ) + 32; 3009 zSig = absA; 3010 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3011 3012 } 3013 3014 /*---------------------------------------------------------------------------- 3015 | Returns the result of converting the 32-bit two's complement integer `a' to 3016 | the quadruple-precision floating-point format. The conversion is performed 3017 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3018 *----------------------------------------------------------------------------*/ 3019 3020 float128 int32_to_float128(int32_t a, float_status *status) 3021 { 3022 flag zSign; 3023 uint32_t absA; 3024 int8_t shiftCount; 3025 uint64_t zSig0; 3026 3027 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3028 zSign = ( a < 0 ); 3029 absA = zSign ? - a : a; 3030 shiftCount = countLeadingZeros32( absA ) + 17; 3031 zSig0 = absA; 3032 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3033 3034 } 3035 3036 /*---------------------------------------------------------------------------- 3037 | Returns the result of converting the 64-bit two's complement integer `a' 3038 | to the extended double-precision floating-point format. The conversion 3039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3040 | Arithmetic. 3041 *----------------------------------------------------------------------------*/ 3042 3043 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3044 { 3045 flag zSign; 3046 uint64_t absA; 3047 int8_t shiftCount; 3048 3049 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3050 zSign = ( a < 0 ); 3051 absA = zSign ? - a : a; 3052 shiftCount = countLeadingZeros64( absA ); 3053 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3054 3055 } 3056 3057 /*---------------------------------------------------------------------------- 3058 | Returns the result of converting the 64-bit two's complement integer `a' to 3059 | the quadruple-precision floating-point format. The conversion is performed 3060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3061 *----------------------------------------------------------------------------*/ 3062 3063 float128 int64_to_float128(int64_t a, float_status *status) 3064 { 3065 flag zSign; 3066 uint64_t absA; 3067 int8_t shiftCount; 3068 int32_t zExp; 3069 uint64_t zSig0, zSig1; 3070 3071 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3072 zSign = ( a < 0 ); 3073 absA = zSign ? - a : a; 3074 shiftCount = countLeadingZeros64( absA ) + 49; 3075 zExp = 0x406E - shiftCount; 3076 if ( 64 <= shiftCount ) { 3077 zSig1 = 0; 3078 zSig0 = absA; 3079 shiftCount -= 64; 3080 } 3081 else { 3082 zSig1 = absA; 3083 zSig0 = 0; 3084 } 3085 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3086 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3087 3088 } 3089 3090 /*---------------------------------------------------------------------------- 3091 | Returns the result of converting the 64-bit unsigned integer `a' 3092 | to the quadruple-precision floating-point format. The conversion is performed 3093 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3094 *----------------------------------------------------------------------------*/ 3095 3096 float128 uint64_to_float128(uint64_t a, float_status *status) 3097 { 3098 if (a == 0) { 3099 return float128_zero; 3100 } 3101 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3102 } 3103 3104 3105 3106 3107 /*---------------------------------------------------------------------------- 3108 | Returns the result of converting the single-precision floating-point value 3109 | `a' to the double-precision floating-point format. The conversion is 3110 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3111 | Arithmetic. 3112 *----------------------------------------------------------------------------*/ 3113 3114 float64 float32_to_float64(float32 a, float_status *status) 3115 { 3116 flag aSign; 3117 int aExp; 3118 uint32_t aSig; 3119 a = float32_squash_input_denormal(a, status); 3120 3121 aSig = extractFloat32Frac( a ); 3122 aExp = extractFloat32Exp( a ); 3123 aSign = extractFloat32Sign( a ); 3124 if ( aExp == 0xFF ) { 3125 if (aSig) { 3126 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3127 } 3128 return packFloat64( aSign, 0x7FF, 0 ); 3129 } 3130 if ( aExp == 0 ) { 3131 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3132 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3133 --aExp; 3134 } 3135 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3136 3137 } 3138 3139 /*---------------------------------------------------------------------------- 3140 | Returns the result of converting the single-precision floating-point value 3141 | `a' to the extended double-precision floating-point format. The conversion 3142 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3143 | Arithmetic. 3144 *----------------------------------------------------------------------------*/ 3145 3146 floatx80 float32_to_floatx80(float32 a, float_status *status) 3147 { 3148 flag aSign; 3149 int aExp; 3150 uint32_t aSig; 3151 3152 a = float32_squash_input_denormal(a, status); 3153 aSig = extractFloat32Frac( a ); 3154 aExp = extractFloat32Exp( a ); 3155 aSign = extractFloat32Sign( a ); 3156 if ( aExp == 0xFF ) { 3157 if (aSig) { 3158 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3159 } 3160 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3161 } 3162 if ( aExp == 0 ) { 3163 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3164 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3165 } 3166 aSig |= 0x00800000; 3167 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3168 3169 } 3170 3171 /*---------------------------------------------------------------------------- 3172 | Returns the result of converting the single-precision floating-point value 3173 | `a' to the double-precision floating-point format. The conversion is 3174 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3175 | Arithmetic. 3176 *----------------------------------------------------------------------------*/ 3177 3178 float128 float32_to_float128(float32 a, float_status *status) 3179 { 3180 flag aSign; 3181 int aExp; 3182 uint32_t aSig; 3183 3184 a = float32_squash_input_denormal(a, status); 3185 aSig = extractFloat32Frac( a ); 3186 aExp = extractFloat32Exp( a ); 3187 aSign = extractFloat32Sign( a ); 3188 if ( aExp == 0xFF ) { 3189 if (aSig) { 3190 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3191 } 3192 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3193 } 3194 if ( aExp == 0 ) { 3195 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3196 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3197 --aExp; 3198 } 3199 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3200 3201 } 3202 3203 /*---------------------------------------------------------------------------- 3204 | Returns the remainder of the single-precision floating-point value `a' 3205 | with respect to the corresponding value `b'. The operation is performed 3206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3207 *----------------------------------------------------------------------------*/ 3208 3209 float32 float32_rem(float32 a, float32 b, float_status *status) 3210 { 3211 flag aSign, zSign; 3212 int aExp, bExp, expDiff; 3213 uint32_t aSig, bSig; 3214 uint32_t q; 3215 uint64_t aSig64, bSig64, q64; 3216 uint32_t alternateASig; 3217 int32_t sigMean; 3218 a = float32_squash_input_denormal(a, status); 3219 b = float32_squash_input_denormal(b, status); 3220 3221 aSig = extractFloat32Frac( a ); 3222 aExp = extractFloat32Exp( a ); 3223 aSign = extractFloat32Sign( a ); 3224 bSig = extractFloat32Frac( b ); 3225 bExp = extractFloat32Exp( b ); 3226 if ( aExp == 0xFF ) { 3227 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3228 return propagateFloat32NaN(a, b, status); 3229 } 3230 float_raise(float_flag_invalid, status); 3231 return float32_default_nan(status); 3232 } 3233 if ( bExp == 0xFF ) { 3234 if (bSig) { 3235 return propagateFloat32NaN(a, b, status); 3236 } 3237 return a; 3238 } 3239 if ( bExp == 0 ) { 3240 if ( bSig == 0 ) { 3241 float_raise(float_flag_invalid, status); 3242 return float32_default_nan(status); 3243 } 3244 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3245 } 3246 if ( aExp == 0 ) { 3247 if ( aSig == 0 ) return a; 3248 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3249 } 3250 expDiff = aExp - bExp; 3251 aSig |= 0x00800000; 3252 bSig |= 0x00800000; 3253 if ( expDiff < 32 ) { 3254 aSig <<= 8; 3255 bSig <<= 8; 3256 if ( expDiff < 0 ) { 3257 if ( expDiff < -1 ) return a; 3258 aSig >>= 1; 3259 } 3260 q = ( bSig <= aSig ); 3261 if ( q ) aSig -= bSig; 3262 if ( 0 < expDiff ) { 3263 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3264 q >>= 32 - expDiff; 3265 bSig >>= 2; 3266 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3267 } 3268 else { 3269 aSig >>= 2; 3270 bSig >>= 2; 3271 } 3272 } 3273 else { 3274 if ( bSig <= aSig ) aSig -= bSig; 3275 aSig64 = ( (uint64_t) aSig )<<40; 3276 bSig64 = ( (uint64_t) bSig )<<40; 3277 expDiff -= 64; 3278 while ( 0 < expDiff ) { 3279 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3280 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3281 aSig64 = - ( ( bSig * q64 )<<38 ); 3282 expDiff -= 62; 3283 } 3284 expDiff += 64; 3285 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3286 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3287 q = q64>>( 64 - expDiff ); 3288 bSig <<= 6; 3289 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3290 } 3291 do { 3292 alternateASig = aSig; 3293 ++q; 3294 aSig -= bSig; 3295 } while ( 0 <= (int32_t) aSig ); 3296 sigMean = aSig + alternateASig; 3297 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3298 aSig = alternateASig; 3299 } 3300 zSign = ( (int32_t) aSig < 0 ); 3301 if ( zSign ) aSig = - aSig; 3302 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3303 } 3304 3305 3306 /*---------------------------------------------------------------------------- 3307 | Returns the square root of the single-precision floating-point value `a'. 3308 | The operation is performed according to the IEC/IEEE Standard for Binary 3309 | Floating-Point Arithmetic. 3310 *----------------------------------------------------------------------------*/ 3311 3312 float32 float32_sqrt(float32 a, float_status *status) 3313 { 3314 flag aSign; 3315 int aExp, zExp; 3316 uint32_t aSig, zSig; 3317 uint64_t rem, term; 3318 a = float32_squash_input_denormal(a, status); 3319 3320 aSig = extractFloat32Frac( a ); 3321 aExp = extractFloat32Exp( a ); 3322 aSign = extractFloat32Sign( a ); 3323 if ( aExp == 0xFF ) { 3324 if (aSig) { 3325 return propagateFloat32NaN(a, float32_zero, status); 3326 } 3327 if ( ! aSign ) return a; 3328 float_raise(float_flag_invalid, status); 3329 return float32_default_nan(status); 3330 } 3331 if ( aSign ) { 3332 if ( ( aExp | aSig ) == 0 ) return a; 3333 float_raise(float_flag_invalid, status); 3334 return float32_default_nan(status); 3335 } 3336 if ( aExp == 0 ) { 3337 if ( aSig == 0 ) return float32_zero; 3338 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3339 } 3340 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; 3341 aSig = ( aSig | 0x00800000 )<<8; 3342 zSig = estimateSqrt32( aExp, aSig ) + 2; 3343 if ( ( zSig & 0x7F ) <= 5 ) { 3344 if ( zSig < 2 ) { 3345 zSig = 0x7FFFFFFF; 3346 goto roundAndPack; 3347 } 3348 aSig >>= aExp & 1; 3349 term = ( (uint64_t) zSig ) * zSig; 3350 rem = ( ( (uint64_t) aSig )<<32 ) - term; 3351 while ( (int64_t) rem < 0 ) { 3352 --zSig; 3353 rem += ( ( (uint64_t) zSig )<<1 ) | 1; 3354 } 3355 zSig |= ( rem != 0 ); 3356 } 3357 shift32RightJamming( zSig, 1, &zSig ); 3358 roundAndPack: 3359 return roundAndPackFloat32(0, zExp, zSig, status); 3360 3361 } 3362 3363 /*---------------------------------------------------------------------------- 3364 | Returns the binary exponential of the single-precision floating-point value 3365 | `a'. The operation is performed according to the IEC/IEEE Standard for 3366 | Binary Floating-Point Arithmetic. 3367 | 3368 | Uses the following identities: 3369 | 3370 | 1. ------------------------------------------------------------------------- 3371 | x x*ln(2) 3372 | 2 = e 3373 | 3374 | 2. ------------------------------------------------------------------------- 3375 | 2 3 4 5 n 3376 | x x x x x x x 3377 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3378 | 1! 2! 3! 4! 5! n! 3379 *----------------------------------------------------------------------------*/ 3380 3381 static const float64 float32_exp2_coefficients[15] = 3382 { 3383 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3384 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3385 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3386 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3387 const_float64( 0x3f81111111111111ll ), /* 5 */ 3388 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3389 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3390 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3391 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3392 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3393 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3394 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3395 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3396 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3397 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3398 }; 3399 3400 float32 float32_exp2(float32 a, float_status *status) 3401 { 3402 flag aSign; 3403 int aExp; 3404 uint32_t aSig; 3405 float64 r, x, xn; 3406 int i; 3407 a = float32_squash_input_denormal(a, status); 3408 3409 aSig = extractFloat32Frac( a ); 3410 aExp = extractFloat32Exp( a ); 3411 aSign = extractFloat32Sign( a ); 3412 3413 if ( aExp == 0xFF) { 3414 if (aSig) { 3415 return propagateFloat32NaN(a, float32_zero, status); 3416 } 3417 return (aSign) ? float32_zero : a; 3418 } 3419 if (aExp == 0) { 3420 if (aSig == 0) return float32_one; 3421 } 3422 3423 float_raise(float_flag_inexact, status); 3424 3425 /* ******************************* */ 3426 /* using float64 for approximation */ 3427 /* ******************************* */ 3428 x = float32_to_float64(a, status); 3429 x = float64_mul(x, float64_ln2, status); 3430 3431 xn = x; 3432 r = float64_one; 3433 for (i = 0 ; i < 15 ; i++) { 3434 float64 f; 3435 3436 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3437 r = float64_add(r, f, status); 3438 3439 xn = float64_mul(xn, x, status); 3440 } 3441 3442 return float64_to_float32(r, status); 3443 } 3444 3445 /*---------------------------------------------------------------------------- 3446 | Returns the binary log of the single-precision floating-point value `a'. 3447 | The operation is performed according to the IEC/IEEE Standard for Binary 3448 | Floating-Point Arithmetic. 3449 *----------------------------------------------------------------------------*/ 3450 float32 float32_log2(float32 a, float_status *status) 3451 { 3452 flag aSign, zSign; 3453 int aExp; 3454 uint32_t aSig, zSig, i; 3455 3456 a = float32_squash_input_denormal(a, status); 3457 aSig = extractFloat32Frac( a ); 3458 aExp = extractFloat32Exp( a ); 3459 aSign = extractFloat32Sign( a ); 3460 3461 if ( aExp == 0 ) { 3462 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3463 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3464 } 3465 if ( aSign ) { 3466 float_raise(float_flag_invalid, status); 3467 return float32_default_nan(status); 3468 } 3469 if ( aExp == 0xFF ) { 3470 if (aSig) { 3471 return propagateFloat32NaN(a, float32_zero, status); 3472 } 3473 return a; 3474 } 3475 3476 aExp -= 0x7F; 3477 aSig |= 0x00800000; 3478 zSign = aExp < 0; 3479 zSig = aExp << 23; 3480 3481 for (i = 1 << 22; i > 0; i >>= 1) { 3482 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3483 if ( aSig & 0x01000000 ) { 3484 aSig >>= 1; 3485 zSig |= i; 3486 } 3487 } 3488 3489 if ( zSign ) 3490 zSig = -zSig; 3491 3492 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3493 } 3494 3495 /*---------------------------------------------------------------------------- 3496 | Returns 1 if the single-precision floating-point value `a' is equal to 3497 | the corresponding value `b', and 0 otherwise. The invalid exception is 3498 | raised if either operand is a NaN. Otherwise, the comparison is performed 3499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3500 *----------------------------------------------------------------------------*/ 3501 3502 int float32_eq(float32 a, float32 b, float_status *status) 3503 { 3504 uint32_t av, bv; 3505 a = float32_squash_input_denormal(a, status); 3506 b = float32_squash_input_denormal(b, status); 3507 3508 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3509 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3510 ) { 3511 float_raise(float_flag_invalid, status); 3512 return 0; 3513 } 3514 av = float32_val(a); 3515 bv = float32_val(b); 3516 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3517 } 3518 3519 /*---------------------------------------------------------------------------- 3520 | Returns 1 if the single-precision floating-point value `a' is less than 3521 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3522 | exception is raised if either operand is a NaN. The comparison is performed 3523 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3524 *----------------------------------------------------------------------------*/ 3525 3526 int float32_le(float32 a, float32 b, float_status *status) 3527 { 3528 flag aSign, bSign; 3529 uint32_t av, bv; 3530 a = float32_squash_input_denormal(a, status); 3531 b = float32_squash_input_denormal(b, status); 3532 3533 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3534 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3535 ) { 3536 float_raise(float_flag_invalid, status); 3537 return 0; 3538 } 3539 aSign = extractFloat32Sign( a ); 3540 bSign = extractFloat32Sign( b ); 3541 av = float32_val(a); 3542 bv = float32_val(b); 3543 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3544 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3545 3546 } 3547 3548 /*---------------------------------------------------------------------------- 3549 | Returns 1 if the single-precision floating-point value `a' is less than 3550 | the corresponding value `b', and 0 otherwise. The invalid exception is 3551 | raised if either operand is a NaN. The comparison is performed according 3552 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3553 *----------------------------------------------------------------------------*/ 3554 3555 int float32_lt(float32 a, float32 b, float_status *status) 3556 { 3557 flag aSign, bSign; 3558 uint32_t av, bv; 3559 a = float32_squash_input_denormal(a, status); 3560 b = float32_squash_input_denormal(b, status); 3561 3562 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3563 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3564 ) { 3565 float_raise(float_flag_invalid, status); 3566 return 0; 3567 } 3568 aSign = extractFloat32Sign( a ); 3569 bSign = extractFloat32Sign( b ); 3570 av = float32_val(a); 3571 bv = float32_val(b); 3572 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3573 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3574 3575 } 3576 3577 /*---------------------------------------------------------------------------- 3578 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3579 | be compared, and 0 otherwise. The invalid exception is raised if either 3580 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3581 | Standard for Binary Floating-Point Arithmetic. 3582 *----------------------------------------------------------------------------*/ 3583 3584 int float32_unordered(float32 a, float32 b, float_status *status) 3585 { 3586 a = float32_squash_input_denormal(a, status); 3587 b = float32_squash_input_denormal(b, status); 3588 3589 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3590 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3591 ) { 3592 float_raise(float_flag_invalid, status); 3593 return 1; 3594 } 3595 return 0; 3596 } 3597 3598 /*---------------------------------------------------------------------------- 3599 | Returns 1 if the single-precision floating-point value `a' is equal to 3600 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3601 | exception. The comparison is performed according to the IEC/IEEE Standard 3602 | for Binary Floating-Point Arithmetic. 3603 *----------------------------------------------------------------------------*/ 3604 3605 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3606 { 3607 a = float32_squash_input_denormal(a, status); 3608 b = float32_squash_input_denormal(b, status); 3609 3610 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3611 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3612 ) { 3613 if (float32_is_signaling_nan(a, status) 3614 || float32_is_signaling_nan(b, status)) { 3615 float_raise(float_flag_invalid, status); 3616 } 3617 return 0; 3618 } 3619 return ( float32_val(a) == float32_val(b) ) || 3620 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3621 } 3622 3623 /*---------------------------------------------------------------------------- 3624 | Returns 1 if the single-precision floating-point value `a' is less than or 3625 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3626 | cause an exception. Otherwise, the comparison is performed according to the 3627 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3628 *----------------------------------------------------------------------------*/ 3629 3630 int float32_le_quiet(float32 a, float32 b, float_status *status) 3631 { 3632 flag aSign, bSign; 3633 uint32_t av, bv; 3634 a = float32_squash_input_denormal(a, status); 3635 b = float32_squash_input_denormal(b, status); 3636 3637 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3638 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3639 ) { 3640 if (float32_is_signaling_nan(a, status) 3641 || float32_is_signaling_nan(b, status)) { 3642 float_raise(float_flag_invalid, status); 3643 } 3644 return 0; 3645 } 3646 aSign = extractFloat32Sign( a ); 3647 bSign = extractFloat32Sign( b ); 3648 av = float32_val(a); 3649 bv = float32_val(b); 3650 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3651 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3652 3653 } 3654 3655 /*---------------------------------------------------------------------------- 3656 | Returns 1 if the single-precision floating-point value `a' is less than 3657 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3658 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3659 | Standard for Binary Floating-Point Arithmetic. 3660 *----------------------------------------------------------------------------*/ 3661 3662 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3663 { 3664 flag aSign, bSign; 3665 uint32_t av, bv; 3666 a = float32_squash_input_denormal(a, status); 3667 b = float32_squash_input_denormal(b, status); 3668 3669 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3670 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3671 ) { 3672 if (float32_is_signaling_nan(a, status) 3673 || float32_is_signaling_nan(b, status)) { 3674 float_raise(float_flag_invalid, status); 3675 } 3676 return 0; 3677 } 3678 aSign = extractFloat32Sign( a ); 3679 bSign = extractFloat32Sign( b ); 3680 av = float32_val(a); 3681 bv = float32_val(b); 3682 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3683 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3684 3685 } 3686 3687 /*---------------------------------------------------------------------------- 3688 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3689 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3690 | comparison is performed according to the IEC/IEEE Standard for Binary 3691 | Floating-Point Arithmetic. 3692 *----------------------------------------------------------------------------*/ 3693 3694 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3695 { 3696 a = float32_squash_input_denormal(a, status); 3697 b = float32_squash_input_denormal(b, status); 3698 3699 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3700 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3701 ) { 3702 if (float32_is_signaling_nan(a, status) 3703 || float32_is_signaling_nan(b, status)) { 3704 float_raise(float_flag_invalid, status); 3705 } 3706 return 1; 3707 } 3708 return 0; 3709 } 3710 3711 3712 /*---------------------------------------------------------------------------- 3713 | Returns the result of converting the double-precision floating-point value 3714 | `a' to the single-precision floating-point format. The conversion is 3715 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3716 | Arithmetic. 3717 *----------------------------------------------------------------------------*/ 3718 3719 float32 float64_to_float32(float64 a, float_status *status) 3720 { 3721 flag aSign; 3722 int aExp; 3723 uint64_t aSig; 3724 uint32_t zSig; 3725 a = float64_squash_input_denormal(a, status); 3726 3727 aSig = extractFloat64Frac( a ); 3728 aExp = extractFloat64Exp( a ); 3729 aSign = extractFloat64Sign( a ); 3730 if ( aExp == 0x7FF ) { 3731 if (aSig) { 3732 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3733 } 3734 return packFloat32( aSign, 0xFF, 0 ); 3735 } 3736 shift64RightJamming( aSig, 22, &aSig ); 3737 zSig = aSig; 3738 if ( aExp || zSig ) { 3739 zSig |= 0x40000000; 3740 aExp -= 0x381; 3741 } 3742 return roundAndPackFloat32(aSign, aExp, zSig, status); 3743 3744 } 3745 3746 3747 /*---------------------------------------------------------------------------- 3748 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3749 | half-precision floating-point value, returning the result. After being 3750 | shifted into the proper positions, the three fields are simply added 3751 | together to form the result. This means that any integer portion of `zSig' 3752 | will be added into the exponent. Since a properly normalized significand 3753 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3754 | than the desired result exponent whenever `zSig' is a complete, normalized 3755 | significand. 3756 *----------------------------------------------------------------------------*/ 3757 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3758 { 3759 return make_float16( 3760 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3761 } 3762 3763 /*---------------------------------------------------------------------------- 3764 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3765 | and significand `zSig', and returns the proper half-precision floating- 3766 | point value corresponding to the abstract input. Ordinarily, the abstract 3767 | value is simply rounded and packed into the half-precision format, with 3768 | the inexact exception raised if the abstract input cannot be represented 3769 | exactly. However, if the abstract value is too large, the overflow and 3770 | inexact exceptions are raised and an infinity or maximal finite value is 3771 | returned. If the abstract value is too small, the input value is rounded to 3772 | a subnormal number, and the underflow and inexact exceptions are raised if 3773 | the abstract input cannot be represented exactly as a subnormal half- 3774 | precision floating-point number. 3775 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3776 | ARM-style "alternative representation", which omits the NaN and Inf 3777 | encodings in order to raise the maximum representable exponent by one. 3778 | The input significand `zSig' has its binary point between bits 22 3779 | and 23, which is 13 bits to the left of the usual location. This shifted 3780 | significand must be normalized or smaller. If `zSig' is not normalized, 3781 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3782 | and it must not require rounding. In the usual case that `zSig' is 3783 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3784 | Note the slightly odd position of the binary point in zSig compared with the 3785 | other roundAndPackFloat functions. This should probably be fixed if we 3786 | need to implement more float16 routines than just conversion. 3787 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3788 | Binary Floating-Point Arithmetic. 3789 *----------------------------------------------------------------------------*/ 3790 3791 static float16 roundAndPackFloat16(flag zSign, int zExp, 3792 uint32_t zSig, flag ieee, 3793 float_status *status) 3794 { 3795 int maxexp = ieee ? 29 : 30; 3796 uint32_t mask; 3797 uint32_t increment; 3798 bool rounding_bumps_exp; 3799 bool is_tiny = false; 3800 3801 /* Calculate the mask of bits of the mantissa which are not 3802 * representable in half-precision and will be lost. 3803 */ 3804 if (zExp < 1) { 3805 /* Will be denormal in halfprec */ 3806 mask = 0x00ffffff; 3807 if (zExp >= -11) { 3808 mask >>= 11 + zExp; 3809 } 3810 } else { 3811 /* Normal number in halfprec */ 3812 mask = 0x00001fff; 3813 } 3814 3815 switch (status->float_rounding_mode) { 3816 case float_round_nearest_even: 3817 increment = (mask + 1) >> 1; 3818 if ((zSig & mask) == increment) { 3819 increment = zSig & (increment << 1); 3820 } 3821 break; 3822 case float_round_ties_away: 3823 increment = (mask + 1) >> 1; 3824 break; 3825 case float_round_up: 3826 increment = zSign ? 0 : mask; 3827 break; 3828 case float_round_down: 3829 increment = zSign ? mask : 0; 3830 break; 3831 default: /* round_to_zero */ 3832 increment = 0; 3833 break; 3834 } 3835 3836 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3837 3838 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3839 if (ieee) { 3840 float_raise(float_flag_overflow | float_flag_inexact, status); 3841 return packFloat16(zSign, 0x1f, 0); 3842 } else { 3843 float_raise(float_flag_invalid, status); 3844 return packFloat16(zSign, 0x1f, 0x3ff); 3845 } 3846 } 3847 3848 if (zExp < 0) { 3849 /* Note that flush-to-zero does not affect half-precision results */ 3850 is_tiny = 3851 (status->float_detect_tininess == float_tininess_before_rounding) 3852 || (zExp < -1) 3853 || (!rounding_bumps_exp); 3854 } 3855 if (zSig & mask) { 3856 float_raise(float_flag_inexact, status); 3857 if (is_tiny) { 3858 float_raise(float_flag_underflow, status); 3859 } 3860 } 3861 3862 zSig += increment; 3863 if (rounding_bumps_exp) { 3864 zSig >>= 1; 3865 zExp++; 3866 } 3867 3868 if (zExp < -10) { 3869 return packFloat16(zSign, 0, 0); 3870 } 3871 if (zExp < 0) { 3872 zSig >>= -zExp; 3873 zExp = 0; 3874 } 3875 return packFloat16(zSign, zExp, zSig >> 13); 3876 } 3877 3878 /*---------------------------------------------------------------------------- 3879 | If `a' is denormal and we are in flush-to-zero mode then set the 3880 | input-denormal exception and return zero. Otherwise just return the value. 3881 *----------------------------------------------------------------------------*/ 3882 float16 float16_squash_input_denormal(float16 a, float_status *status) 3883 { 3884 if (status->flush_inputs_to_zero) { 3885 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3886 float_raise(float_flag_input_denormal, status); 3887 return make_float16(float16_val(a) & 0x8000); 3888 } 3889 } 3890 return a; 3891 } 3892 3893 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3894 uint32_t *zSigPtr) 3895 { 3896 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3897 *zSigPtr = aSig << shiftCount; 3898 *zExpPtr = 1 - shiftCount; 3899 } 3900 3901 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3902 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3903 3904 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3905 { 3906 flag aSign; 3907 int aExp; 3908 uint32_t aSig; 3909 3910 aSign = extractFloat16Sign(a); 3911 aExp = extractFloat16Exp(a); 3912 aSig = extractFloat16Frac(a); 3913 3914 if (aExp == 0x1f && ieee) { 3915 if (aSig) { 3916 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3917 } 3918 return packFloat32(aSign, 0xff, 0); 3919 } 3920 if (aExp == 0) { 3921 if (aSig == 0) { 3922 return packFloat32(aSign, 0, 0); 3923 } 3924 3925 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3926 aExp--; 3927 } 3928 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3929 } 3930 3931 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3932 { 3933 flag aSign; 3934 int aExp; 3935 uint32_t aSig; 3936 3937 a = float32_squash_input_denormal(a, status); 3938 3939 aSig = extractFloat32Frac( a ); 3940 aExp = extractFloat32Exp( a ); 3941 aSign = extractFloat32Sign( a ); 3942 if ( aExp == 0xFF ) { 3943 if (aSig) { 3944 /* Input is a NaN */ 3945 if (!ieee) { 3946 float_raise(float_flag_invalid, status); 3947 return packFloat16(aSign, 0, 0); 3948 } 3949 return commonNaNToFloat16( 3950 float32ToCommonNaN(a, status), status); 3951 } 3952 /* Infinity */ 3953 if (!ieee) { 3954 float_raise(float_flag_invalid, status); 3955 return packFloat16(aSign, 0x1f, 0x3ff); 3956 } 3957 return packFloat16(aSign, 0x1f, 0); 3958 } 3959 if (aExp == 0 && aSig == 0) { 3960 return packFloat16(aSign, 0, 0); 3961 } 3962 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3963 * even if the input is denormal; however this is harmless because 3964 * the largest possible single-precision denormal is still smaller 3965 * than the smallest representable half-precision denormal, and so we 3966 * will end up ignoring aSig and returning via the "always return zero" 3967 * codepath. 3968 */ 3969 aSig |= 0x00800000; 3970 aExp -= 0x71; 3971 3972 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3973 } 3974 3975 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3976 { 3977 flag aSign; 3978 int aExp; 3979 uint32_t aSig; 3980 3981 aSign = extractFloat16Sign(a); 3982 aExp = extractFloat16Exp(a); 3983 aSig = extractFloat16Frac(a); 3984 3985 if (aExp == 0x1f && ieee) { 3986 if (aSig) { 3987 return commonNaNToFloat64( 3988 float16ToCommonNaN(a, status), status); 3989 } 3990 return packFloat64(aSign, 0x7ff, 0); 3991 } 3992 if (aExp == 0) { 3993 if (aSig == 0) { 3994 return packFloat64(aSign, 0, 0); 3995 } 3996 3997 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3998 aExp--; 3999 } 4000 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 4001 } 4002 4003 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 4004 { 4005 flag aSign; 4006 int aExp; 4007 uint64_t aSig; 4008 uint32_t zSig; 4009 4010 a = float64_squash_input_denormal(a, status); 4011 4012 aSig = extractFloat64Frac(a); 4013 aExp = extractFloat64Exp(a); 4014 aSign = extractFloat64Sign(a); 4015 if (aExp == 0x7FF) { 4016 if (aSig) { 4017 /* Input is a NaN */ 4018 if (!ieee) { 4019 float_raise(float_flag_invalid, status); 4020 return packFloat16(aSign, 0, 0); 4021 } 4022 return commonNaNToFloat16( 4023 float64ToCommonNaN(a, status), status); 4024 } 4025 /* Infinity */ 4026 if (!ieee) { 4027 float_raise(float_flag_invalid, status); 4028 return packFloat16(aSign, 0x1f, 0x3ff); 4029 } 4030 return packFloat16(aSign, 0x1f, 0); 4031 } 4032 shift64RightJamming(aSig, 29, &aSig); 4033 zSig = aSig; 4034 if (aExp == 0 && zSig == 0) { 4035 return packFloat16(aSign, 0, 0); 4036 } 4037 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4038 * even if the input is denormal; however this is harmless because 4039 * the largest possible single-precision denormal is still smaller 4040 * than the smallest representable half-precision denormal, and so we 4041 * will end up ignoring aSig and returning via the "always return zero" 4042 * codepath. 4043 */ 4044 zSig |= 0x00800000; 4045 aExp -= 0x3F1; 4046 4047 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4048 } 4049 4050 /*---------------------------------------------------------------------------- 4051 | Returns the result of converting the double-precision floating-point value 4052 | `a' to the extended double-precision floating-point format. The conversion 4053 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4054 | Arithmetic. 4055 *----------------------------------------------------------------------------*/ 4056 4057 floatx80 float64_to_floatx80(float64 a, float_status *status) 4058 { 4059 flag aSign; 4060 int aExp; 4061 uint64_t aSig; 4062 4063 a = float64_squash_input_denormal(a, status); 4064 aSig = extractFloat64Frac( a ); 4065 aExp = extractFloat64Exp( a ); 4066 aSign = extractFloat64Sign( a ); 4067 if ( aExp == 0x7FF ) { 4068 if (aSig) { 4069 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4070 } 4071 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4072 } 4073 if ( aExp == 0 ) { 4074 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4075 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4076 } 4077 return 4078 packFloatx80( 4079 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4080 4081 } 4082 4083 /*---------------------------------------------------------------------------- 4084 | Returns the result of converting the double-precision floating-point value 4085 | `a' to the quadruple-precision floating-point format. The conversion is 4086 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4087 | Arithmetic. 4088 *----------------------------------------------------------------------------*/ 4089 4090 float128 float64_to_float128(float64 a, float_status *status) 4091 { 4092 flag aSign; 4093 int aExp; 4094 uint64_t aSig, zSig0, zSig1; 4095 4096 a = float64_squash_input_denormal(a, status); 4097 aSig = extractFloat64Frac( a ); 4098 aExp = extractFloat64Exp( a ); 4099 aSign = extractFloat64Sign( a ); 4100 if ( aExp == 0x7FF ) { 4101 if (aSig) { 4102 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4103 } 4104 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4105 } 4106 if ( aExp == 0 ) { 4107 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4108 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4109 --aExp; 4110 } 4111 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4112 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4113 4114 } 4115 4116 4117 /*---------------------------------------------------------------------------- 4118 | Returns the remainder of the double-precision floating-point value `a' 4119 | with respect to the corresponding value `b'. The operation is performed 4120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4121 *----------------------------------------------------------------------------*/ 4122 4123 float64 float64_rem(float64 a, float64 b, float_status *status) 4124 { 4125 flag aSign, zSign; 4126 int aExp, bExp, expDiff; 4127 uint64_t aSig, bSig; 4128 uint64_t q, alternateASig; 4129 int64_t sigMean; 4130 4131 a = float64_squash_input_denormal(a, status); 4132 b = float64_squash_input_denormal(b, status); 4133 aSig = extractFloat64Frac( a ); 4134 aExp = extractFloat64Exp( a ); 4135 aSign = extractFloat64Sign( a ); 4136 bSig = extractFloat64Frac( b ); 4137 bExp = extractFloat64Exp( b ); 4138 if ( aExp == 0x7FF ) { 4139 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4140 return propagateFloat64NaN(a, b, status); 4141 } 4142 float_raise(float_flag_invalid, status); 4143 return float64_default_nan(status); 4144 } 4145 if ( bExp == 0x7FF ) { 4146 if (bSig) { 4147 return propagateFloat64NaN(a, b, status); 4148 } 4149 return a; 4150 } 4151 if ( bExp == 0 ) { 4152 if ( bSig == 0 ) { 4153 float_raise(float_flag_invalid, status); 4154 return float64_default_nan(status); 4155 } 4156 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4157 } 4158 if ( aExp == 0 ) { 4159 if ( aSig == 0 ) return a; 4160 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4161 } 4162 expDiff = aExp - bExp; 4163 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4164 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4165 if ( expDiff < 0 ) { 4166 if ( expDiff < -1 ) return a; 4167 aSig >>= 1; 4168 } 4169 q = ( bSig <= aSig ); 4170 if ( q ) aSig -= bSig; 4171 expDiff -= 64; 4172 while ( 0 < expDiff ) { 4173 q = estimateDiv128To64( aSig, 0, bSig ); 4174 q = ( 2 < q ) ? q - 2 : 0; 4175 aSig = - ( ( bSig>>2 ) * q ); 4176 expDiff -= 62; 4177 } 4178 expDiff += 64; 4179 if ( 0 < expDiff ) { 4180 q = estimateDiv128To64( aSig, 0, bSig ); 4181 q = ( 2 < q ) ? q - 2 : 0; 4182 q >>= 64 - expDiff; 4183 bSig >>= 2; 4184 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4185 } 4186 else { 4187 aSig >>= 2; 4188 bSig >>= 2; 4189 } 4190 do { 4191 alternateASig = aSig; 4192 ++q; 4193 aSig -= bSig; 4194 } while ( 0 <= (int64_t) aSig ); 4195 sigMean = aSig + alternateASig; 4196 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4197 aSig = alternateASig; 4198 } 4199 zSign = ( (int64_t) aSig < 0 ); 4200 if ( zSign ) aSig = - aSig; 4201 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4202 4203 } 4204 4205 4206 /*---------------------------------------------------------------------------- 4207 | Returns the square root of the double-precision floating-point value `a'. 4208 | The operation is performed according to the IEC/IEEE Standard for Binary 4209 | Floating-Point Arithmetic. 4210 *----------------------------------------------------------------------------*/ 4211 4212 float64 float64_sqrt(float64 a, float_status *status) 4213 { 4214 flag aSign; 4215 int aExp, zExp; 4216 uint64_t aSig, zSig, doubleZSig; 4217 uint64_t rem0, rem1, term0, term1; 4218 a = float64_squash_input_denormal(a, status); 4219 4220 aSig = extractFloat64Frac( a ); 4221 aExp = extractFloat64Exp( a ); 4222 aSign = extractFloat64Sign( a ); 4223 if ( aExp == 0x7FF ) { 4224 if (aSig) { 4225 return propagateFloat64NaN(a, a, status); 4226 } 4227 if ( ! aSign ) return a; 4228 float_raise(float_flag_invalid, status); 4229 return float64_default_nan(status); 4230 } 4231 if ( aSign ) { 4232 if ( ( aExp | aSig ) == 0 ) return a; 4233 float_raise(float_flag_invalid, status); 4234 return float64_default_nan(status); 4235 } 4236 if ( aExp == 0 ) { 4237 if ( aSig == 0 ) return float64_zero; 4238 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4239 } 4240 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; 4241 aSig |= LIT64( 0x0010000000000000 ); 4242 zSig = estimateSqrt32( aExp, aSig>>21 ); 4243 aSig <<= 9 - ( aExp & 1 ); 4244 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); 4245 if ( ( zSig & 0x1FF ) <= 5 ) { 4246 doubleZSig = zSig<<1; 4247 mul64To128( zSig, zSig, &term0, &term1 ); 4248 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); 4249 while ( (int64_t) rem0 < 0 ) { 4250 --zSig; 4251 doubleZSig -= 2; 4252 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); 4253 } 4254 zSig |= ( ( rem0 | rem1 ) != 0 ); 4255 } 4256 return roundAndPackFloat64(0, zExp, zSig, status); 4257 4258 } 4259 4260 /*---------------------------------------------------------------------------- 4261 | Returns the binary log of the double-precision floating-point value `a'. 4262 | The operation is performed according to the IEC/IEEE Standard for Binary 4263 | Floating-Point Arithmetic. 4264 *----------------------------------------------------------------------------*/ 4265 float64 float64_log2(float64 a, float_status *status) 4266 { 4267 flag aSign, zSign; 4268 int aExp; 4269 uint64_t aSig, aSig0, aSig1, zSig, i; 4270 a = float64_squash_input_denormal(a, status); 4271 4272 aSig = extractFloat64Frac( a ); 4273 aExp = extractFloat64Exp( a ); 4274 aSign = extractFloat64Sign( a ); 4275 4276 if ( aExp == 0 ) { 4277 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4278 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4279 } 4280 if ( aSign ) { 4281 float_raise(float_flag_invalid, status); 4282 return float64_default_nan(status); 4283 } 4284 if ( aExp == 0x7FF ) { 4285 if (aSig) { 4286 return propagateFloat64NaN(a, float64_zero, status); 4287 } 4288 return a; 4289 } 4290 4291 aExp -= 0x3FF; 4292 aSig |= LIT64( 0x0010000000000000 ); 4293 zSign = aExp < 0; 4294 zSig = (uint64_t)aExp << 52; 4295 for (i = 1LL << 51; i > 0; i >>= 1) { 4296 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4297 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4298 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4299 aSig >>= 1; 4300 zSig |= i; 4301 } 4302 } 4303 4304 if ( zSign ) 4305 zSig = -zSig; 4306 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4307 } 4308 4309 /*---------------------------------------------------------------------------- 4310 | Returns 1 if the double-precision floating-point value `a' is equal to the 4311 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4312 | if either operand is a NaN. Otherwise, the comparison is performed 4313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4314 *----------------------------------------------------------------------------*/ 4315 4316 int float64_eq(float64 a, float64 b, float_status *status) 4317 { 4318 uint64_t av, bv; 4319 a = float64_squash_input_denormal(a, status); 4320 b = float64_squash_input_denormal(b, status); 4321 4322 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4323 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4324 ) { 4325 float_raise(float_flag_invalid, status); 4326 return 0; 4327 } 4328 av = float64_val(a); 4329 bv = float64_val(b); 4330 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4331 4332 } 4333 4334 /*---------------------------------------------------------------------------- 4335 | Returns 1 if the double-precision floating-point value `a' is less than or 4336 | equal to the corresponding value `b', and 0 otherwise. The invalid 4337 | exception is raised if either operand is a NaN. The comparison is performed 4338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4339 *----------------------------------------------------------------------------*/ 4340 4341 int float64_le(float64 a, float64 b, float_status *status) 4342 { 4343 flag aSign, bSign; 4344 uint64_t av, bv; 4345 a = float64_squash_input_denormal(a, status); 4346 b = float64_squash_input_denormal(b, status); 4347 4348 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4349 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4350 ) { 4351 float_raise(float_flag_invalid, status); 4352 return 0; 4353 } 4354 aSign = extractFloat64Sign( a ); 4355 bSign = extractFloat64Sign( b ); 4356 av = float64_val(a); 4357 bv = float64_val(b); 4358 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4359 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4360 4361 } 4362 4363 /*---------------------------------------------------------------------------- 4364 | Returns 1 if the double-precision floating-point value `a' is less than 4365 | the corresponding value `b', and 0 otherwise. The invalid exception is 4366 | raised if either operand is a NaN. The comparison is performed according 4367 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4368 *----------------------------------------------------------------------------*/ 4369 4370 int float64_lt(float64 a, float64 b, float_status *status) 4371 { 4372 flag aSign, bSign; 4373 uint64_t av, bv; 4374 4375 a = float64_squash_input_denormal(a, status); 4376 b = float64_squash_input_denormal(b, status); 4377 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4378 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4379 ) { 4380 float_raise(float_flag_invalid, status); 4381 return 0; 4382 } 4383 aSign = extractFloat64Sign( a ); 4384 bSign = extractFloat64Sign( b ); 4385 av = float64_val(a); 4386 bv = float64_val(b); 4387 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4388 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4389 4390 } 4391 4392 /*---------------------------------------------------------------------------- 4393 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4394 | be compared, and 0 otherwise. The invalid exception is raised if either 4395 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4396 | Standard for Binary Floating-Point Arithmetic. 4397 *----------------------------------------------------------------------------*/ 4398 4399 int float64_unordered(float64 a, float64 b, float_status *status) 4400 { 4401 a = float64_squash_input_denormal(a, status); 4402 b = float64_squash_input_denormal(b, status); 4403 4404 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4405 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4406 ) { 4407 float_raise(float_flag_invalid, status); 4408 return 1; 4409 } 4410 return 0; 4411 } 4412 4413 /*---------------------------------------------------------------------------- 4414 | Returns 1 if the double-precision floating-point value `a' is equal to the 4415 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4416 | exception.The comparison is performed according to the IEC/IEEE Standard 4417 | for Binary Floating-Point Arithmetic. 4418 *----------------------------------------------------------------------------*/ 4419 4420 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4421 { 4422 uint64_t av, bv; 4423 a = float64_squash_input_denormal(a, status); 4424 b = float64_squash_input_denormal(b, status); 4425 4426 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4427 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4428 ) { 4429 if (float64_is_signaling_nan(a, status) 4430 || float64_is_signaling_nan(b, status)) { 4431 float_raise(float_flag_invalid, status); 4432 } 4433 return 0; 4434 } 4435 av = float64_val(a); 4436 bv = float64_val(b); 4437 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4438 4439 } 4440 4441 /*---------------------------------------------------------------------------- 4442 | Returns 1 if the double-precision floating-point value `a' is less than or 4443 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4444 | cause an exception. Otherwise, the comparison is performed according to the 4445 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4446 *----------------------------------------------------------------------------*/ 4447 4448 int float64_le_quiet(float64 a, float64 b, float_status *status) 4449 { 4450 flag aSign, bSign; 4451 uint64_t av, bv; 4452 a = float64_squash_input_denormal(a, status); 4453 b = float64_squash_input_denormal(b, status); 4454 4455 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4456 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4457 ) { 4458 if (float64_is_signaling_nan(a, status) 4459 || float64_is_signaling_nan(b, status)) { 4460 float_raise(float_flag_invalid, status); 4461 } 4462 return 0; 4463 } 4464 aSign = extractFloat64Sign( a ); 4465 bSign = extractFloat64Sign( b ); 4466 av = float64_val(a); 4467 bv = float64_val(b); 4468 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4469 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4470 4471 } 4472 4473 /*---------------------------------------------------------------------------- 4474 | Returns 1 if the double-precision floating-point value `a' is less than 4475 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4476 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4477 | Standard for Binary Floating-Point Arithmetic. 4478 *----------------------------------------------------------------------------*/ 4479 4480 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4481 { 4482 flag aSign, bSign; 4483 uint64_t av, bv; 4484 a = float64_squash_input_denormal(a, status); 4485 b = float64_squash_input_denormal(b, status); 4486 4487 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4488 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4489 ) { 4490 if (float64_is_signaling_nan(a, status) 4491 || float64_is_signaling_nan(b, status)) { 4492 float_raise(float_flag_invalid, status); 4493 } 4494 return 0; 4495 } 4496 aSign = extractFloat64Sign( a ); 4497 bSign = extractFloat64Sign( b ); 4498 av = float64_val(a); 4499 bv = float64_val(b); 4500 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4501 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4502 4503 } 4504 4505 /*---------------------------------------------------------------------------- 4506 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4507 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4508 | comparison is performed according to the IEC/IEEE Standard for Binary 4509 | Floating-Point Arithmetic. 4510 *----------------------------------------------------------------------------*/ 4511 4512 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4513 { 4514 a = float64_squash_input_denormal(a, status); 4515 b = float64_squash_input_denormal(b, status); 4516 4517 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4518 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4519 ) { 4520 if (float64_is_signaling_nan(a, status) 4521 || float64_is_signaling_nan(b, status)) { 4522 float_raise(float_flag_invalid, status); 4523 } 4524 return 1; 4525 } 4526 return 0; 4527 } 4528 4529 /*---------------------------------------------------------------------------- 4530 | Returns the result of converting the extended double-precision floating- 4531 | point value `a' to the 32-bit two's complement integer format. The 4532 | conversion is performed according to the IEC/IEEE Standard for Binary 4533 | Floating-Point Arithmetic---which means in particular that the conversion 4534 | is rounded according to the current rounding mode. If `a' is a NaN, the 4535 | largest positive integer is returned. Otherwise, if the conversion 4536 | overflows, the largest integer with the same sign as `a' is returned. 4537 *----------------------------------------------------------------------------*/ 4538 4539 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4540 { 4541 flag aSign; 4542 int32_t aExp, shiftCount; 4543 uint64_t aSig; 4544 4545 if (floatx80_invalid_encoding(a)) { 4546 float_raise(float_flag_invalid, status); 4547 return 1 << 31; 4548 } 4549 aSig = extractFloatx80Frac( a ); 4550 aExp = extractFloatx80Exp( a ); 4551 aSign = extractFloatx80Sign( a ); 4552 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4553 shiftCount = 0x4037 - aExp; 4554 if ( shiftCount <= 0 ) shiftCount = 1; 4555 shift64RightJamming( aSig, shiftCount, &aSig ); 4556 return roundAndPackInt32(aSign, aSig, status); 4557 4558 } 4559 4560 /*---------------------------------------------------------------------------- 4561 | Returns the result of converting the extended double-precision floating- 4562 | point value `a' to the 32-bit two's complement integer format. The 4563 | conversion is performed according to the IEC/IEEE Standard for Binary 4564 | Floating-Point Arithmetic, except that the conversion is always rounded 4565 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4566 | Otherwise, if the conversion overflows, the largest integer with the same 4567 | sign as `a' is returned. 4568 *----------------------------------------------------------------------------*/ 4569 4570 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4571 { 4572 flag aSign; 4573 int32_t aExp, shiftCount; 4574 uint64_t aSig, savedASig; 4575 int32_t z; 4576 4577 if (floatx80_invalid_encoding(a)) { 4578 float_raise(float_flag_invalid, status); 4579 return 1 << 31; 4580 } 4581 aSig = extractFloatx80Frac( a ); 4582 aExp = extractFloatx80Exp( a ); 4583 aSign = extractFloatx80Sign( a ); 4584 if ( 0x401E < aExp ) { 4585 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4586 goto invalid; 4587 } 4588 else if ( aExp < 0x3FFF ) { 4589 if (aExp || aSig) { 4590 status->float_exception_flags |= float_flag_inexact; 4591 } 4592 return 0; 4593 } 4594 shiftCount = 0x403E - aExp; 4595 savedASig = aSig; 4596 aSig >>= shiftCount; 4597 z = aSig; 4598 if ( aSign ) z = - z; 4599 if ( ( z < 0 ) ^ aSign ) { 4600 invalid: 4601 float_raise(float_flag_invalid, status); 4602 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4603 } 4604 if ( ( aSig<<shiftCount ) != savedASig ) { 4605 status->float_exception_flags |= float_flag_inexact; 4606 } 4607 return z; 4608 4609 } 4610 4611 /*---------------------------------------------------------------------------- 4612 | Returns the result of converting the extended double-precision floating- 4613 | point value `a' to the 64-bit two's complement integer format. The 4614 | conversion is performed according to the IEC/IEEE Standard for Binary 4615 | Floating-Point Arithmetic---which means in particular that the conversion 4616 | is rounded according to the current rounding mode. If `a' is a NaN, 4617 | the largest positive integer is returned. Otherwise, if the conversion 4618 | overflows, the largest integer with the same sign as `a' is returned. 4619 *----------------------------------------------------------------------------*/ 4620 4621 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4622 { 4623 flag aSign; 4624 int32_t aExp, shiftCount; 4625 uint64_t aSig, aSigExtra; 4626 4627 if (floatx80_invalid_encoding(a)) { 4628 float_raise(float_flag_invalid, status); 4629 return 1ULL << 63; 4630 } 4631 aSig = extractFloatx80Frac( a ); 4632 aExp = extractFloatx80Exp( a ); 4633 aSign = extractFloatx80Sign( a ); 4634 shiftCount = 0x403E - aExp; 4635 if ( shiftCount <= 0 ) { 4636 if ( shiftCount ) { 4637 float_raise(float_flag_invalid, status); 4638 if ( ! aSign 4639 || ( ( aExp == 0x7FFF ) 4640 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4641 ) { 4642 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4643 } 4644 return (int64_t) LIT64( 0x8000000000000000 ); 4645 } 4646 aSigExtra = 0; 4647 } 4648 else { 4649 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4650 } 4651 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4652 4653 } 4654 4655 /*---------------------------------------------------------------------------- 4656 | Returns the result of converting the extended double-precision floating- 4657 | point value `a' to the 64-bit two's complement integer format. The 4658 | conversion is performed according to the IEC/IEEE Standard for Binary 4659 | Floating-Point Arithmetic, except that the conversion is always rounded 4660 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4661 | Otherwise, if the conversion overflows, the largest integer with the same 4662 | sign as `a' is returned. 4663 *----------------------------------------------------------------------------*/ 4664 4665 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4666 { 4667 flag aSign; 4668 int32_t aExp, shiftCount; 4669 uint64_t aSig; 4670 int64_t z; 4671 4672 if (floatx80_invalid_encoding(a)) { 4673 float_raise(float_flag_invalid, status); 4674 return 1ULL << 63; 4675 } 4676 aSig = extractFloatx80Frac( a ); 4677 aExp = extractFloatx80Exp( a ); 4678 aSign = extractFloatx80Sign( a ); 4679 shiftCount = aExp - 0x403E; 4680 if ( 0 <= shiftCount ) { 4681 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4682 if ( ( a.high != 0xC03E ) || aSig ) { 4683 float_raise(float_flag_invalid, status); 4684 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4685 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4686 } 4687 } 4688 return (int64_t) LIT64( 0x8000000000000000 ); 4689 } 4690 else if ( aExp < 0x3FFF ) { 4691 if (aExp | aSig) { 4692 status->float_exception_flags |= float_flag_inexact; 4693 } 4694 return 0; 4695 } 4696 z = aSig>>( - shiftCount ); 4697 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4698 status->float_exception_flags |= float_flag_inexact; 4699 } 4700 if ( aSign ) z = - z; 4701 return z; 4702 4703 } 4704 4705 /*---------------------------------------------------------------------------- 4706 | Returns the result of converting the extended double-precision floating- 4707 | point value `a' to the single-precision floating-point format. The 4708 | conversion is performed according to the IEC/IEEE Standard for Binary 4709 | Floating-Point Arithmetic. 4710 *----------------------------------------------------------------------------*/ 4711 4712 float32 floatx80_to_float32(floatx80 a, float_status *status) 4713 { 4714 flag aSign; 4715 int32_t aExp; 4716 uint64_t aSig; 4717 4718 if (floatx80_invalid_encoding(a)) { 4719 float_raise(float_flag_invalid, status); 4720 return float32_default_nan(status); 4721 } 4722 aSig = extractFloatx80Frac( a ); 4723 aExp = extractFloatx80Exp( a ); 4724 aSign = extractFloatx80Sign( a ); 4725 if ( aExp == 0x7FFF ) { 4726 if ( (uint64_t) ( aSig<<1 ) ) { 4727 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4728 } 4729 return packFloat32( aSign, 0xFF, 0 ); 4730 } 4731 shift64RightJamming( aSig, 33, &aSig ); 4732 if ( aExp || aSig ) aExp -= 0x3F81; 4733 return roundAndPackFloat32(aSign, aExp, aSig, status); 4734 4735 } 4736 4737 /*---------------------------------------------------------------------------- 4738 | Returns the result of converting the extended double-precision floating- 4739 | point value `a' to the double-precision floating-point format. The 4740 | conversion is performed according to the IEC/IEEE Standard for Binary 4741 | Floating-Point Arithmetic. 4742 *----------------------------------------------------------------------------*/ 4743 4744 float64 floatx80_to_float64(floatx80 a, float_status *status) 4745 { 4746 flag aSign; 4747 int32_t aExp; 4748 uint64_t aSig, zSig; 4749 4750 if (floatx80_invalid_encoding(a)) { 4751 float_raise(float_flag_invalid, status); 4752 return float64_default_nan(status); 4753 } 4754 aSig = extractFloatx80Frac( a ); 4755 aExp = extractFloatx80Exp( a ); 4756 aSign = extractFloatx80Sign( a ); 4757 if ( aExp == 0x7FFF ) { 4758 if ( (uint64_t) ( aSig<<1 ) ) { 4759 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4760 } 4761 return packFloat64( aSign, 0x7FF, 0 ); 4762 } 4763 shift64RightJamming( aSig, 1, &zSig ); 4764 if ( aExp || aSig ) aExp -= 0x3C01; 4765 return roundAndPackFloat64(aSign, aExp, zSig, status); 4766 4767 } 4768 4769 /*---------------------------------------------------------------------------- 4770 | Returns the result of converting the extended double-precision floating- 4771 | point value `a' to the quadruple-precision floating-point format. The 4772 | conversion is performed according to the IEC/IEEE Standard for Binary 4773 | Floating-Point Arithmetic. 4774 *----------------------------------------------------------------------------*/ 4775 4776 float128 floatx80_to_float128(floatx80 a, float_status *status) 4777 { 4778 flag aSign; 4779 int aExp; 4780 uint64_t aSig, zSig0, zSig1; 4781 4782 if (floatx80_invalid_encoding(a)) { 4783 float_raise(float_flag_invalid, status); 4784 return float128_default_nan(status); 4785 } 4786 aSig = extractFloatx80Frac( a ); 4787 aExp = extractFloatx80Exp( a ); 4788 aSign = extractFloatx80Sign( a ); 4789 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4790 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4791 } 4792 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4793 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4794 4795 } 4796 4797 /*---------------------------------------------------------------------------- 4798 | Rounds the extended double-precision floating-point value `a' 4799 | to the precision provided by floatx80_rounding_precision and returns the 4800 | result as an extended double-precision floating-point value. 4801 | The operation is performed according to the IEC/IEEE Standard for Binary 4802 | Floating-Point Arithmetic. 4803 *----------------------------------------------------------------------------*/ 4804 4805 floatx80 floatx80_round(floatx80 a, float_status *status) 4806 { 4807 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4808 extractFloatx80Sign(a), 4809 extractFloatx80Exp(a), 4810 extractFloatx80Frac(a), 0, status); 4811 } 4812 4813 /*---------------------------------------------------------------------------- 4814 | Rounds the extended double-precision floating-point value `a' to an integer, 4815 | and returns the result as an extended quadruple-precision floating-point 4816 | value. The operation is performed according to the IEC/IEEE Standard for 4817 | Binary Floating-Point Arithmetic. 4818 *----------------------------------------------------------------------------*/ 4819 4820 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4821 { 4822 flag aSign; 4823 int32_t aExp; 4824 uint64_t lastBitMask, roundBitsMask; 4825 floatx80 z; 4826 4827 if (floatx80_invalid_encoding(a)) { 4828 float_raise(float_flag_invalid, status); 4829 return floatx80_default_nan(status); 4830 } 4831 aExp = extractFloatx80Exp( a ); 4832 if ( 0x403E <= aExp ) { 4833 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4834 return propagateFloatx80NaN(a, a, status); 4835 } 4836 return a; 4837 } 4838 if ( aExp < 0x3FFF ) { 4839 if ( ( aExp == 0 ) 4840 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4841 return a; 4842 } 4843 status->float_exception_flags |= float_flag_inexact; 4844 aSign = extractFloatx80Sign( a ); 4845 switch (status->float_rounding_mode) { 4846 case float_round_nearest_even: 4847 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4848 ) { 4849 return 4850 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4851 } 4852 break; 4853 case float_round_ties_away: 4854 if (aExp == 0x3FFE) { 4855 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4856 } 4857 break; 4858 case float_round_down: 4859 return 4860 aSign ? 4861 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4862 : packFloatx80( 0, 0, 0 ); 4863 case float_round_up: 4864 return 4865 aSign ? packFloatx80( 1, 0, 0 ) 4866 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4867 } 4868 return packFloatx80( aSign, 0, 0 ); 4869 } 4870 lastBitMask = 1; 4871 lastBitMask <<= 0x403E - aExp; 4872 roundBitsMask = lastBitMask - 1; 4873 z = a; 4874 switch (status->float_rounding_mode) { 4875 case float_round_nearest_even: 4876 z.low += lastBitMask>>1; 4877 if ((z.low & roundBitsMask) == 0) { 4878 z.low &= ~lastBitMask; 4879 } 4880 break; 4881 case float_round_ties_away: 4882 z.low += lastBitMask >> 1; 4883 break; 4884 case float_round_to_zero: 4885 break; 4886 case float_round_up: 4887 if (!extractFloatx80Sign(z)) { 4888 z.low += roundBitsMask; 4889 } 4890 break; 4891 case float_round_down: 4892 if (extractFloatx80Sign(z)) { 4893 z.low += roundBitsMask; 4894 } 4895 break; 4896 default: 4897 abort(); 4898 } 4899 z.low &= ~ roundBitsMask; 4900 if ( z.low == 0 ) { 4901 ++z.high; 4902 z.low = LIT64( 0x8000000000000000 ); 4903 } 4904 if (z.low != a.low) { 4905 status->float_exception_flags |= float_flag_inexact; 4906 } 4907 return z; 4908 4909 } 4910 4911 /*---------------------------------------------------------------------------- 4912 | Returns the result of adding the absolute values of the extended double- 4913 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4914 | negated before being returned. `zSign' is ignored if the result is a NaN. 4915 | The addition is performed according to the IEC/IEEE Standard for Binary 4916 | Floating-Point Arithmetic. 4917 *----------------------------------------------------------------------------*/ 4918 4919 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4920 float_status *status) 4921 { 4922 int32_t aExp, bExp, zExp; 4923 uint64_t aSig, bSig, zSig0, zSig1; 4924 int32_t expDiff; 4925 4926 aSig = extractFloatx80Frac( a ); 4927 aExp = extractFloatx80Exp( a ); 4928 bSig = extractFloatx80Frac( b ); 4929 bExp = extractFloatx80Exp( b ); 4930 expDiff = aExp - bExp; 4931 if ( 0 < expDiff ) { 4932 if ( aExp == 0x7FFF ) { 4933 if ((uint64_t)(aSig << 1)) { 4934 return propagateFloatx80NaN(a, b, status); 4935 } 4936 return a; 4937 } 4938 if ( bExp == 0 ) --expDiff; 4939 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4940 zExp = aExp; 4941 } 4942 else if ( expDiff < 0 ) { 4943 if ( bExp == 0x7FFF ) { 4944 if ((uint64_t)(bSig << 1)) { 4945 return propagateFloatx80NaN(a, b, status); 4946 } 4947 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4948 } 4949 if ( aExp == 0 ) ++expDiff; 4950 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4951 zExp = bExp; 4952 } 4953 else { 4954 if ( aExp == 0x7FFF ) { 4955 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4956 return propagateFloatx80NaN(a, b, status); 4957 } 4958 return a; 4959 } 4960 zSig1 = 0; 4961 zSig0 = aSig + bSig; 4962 if ( aExp == 0 ) { 4963 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4964 goto roundAndPack; 4965 } 4966 zExp = aExp; 4967 goto shiftRight1; 4968 } 4969 zSig0 = aSig + bSig; 4970 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4971 shiftRight1: 4972 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4973 zSig0 |= LIT64( 0x8000000000000000 ); 4974 ++zExp; 4975 roundAndPack: 4976 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4977 zSign, zExp, zSig0, zSig1, status); 4978 } 4979 4980 /*---------------------------------------------------------------------------- 4981 | Returns the result of subtracting the absolute values of the extended 4982 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4983 | difference is negated before being returned. `zSign' is ignored if the 4984 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4985 | Standard for Binary Floating-Point Arithmetic. 4986 *----------------------------------------------------------------------------*/ 4987 4988 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4989 float_status *status) 4990 { 4991 int32_t aExp, bExp, zExp; 4992 uint64_t aSig, bSig, zSig0, zSig1; 4993 int32_t expDiff; 4994 4995 aSig = extractFloatx80Frac( a ); 4996 aExp = extractFloatx80Exp( a ); 4997 bSig = extractFloatx80Frac( b ); 4998 bExp = extractFloatx80Exp( b ); 4999 expDiff = aExp - bExp; 5000 if ( 0 < expDiff ) goto aExpBigger; 5001 if ( expDiff < 0 ) goto bExpBigger; 5002 if ( aExp == 0x7FFF ) { 5003 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 5004 return propagateFloatx80NaN(a, b, status); 5005 } 5006 float_raise(float_flag_invalid, status); 5007 return floatx80_default_nan(status); 5008 } 5009 if ( aExp == 0 ) { 5010 aExp = 1; 5011 bExp = 1; 5012 } 5013 zSig1 = 0; 5014 if ( bSig < aSig ) goto aBigger; 5015 if ( aSig < bSig ) goto bBigger; 5016 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 5017 bExpBigger: 5018 if ( bExp == 0x7FFF ) { 5019 if ((uint64_t)(bSig << 1)) { 5020 return propagateFloatx80NaN(a, b, status); 5021 } 5022 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5023 } 5024 if ( aExp == 0 ) ++expDiff; 5025 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 5026 bBigger: 5027 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 5028 zExp = bExp; 5029 zSign ^= 1; 5030 goto normalizeRoundAndPack; 5031 aExpBigger: 5032 if ( aExp == 0x7FFF ) { 5033 if ((uint64_t)(aSig << 1)) { 5034 return propagateFloatx80NaN(a, b, status); 5035 } 5036 return a; 5037 } 5038 if ( bExp == 0 ) --expDiff; 5039 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 5040 aBigger: 5041 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 5042 zExp = aExp; 5043 normalizeRoundAndPack: 5044 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 5045 zSign, zExp, zSig0, zSig1, status); 5046 } 5047 5048 /*---------------------------------------------------------------------------- 5049 | Returns the result of adding the extended double-precision floating-point 5050 | values `a' and `b'. The operation is performed according to the IEC/IEEE 5051 | Standard for Binary Floating-Point Arithmetic. 5052 *----------------------------------------------------------------------------*/ 5053 5054 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 5055 { 5056 flag aSign, bSign; 5057 5058 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5059 float_raise(float_flag_invalid, status); 5060 return floatx80_default_nan(status); 5061 } 5062 aSign = extractFloatx80Sign( a ); 5063 bSign = extractFloatx80Sign( b ); 5064 if ( aSign == bSign ) { 5065 return addFloatx80Sigs(a, b, aSign, status); 5066 } 5067 else { 5068 return subFloatx80Sigs(a, b, aSign, status); 5069 } 5070 5071 } 5072 5073 /*---------------------------------------------------------------------------- 5074 | Returns the result of subtracting the extended double-precision floating- 5075 | point values `a' and `b'. The operation is performed according to the 5076 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5077 *----------------------------------------------------------------------------*/ 5078 5079 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 5080 { 5081 flag aSign, bSign; 5082 5083 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5084 float_raise(float_flag_invalid, status); 5085 return floatx80_default_nan(status); 5086 } 5087 aSign = extractFloatx80Sign( a ); 5088 bSign = extractFloatx80Sign( b ); 5089 if ( aSign == bSign ) { 5090 return subFloatx80Sigs(a, b, aSign, status); 5091 } 5092 else { 5093 return addFloatx80Sigs(a, b, aSign, status); 5094 } 5095 5096 } 5097 5098 /*---------------------------------------------------------------------------- 5099 | Returns the result of multiplying the extended double-precision floating- 5100 | point values `a' and `b'. The operation is performed according to the 5101 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5102 *----------------------------------------------------------------------------*/ 5103 5104 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5105 { 5106 flag aSign, bSign, zSign; 5107 int32_t aExp, bExp, zExp; 5108 uint64_t aSig, bSig, zSig0, zSig1; 5109 5110 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5111 float_raise(float_flag_invalid, status); 5112 return floatx80_default_nan(status); 5113 } 5114 aSig = extractFloatx80Frac( a ); 5115 aExp = extractFloatx80Exp( a ); 5116 aSign = extractFloatx80Sign( a ); 5117 bSig = extractFloatx80Frac( b ); 5118 bExp = extractFloatx80Exp( b ); 5119 bSign = extractFloatx80Sign( b ); 5120 zSign = aSign ^ bSign; 5121 if ( aExp == 0x7FFF ) { 5122 if ( (uint64_t) ( aSig<<1 ) 5123 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5124 return propagateFloatx80NaN(a, b, status); 5125 } 5126 if ( ( bExp | bSig ) == 0 ) goto invalid; 5127 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5128 } 5129 if ( bExp == 0x7FFF ) { 5130 if ((uint64_t)(bSig << 1)) { 5131 return propagateFloatx80NaN(a, b, status); 5132 } 5133 if ( ( aExp | aSig ) == 0 ) { 5134 invalid: 5135 float_raise(float_flag_invalid, status); 5136 return floatx80_default_nan(status); 5137 } 5138 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5139 } 5140 if ( aExp == 0 ) { 5141 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5142 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5143 } 5144 if ( bExp == 0 ) { 5145 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5146 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5147 } 5148 zExp = aExp + bExp - 0x3FFE; 5149 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5150 if ( 0 < (int64_t) zSig0 ) { 5151 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5152 --zExp; 5153 } 5154 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5155 zSign, zExp, zSig0, zSig1, status); 5156 } 5157 5158 /*---------------------------------------------------------------------------- 5159 | Returns the result of dividing the extended double-precision floating-point 5160 | value `a' by the corresponding value `b'. The operation is performed 5161 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5162 *----------------------------------------------------------------------------*/ 5163 5164 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5165 { 5166 flag aSign, bSign, zSign; 5167 int32_t aExp, bExp, zExp; 5168 uint64_t aSig, bSig, zSig0, zSig1; 5169 uint64_t rem0, rem1, rem2, term0, term1, term2; 5170 5171 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5172 float_raise(float_flag_invalid, status); 5173 return floatx80_default_nan(status); 5174 } 5175 aSig = extractFloatx80Frac( a ); 5176 aExp = extractFloatx80Exp( a ); 5177 aSign = extractFloatx80Sign( a ); 5178 bSig = extractFloatx80Frac( b ); 5179 bExp = extractFloatx80Exp( b ); 5180 bSign = extractFloatx80Sign( b ); 5181 zSign = aSign ^ bSign; 5182 if ( aExp == 0x7FFF ) { 5183 if ((uint64_t)(aSig << 1)) { 5184 return propagateFloatx80NaN(a, b, status); 5185 } 5186 if ( bExp == 0x7FFF ) { 5187 if ((uint64_t)(bSig << 1)) { 5188 return propagateFloatx80NaN(a, b, status); 5189 } 5190 goto invalid; 5191 } 5192 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5193 } 5194 if ( bExp == 0x7FFF ) { 5195 if ((uint64_t)(bSig << 1)) { 5196 return propagateFloatx80NaN(a, b, status); 5197 } 5198 return packFloatx80( zSign, 0, 0 ); 5199 } 5200 if ( bExp == 0 ) { 5201 if ( bSig == 0 ) { 5202 if ( ( aExp | aSig ) == 0 ) { 5203 invalid: 5204 float_raise(float_flag_invalid, status); 5205 return floatx80_default_nan(status); 5206 } 5207 float_raise(float_flag_divbyzero, status); 5208 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5209 } 5210 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5211 } 5212 if ( aExp == 0 ) { 5213 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5214 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5215 } 5216 zExp = aExp - bExp + 0x3FFE; 5217 rem1 = 0; 5218 if ( bSig <= aSig ) { 5219 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5220 ++zExp; 5221 } 5222 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5223 mul64To128( bSig, zSig0, &term0, &term1 ); 5224 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5225 while ( (int64_t) rem0 < 0 ) { 5226 --zSig0; 5227 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5228 } 5229 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5230 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5231 mul64To128( bSig, zSig1, &term1, &term2 ); 5232 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5233 while ( (int64_t) rem1 < 0 ) { 5234 --zSig1; 5235 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5236 } 5237 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5238 } 5239 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5240 zSign, zExp, zSig0, zSig1, status); 5241 } 5242 5243 /*---------------------------------------------------------------------------- 5244 | Returns the remainder of the extended double-precision floating-point value 5245 | `a' with respect to the corresponding value `b'. The operation is performed 5246 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5247 *----------------------------------------------------------------------------*/ 5248 5249 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5250 { 5251 flag aSign, zSign; 5252 int32_t aExp, bExp, expDiff; 5253 uint64_t aSig0, aSig1, bSig; 5254 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5255 5256 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5257 float_raise(float_flag_invalid, status); 5258 return floatx80_default_nan(status); 5259 } 5260 aSig0 = extractFloatx80Frac( a ); 5261 aExp = extractFloatx80Exp( a ); 5262 aSign = extractFloatx80Sign( a ); 5263 bSig = extractFloatx80Frac( b ); 5264 bExp = extractFloatx80Exp( b ); 5265 if ( aExp == 0x7FFF ) { 5266 if ( (uint64_t) ( aSig0<<1 ) 5267 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5268 return propagateFloatx80NaN(a, b, status); 5269 } 5270 goto invalid; 5271 } 5272 if ( bExp == 0x7FFF ) { 5273 if ((uint64_t)(bSig << 1)) { 5274 return propagateFloatx80NaN(a, b, status); 5275 } 5276 return a; 5277 } 5278 if ( bExp == 0 ) { 5279 if ( bSig == 0 ) { 5280 invalid: 5281 float_raise(float_flag_invalid, status); 5282 return floatx80_default_nan(status); 5283 } 5284 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5285 } 5286 if ( aExp == 0 ) { 5287 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5288 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5289 } 5290 bSig |= LIT64( 0x8000000000000000 ); 5291 zSign = aSign; 5292 expDiff = aExp - bExp; 5293 aSig1 = 0; 5294 if ( expDiff < 0 ) { 5295 if ( expDiff < -1 ) return a; 5296 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5297 expDiff = 0; 5298 } 5299 q = ( bSig <= aSig0 ); 5300 if ( q ) aSig0 -= bSig; 5301 expDiff -= 64; 5302 while ( 0 < expDiff ) { 5303 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5304 q = ( 2 < q ) ? q - 2 : 0; 5305 mul64To128( bSig, q, &term0, &term1 ); 5306 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5307 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5308 expDiff -= 62; 5309 } 5310 expDiff += 64; 5311 if ( 0 < expDiff ) { 5312 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5313 q = ( 2 < q ) ? q - 2 : 0; 5314 q >>= 64 - expDiff; 5315 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5316 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5317 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5318 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5319 ++q; 5320 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5321 } 5322 } 5323 else { 5324 term1 = 0; 5325 term0 = bSig; 5326 } 5327 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5328 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5329 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5330 && ( q & 1 ) ) 5331 ) { 5332 aSig0 = alternateASig0; 5333 aSig1 = alternateASig1; 5334 zSign = ! zSign; 5335 } 5336 return 5337 normalizeRoundAndPackFloatx80( 5338 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5339 5340 } 5341 5342 /*---------------------------------------------------------------------------- 5343 | Returns the square root of the extended double-precision floating-point 5344 | value `a'. The operation is performed according to the IEC/IEEE Standard 5345 | for Binary Floating-Point Arithmetic. 5346 *----------------------------------------------------------------------------*/ 5347 5348 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5349 { 5350 flag aSign; 5351 int32_t aExp, zExp; 5352 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5353 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5354 5355 if (floatx80_invalid_encoding(a)) { 5356 float_raise(float_flag_invalid, status); 5357 return floatx80_default_nan(status); 5358 } 5359 aSig0 = extractFloatx80Frac( a ); 5360 aExp = extractFloatx80Exp( a ); 5361 aSign = extractFloatx80Sign( a ); 5362 if ( aExp == 0x7FFF ) { 5363 if ((uint64_t)(aSig0 << 1)) { 5364 return propagateFloatx80NaN(a, a, status); 5365 } 5366 if ( ! aSign ) return a; 5367 goto invalid; 5368 } 5369 if ( aSign ) { 5370 if ( ( aExp | aSig0 ) == 0 ) return a; 5371 invalid: 5372 float_raise(float_flag_invalid, status); 5373 return floatx80_default_nan(status); 5374 } 5375 if ( aExp == 0 ) { 5376 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5377 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5378 } 5379 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5380 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5381 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5382 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5383 doubleZSig0 = zSig0<<1; 5384 mul64To128( zSig0, zSig0, &term0, &term1 ); 5385 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5386 while ( (int64_t) rem0 < 0 ) { 5387 --zSig0; 5388 doubleZSig0 -= 2; 5389 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5390 } 5391 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5392 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5393 if ( zSig1 == 0 ) zSig1 = 1; 5394 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5395 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5396 mul64To128( zSig1, zSig1, &term2, &term3 ); 5397 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5398 while ( (int64_t) rem1 < 0 ) { 5399 --zSig1; 5400 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5401 term3 |= 1; 5402 term2 |= doubleZSig0; 5403 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5404 } 5405 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5406 } 5407 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5408 zSig0 |= doubleZSig0; 5409 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5410 0, zExp, zSig0, zSig1, status); 5411 } 5412 5413 /*---------------------------------------------------------------------------- 5414 | Returns 1 if the extended double-precision floating-point value `a' is equal 5415 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5416 | raised if either operand is a NaN. Otherwise, the comparison is performed 5417 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5418 *----------------------------------------------------------------------------*/ 5419 5420 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5421 { 5422 5423 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5424 || (extractFloatx80Exp(a) == 0x7FFF 5425 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5426 || (extractFloatx80Exp(b) == 0x7FFF 5427 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5428 ) { 5429 float_raise(float_flag_invalid, status); 5430 return 0; 5431 } 5432 return 5433 ( a.low == b.low ) 5434 && ( ( a.high == b.high ) 5435 || ( ( a.low == 0 ) 5436 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5437 ); 5438 5439 } 5440 5441 /*---------------------------------------------------------------------------- 5442 | Returns 1 if the extended double-precision floating-point value `a' is 5443 | less than or equal to the corresponding value `b', and 0 otherwise. The 5444 | invalid exception is raised if either operand is a NaN. The comparison is 5445 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5446 | Arithmetic. 5447 *----------------------------------------------------------------------------*/ 5448 5449 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5450 { 5451 flag aSign, bSign; 5452 5453 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5454 || (extractFloatx80Exp(a) == 0x7FFF 5455 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5456 || (extractFloatx80Exp(b) == 0x7FFF 5457 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5458 ) { 5459 float_raise(float_flag_invalid, status); 5460 return 0; 5461 } 5462 aSign = extractFloatx80Sign( a ); 5463 bSign = extractFloatx80Sign( b ); 5464 if ( aSign != bSign ) { 5465 return 5466 aSign 5467 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5468 == 0 ); 5469 } 5470 return 5471 aSign ? le128( b.high, b.low, a.high, a.low ) 5472 : le128( a.high, a.low, b.high, b.low ); 5473 5474 } 5475 5476 /*---------------------------------------------------------------------------- 5477 | Returns 1 if the extended double-precision floating-point value `a' is 5478 | less than the corresponding value `b', and 0 otherwise. The invalid 5479 | exception is raised if either operand is a NaN. The comparison is performed 5480 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5481 *----------------------------------------------------------------------------*/ 5482 5483 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5484 { 5485 flag aSign, bSign; 5486 5487 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5488 || (extractFloatx80Exp(a) == 0x7FFF 5489 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5490 || (extractFloatx80Exp(b) == 0x7FFF 5491 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5492 ) { 5493 float_raise(float_flag_invalid, status); 5494 return 0; 5495 } 5496 aSign = extractFloatx80Sign( a ); 5497 bSign = extractFloatx80Sign( b ); 5498 if ( aSign != bSign ) { 5499 return 5500 aSign 5501 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5502 != 0 ); 5503 } 5504 return 5505 aSign ? lt128( b.high, b.low, a.high, a.low ) 5506 : lt128( a.high, a.low, b.high, b.low ); 5507 5508 } 5509 5510 /*---------------------------------------------------------------------------- 5511 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5512 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5513 | either operand is a NaN. The comparison is performed according to the 5514 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5515 *----------------------------------------------------------------------------*/ 5516 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5517 { 5518 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5519 || (extractFloatx80Exp(a) == 0x7FFF 5520 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5521 || (extractFloatx80Exp(b) == 0x7FFF 5522 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5523 ) { 5524 float_raise(float_flag_invalid, status); 5525 return 1; 5526 } 5527 return 0; 5528 } 5529 5530 /*---------------------------------------------------------------------------- 5531 | Returns 1 if the extended double-precision floating-point value `a' is 5532 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5533 | cause an exception. The comparison is performed according to the IEC/IEEE 5534 | Standard for Binary Floating-Point Arithmetic. 5535 *----------------------------------------------------------------------------*/ 5536 5537 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5538 { 5539 5540 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5541 float_raise(float_flag_invalid, status); 5542 return 0; 5543 } 5544 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5545 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5546 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5547 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5548 ) { 5549 if (floatx80_is_signaling_nan(a, status) 5550 || floatx80_is_signaling_nan(b, status)) { 5551 float_raise(float_flag_invalid, status); 5552 } 5553 return 0; 5554 } 5555 return 5556 ( a.low == b.low ) 5557 && ( ( a.high == b.high ) 5558 || ( ( a.low == 0 ) 5559 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5560 ); 5561 5562 } 5563 5564 /*---------------------------------------------------------------------------- 5565 | Returns 1 if the extended double-precision floating-point value `a' is less 5566 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5567 | do not cause an exception. Otherwise, the comparison is performed according 5568 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5569 *----------------------------------------------------------------------------*/ 5570 5571 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5572 { 5573 flag aSign, bSign; 5574 5575 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5576 float_raise(float_flag_invalid, status); 5577 return 0; 5578 } 5579 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5580 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5581 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5582 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5583 ) { 5584 if (floatx80_is_signaling_nan(a, status) 5585 || floatx80_is_signaling_nan(b, status)) { 5586 float_raise(float_flag_invalid, status); 5587 } 5588 return 0; 5589 } 5590 aSign = extractFloatx80Sign( a ); 5591 bSign = extractFloatx80Sign( b ); 5592 if ( aSign != bSign ) { 5593 return 5594 aSign 5595 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5596 == 0 ); 5597 } 5598 return 5599 aSign ? le128( b.high, b.low, a.high, a.low ) 5600 : le128( a.high, a.low, b.high, b.low ); 5601 5602 } 5603 5604 /*---------------------------------------------------------------------------- 5605 | Returns 1 if the extended double-precision floating-point value `a' is less 5606 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5607 | an exception. Otherwise, the comparison is performed according to the 5608 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5609 *----------------------------------------------------------------------------*/ 5610 5611 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5612 { 5613 flag aSign, bSign; 5614 5615 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5616 float_raise(float_flag_invalid, status); 5617 return 0; 5618 } 5619 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5620 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5621 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5622 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5623 ) { 5624 if (floatx80_is_signaling_nan(a, status) 5625 || floatx80_is_signaling_nan(b, status)) { 5626 float_raise(float_flag_invalid, status); 5627 } 5628 return 0; 5629 } 5630 aSign = extractFloatx80Sign( a ); 5631 bSign = extractFloatx80Sign( b ); 5632 if ( aSign != bSign ) { 5633 return 5634 aSign 5635 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5636 != 0 ); 5637 } 5638 return 5639 aSign ? lt128( b.high, b.low, a.high, a.low ) 5640 : lt128( a.high, a.low, b.high, b.low ); 5641 5642 } 5643 5644 /*---------------------------------------------------------------------------- 5645 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5646 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5647 | The comparison is performed according to the IEC/IEEE Standard for Binary 5648 | Floating-Point Arithmetic. 5649 *----------------------------------------------------------------------------*/ 5650 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5651 { 5652 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5653 float_raise(float_flag_invalid, status); 5654 return 1; 5655 } 5656 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5657 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5658 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5659 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5660 ) { 5661 if (floatx80_is_signaling_nan(a, status) 5662 || floatx80_is_signaling_nan(b, status)) { 5663 float_raise(float_flag_invalid, status); 5664 } 5665 return 1; 5666 } 5667 return 0; 5668 } 5669 5670 /*---------------------------------------------------------------------------- 5671 | Returns the result of converting the quadruple-precision floating-point 5672 | value `a' to the 32-bit two's complement integer format. The conversion 5673 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5674 | Arithmetic---which means in particular that the conversion is rounded 5675 | according to the current rounding mode. If `a' is a NaN, the largest 5676 | positive integer is returned. Otherwise, if the conversion overflows, the 5677 | largest integer with the same sign as `a' is returned. 5678 *----------------------------------------------------------------------------*/ 5679 5680 int32_t float128_to_int32(float128 a, float_status *status) 5681 { 5682 flag aSign; 5683 int32_t aExp, shiftCount; 5684 uint64_t aSig0, aSig1; 5685 5686 aSig1 = extractFloat128Frac1( a ); 5687 aSig0 = extractFloat128Frac0( a ); 5688 aExp = extractFloat128Exp( a ); 5689 aSign = extractFloat128Sign( a ); 5690 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5691 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5692 aSig0 |= ( aSig1 != 0 ); 5693 shiftCount = 0x4028 - aExp; 5694 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5695 return roundAndPackInt32(aSign, aSig0, status); 5696 5697 } 5698 5699 /*---------------------------------------------------------------------------- 5700 | Returns the result of converting the quadruple-precision floating-point 5701 | value `a' to the 32-bit two's complement integer format. The conversion 5702 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5703 | Arithmetic, except that the conversion is always rounded toward zero. If 5704 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5705 | conversion overflows, the largest integer with the same sign as `a' is 5706 | returned. 5707 *----------------------------------------------------------------------------*/ 5708 5709 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5710 { 5711 flag aSign; 5712 int32_t aExp, shiftCount; 5713 uint64_t aSig0, aSig1, savedASig; 5714 int32_t z; 5715 5716 aSig1 = extractFloat128Frac1( a ); 5717 aSig0 = extractFloat128Frac0( a ); 5718 aExp = extractFloat128Exp( a ); 5719 aSign = extractFloat128Sign( a ); 5720 aSig0 |= ( aSig1 != 0 ); 5721 if ( 0x401E < aExp ) { 5722 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5723 goto invalid; 5724 } 5725 else if ( aExp < 0x3FFF ) { 5726 if (aExp || aSig0) { 5727 status->float_exception_flags |= float_flag_inexact; 5728 } 5729 return 0; 5730 } 5731 aSig0 |= LIT64( 0x0001000000000000 ); 5732 shiftCount = 0x402F - aExp; 5733 savedASig = aSig0; 5734 aSig0 >>= shiftCount; 5735 z = aSig0; 5736 if ( aSign ) z = - z; 5737 if ( ( z < 0 ) ^ aSign ) { 5738 invalid: 5739 float_raise(float_flag_invalid, status); 5740 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5741 } 5742 if ( ( aSig0<<shiftCount ) != savedASig ) { 5743 status->float_exception_flags |= float_flag_inexact; 5744 } 5745 return z; 5746 5747 } 5748 5749 /*---------------------------------------------------------------------------- 5750 | Returns the result of converting the quadruple-precision floating-point 5751 | value `a' to the 64-bit two's complement integer format. The conversion 5752 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5753 | Arithmetic---which means in particular that the conversion is rounded 5754 | according to the current rounding mode. If `a' is a NaN, the largest 5755 | positive integer is returned. Otherwise, if the conversion overflows, the 5756 | largest integer with the same sign as `a' is returned. 5757 *----------------------------------------------------------------------------*/ 5758 5759 int64_t float128_to_int64(float128 a, float_status *status) 5760 { 5761 flag aSign; 5762 int32_t aExp, shiftCount; 5763 uint64_t aSig0, aSig1; 5764 5765 aSig1 = extractFloat128Frac1( a ); 5766 aSig0 = extractFloat128Frac0( a ); 5767 aExp = extractFloat128Exp( a ); 5768 aSign = extractFloat128Sign( a ); 5769 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5770 shiftCount = 0x402F - aExp; 5771 if ( shiftCount <= 0 ) { 5772 if ( 0x403E < aExp ) { 5773 float_raise(float_flag_invalid, status); 5774 if ( ! aSign 5775 || ( ( aExp == 0x7FFF ) 5776 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5777 ) 5778 ) { 5779 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5780 } 5781 return (int64_t) LIT64( 0x8000000000000000 ); 5782 } 5783 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5784 } 5785 else { 5786 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5787 } 5788 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5789 5790 } 5791 5792 /*---------------------------------------------------------------------------- 5793 | Returns the result of converting the quadruple-precision floating-point 5794 | value `a' to the 64-bit two's complement integer format. The conversion 5795 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5796 | Arithmetic, except that the conversion is always rounded toward zero. 5797 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5798 | the conversion overflows, the largest integer with the same sign as `a' is 5799 | returned. 5800 *----------------------------------------------------------------------------*/ 5801 5802 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5803 { 5804 flag aSign; 5805 int32_t aExp, shiftCount; 5806 uint64_t aSig0, aSig1; 5807 int64_t z; 5808 5809 aSig1 = extractFloat128Frac1( a ); 5810 aSig0 = extractFloat128Frac0( a ); 5811 aExp = extractFloat128Exp( a ); 5812 aSign = extractFloat128Sign( a ); 5813 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5814 shiftCount = aExp - 0x402F; 5815 if ( 0 < shiftCount ) { 5816 if ( 0x403E <= aExp ) { 5817 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5818 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5819 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5820 if (aSig1) { 5821 status->float_exception_flags |= float_flag_inexact; 5822 } 5823 } 5824 else { 5825 float_raise(float_flag_invalid, status); 5826 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5827 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5828 } 5829 } 5830 return (int64_t) LIT64( 0x8000000000000000 ); 5831 } 5832 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5833 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5834 status->float_exception_flags |= float_flag_inexact; 5835 } 5836 } 5837 else { 5838 if ( aExp < 0x3FFF ) { 5839 if ( aExp | aSig0 | aSig1 ) { 5840 status->float_exception_flags |= float_flag_inexact; 5841 } 5842 return 0; 5843 } 5844 z = aSig0>>( - shiftCount ); 5845 if ( aSig1 5846 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5847 status->float_exception_flags |= float_flag_inexact; 5848 } 5849 } 5850 if ( aSign ) z = - z; 5851 return z; 5852 5853 } 5854 5855 /*---------------------------------------------------------------------------- 5856 | Returns the result of converting the quadruple-precision floating-point value 5857 | `a' to the 64-bit unsigned integer format. The conversion is 5858 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5859 | Arithmetic---which means in particular that the conversion is rounded 5860 | according to the current rounding mode. If `a' is a NaN, the largest 5861 | positive integer is returned. If the conversion overflows, the 5862 | largest unsigned integer is returned. If 'a' is negative, the value is 5863 | rounded and zero is returned; negative values that do not round to zero 5864 | will raise the inexact exception. 5865 *----------------------------------------------------------------------------*/ 5866 5867 uint64_t float128_to_uint64(float128 a, float_status *status) 5868 { 5869 flag aSign; 5870 int aExp; 5871 int shiftCount; 5872 uint64_t aSig0, aSig1; 5873 5874 aSig0 = extractFloat128Frac0(a); 5875 aSig1 = extractFloat128Frac1(a); 5876 aExp = extractFloat128Exp(a); 5877 aSign = extractFloat128Sign(a); 5878 if (aSign && (aExp > 0x3FFE)) { 5879 float_raise(float_flag_invalid, status); 5880 if (float128_is_any_nan(a)) { 5881 return LIT64(0xFFFFFFFFFFFFFFFF); 5882 } else { 5883 return 0; 5884 } 5885 } 5886 if (aExp) { 5887 aSig0 |= LIT64(0x0001000000000000); 5888 } 5889 shiftCount = 0x402F - aExp; 5890 if (shiftCount <= 0) { 5891 if (0x403E < aExp) { 5892 float_raise(float_flag_invalid, status); 5893 return LIT64(0xFFFFFFFFFFFFFFFF); 5894 } 5895 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5896 } else { 5897 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5898 } 5899 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5900 } 5901 5902 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5903 { 5904 uint64_t v; 5905 signed char current_rounding_mode = status->float_rounding_mode; 5906 5907 set_float_rounding_mode(float_round_to_zero, status); 5908 v = float128_to_uint64(a, status); 5909 set_float_rounding_mode(current_rounding_mode, status); 5910 5911 return v; 5912 } 5913 5914 /*---------------------------------------------------------------------------- 5915 | Returns the result of converting the quadruple-precision floating-point 5916 | value `a' to the 32-bit unsigned integer format. The conversion 5917 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5918 | Arithmetic except that the conversion is always rounded toward zero. 5919 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5920 | if the conversion overflows, the largest unsigned integer is returned. 5921 | If 'a' is negative, the value is rounded and zero is returned; negative 5922 | values that do not round to zero will raise the inexact exception. 5923 *----------------------------------------------------------------------------*/ 5924 5925 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5926 { 5927 uint64_t v; 5928 uint32_t res; 5929 int old_exc_flags = get_float_exception_flags(status); 5930 5931 v = float128_to_uint64_round_to_zero(a, status); 5932 if (v > 0xffffffff) { 5933 res = 0xffffffff; 5934 } else { 5935 return v; 5936 } 5937 set_float_exception_flags(old_exc_flags, status); 5938 float_raise(float_flag_invalid, status); 5939 return res; 5940 } 5941 5942 /*---------------------------------------------------------------------------- 5943 | Returns the result of converting the quadruple-precision floating-point 5944 | value `a' to the single-precision floating-point format. The conversion 5945 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5946 | Arithmetic. 5947 *----------------------------------------------------------------------------*/ 5948 5949 float32 float128_to_float32(float128 a, float_status *status) 5950 { 5951 flag aSign; 5952 int32_t aExp; 5953 uint64_t aSig0, aSig1; 5954 uint32_t zSig; 5955 5956 aSig1 = extractFloat128Frac1( a ); 5957 aSig0 = extractFloat128Frac0( a ); 5958 aExp = extractFloat128Exp( a ); 5959 aSign = extractFloat128Sign( a ); 5960 if ( aExp == 0x7FFF ) { 5961 if ( aSig0 | aSig1 ) { 5962 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5963 } 5964 return packFloat32( aSign, 0xFF, 0 ); 5965 } 5966 aSig0 |= ( aSig1 != 0 ); 5967 shift64RightJamming( aSig0, 18, &aSig0 ); 5968 zSig = aSig0; 5969 if ( aExp || zSig ) { 5970 zSig |= 0x40000000; 5971 aExp -= 0x3F81; 5972 } 5973 return roundAndPackFloat32(aSign, aExp, zSig, status); 5974 5975 } 5976 5977 /*---------------------------------------------------------------------------- 5978 | Returns the result of converting the quadruple-precision floating-point 5979 | value `a' to the double-precision floating-point format. The conversion 5980 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5981 | Arithmetic. 5982 *----------------------------------------------------------------------------*/ 5983 5984 float64 float128_to_float64(float128 a, float_status *status) 5985 { 5986 flag aSign; 5987 int32_t aExp; 5988 uint64_t aSig0, aSig1; 5989 5990 aSig1 = extractFloat128Frac1( a ); 5991 aSig0 = extractFloat128Frac0( a ); 5992 aExp = extractFloat128Exp( a ); 5993 aSign = extractFloat128Sign( a ); 5994 if ( aExp == 0x7FFF ) { 5995 if ( aSig0 | aSig1 ) { 5996 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5997 } 5998 return packFloat64( aSign, 0x7FF, 0 ); 5999 } 6000 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6001 aSig0 |= ( aSig1 != 0 ); 6002 if ( aExp || aSig0 ) { 6003 aSig0 |= LIT64( 0x4000000000000000 ); 6004 aExp -= 0x3C01; 6005 } 6006 return roundAndPackFloat64(aSign, aExp, aSig0, status); 6007 6008 } 6009 6010 /*---------------------------------------------------------------------------- 6011 | Returns the result of converting the quadruple-precision floating-point 6012 | value `a' to the extended double-precision floating-point format. The 6013 | conversion is performed according to the IEC/IEEE Standard for Binary 6014 | Floating-Point Arithmetic. 6015 *----------------------------------------------------------------------------*/ 6016 6017 floatx80 float128_to_floatx80(float128 a, float_status *status) 6018 { 6019 flag aSign; 6020 int32_t aExp; 6021 uint64_t aSig0, aSig1; 6022 6023 aSig1 = extractFloat128Frac1( a ); 6024 aSig0 = extractFloat128Frac0( a ); 6025 aExp = extractFloat128Exp( a ); 6026 aSign = extractFloat128Sign( a ); 6027 if ( aExp == 0x7FFF ) { 6028 if ( aSig0 | aSig1 ) { 6029 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 6030 } 6031 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 6032 } 6033 if ( aExp == 0 ) { 6034 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 6035 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6036 } 6037 else { 6038 aSig0 |= LIT64( 0x0001000000000000 ); 6039 } 6040 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 6041 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 6042 6043 } 6044 6045 /*---------------------------------------------------------------------------- 6046 | Rounds the quadruple-precision floating-point value `a' to an integer, and 6047 | returns the result as a quadruple-precision floating-point value. The 6048 | operation is performed according to the IEC/IEEE Standard for Binary 6049 | Floating-Point Arithmetic. 6050 *----------------------------------------------------------------------------*/ 6051 6052 float128 float128_round_to_int(float128 a, float_status *status) 6053 { 6054 flag aSign; 6055 int32_t aExp; 6056 uint64_t lastBitMask, roundBitsMask; 6057 float128 z; 6058 6059 aExp = extractFloat128Exp( a ); 6060 if ( 0x402F <= aExp ) { 6061 if ( 0x406F <= aExp ) { 6062 if ( ( aExp == 0x7FFF ) 6063 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 6064 ) { 6065 return propagateFloat128NaN(a, a, status); 6066 } 6067 return a; 6068 } 6069 lastBitMask = 1; 6070 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 6071 roundBitsMask = lastBitMask - 1; 6072 z = a; 6073 switch (status->float_rounding_mode) { 6074 case float_round_nearest_even: 6075 if ( lastBitMask ) { 6076 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 6077 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 6078 } 6079 else { 6080 if ( (int64_t) z.low < 0 ) { 6081 ++z.high; 6082 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 6083 } 6084 } 6085 break; 6086 case float_round_ties_away: 6087 if (lastBitMask) { 6088 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6089 } else { 6090 if ((int64_t) z.low < 0) { 6091 ++z.high; 6092 } 6093 } 6094 break; 6095 case float_round_to_zero: 6096 break; 6097 case float_round_up: 6098 if (!extractFloat128Sign(z)) { 6099 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6100 } 6101 break; 6102 case float_round_down: 6103 if (extractFloat128Sign(z)) { 6104 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6105 } 6106 break; 6107 default: 6108 abort(); 6109 } 6110 z.low &= ~ roundBitsMask; 6111 } 6112 else { 6113 if ( aExp < 0x3FFF ) { 6114 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6115 status->float_exception_flags |= float_flag_inexact; 6116 aSign = extractFloat128Sign( a ); 6117 switch (status->float_rounding_mode) { 6118 case float_round_nearest_even: 6119 if ( ( aExp == 0x3FFE ) 6120 && ( extractFloat128Frac0( a ) 6121 | extractFloat128Frac1( a ) ) 6122 ) { 6123 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6124 } 6125 break; 6126 case float_round_ties_away: 6127 if (aExp == 0x3FFE) { 6128 return packFloat128(aSign, 0x3FFF, 0, 0); 6129 } 6130 break; 6131 case float_round_down: 6132 return 6133 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6134 : packFloat128( 0, 0, 0, 0 ); 6135 case float_round_up: 6136 return 6137 aSign ? packFloat128( 1, 0, 0, 0 ) 6138 : packFloat128( 0, 0x3FFF, 0, 0 ); 6139 } 6140 return packFloat128( aSign, 0, 0, 0 ); 6141 } 6142 lastBitMask = 1; 6143 lastBitMask <<= 0x402F - aExp; 6144 roundBitsMask = lastBitMask - 1; 6145 z.low = 0; 6146 z.high = a.high; 6147 switch (status->float_rounding_mode) { 6148 case float_round_nearest_even: 6149 z.high += lastBitMask>>1; 6150 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6151 z.high &= ~ lastBitMask; 6152 } 6153 break; 6154 case float_round_ties_away: 6155 z.high += lastBitMask>>1; 6156 break; 6157 case float_round_to_zero: 6158 break; 6159 case float_round_up: 6160 if (!extractFloat128Sign(z)) { 6161 z.high |= ( a.low != 0 ); 6162 z.high += roundBitsMask; 6163 } 6164 break; 6165 case float_round_down: 6166 if (extractFloat128Sign(z)) { 6167 z.high |= (a.low != 0); 6168 z.high += roundBitsMask; 6169 } 6170 break; 6171 default: 6172 abort(); 6173 } 6174 z.high &= ~ roundBitsMask; 6175 } 6176 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6177 status->float_exception_flags |= float_flag_inexact; 6178 } 6179 return z; 6180 6181 } 6182 6183 /*---------------------------------------------------------------------------- 6184 | Returns the result of adding the absolute values of the quadruple-precision 6185 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6186 | before being returned. `zSign' is ignored if the result is a NaN. 6187 | The addition is performed according to the IEC/IEEE Standard for Binary 6188 | Floating-Point Arithmetic. 6189 *----------------------------------------------------------------------------*/ 6190 6191 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6192 float_status *status) 6193 { 6194 int32_t aExp, bExp, zExp; 6195 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6196 int32_t expDiff; 6197 6198 aSig1 = extractFloat128Frac1( a ); 6199 aSig0 = extractFloat128Frac0( a ); 6200 aExp = extractFloat128Exp( a ); 6201 bSig1 = extractFloat128Frac1( b ); 6202 bSig0 = extractFloat128Frac0( b ); 6203 bExp = extractFloat128Exp( b ); 6204 expDiff = aExp - bExp; 6205 if ( 0 < expDiff ) { 6206 if ( aExp == 0x7FFF ) { 6207 if (aSig0 | aSig1) { 6208 return propagateFloat128NaN(a, b, status); 6209 } 6210 return a; 6211 } 6212 if ( bExp == 0 ) { 6213 --expDiff; 6214 } 6215 else { 6216 bSig0 |= LIT64( 0x0001000000000000 ); 6217 } 6218 shift128ExtraRightJamming( 6219 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6220 zExp = aExp; 6221 } 6222 else if ( expDiff < 0 ) { 6223 if ( bExp == 0x7FFF ) { 6224 if (bSig0 | bSig1) { 6225 return propagateFloat128NaN(a, b, status); 6226 } 6227 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6228 } 6229 if ( aExp == 0 ) { 6230 ++expDiff; 6231 } 6232 else { 6233 aSig0 |= LIT64( 0x0001000000000000 ); 6234 } 6235 shift128ExtraRightJamming( 6236 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6237 zExp = bExp; 6238 } 6239 else { 6240 if ( aExp == 0x7FFF ) { 6241 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6242 return propagateFloat128NaN(a, b, status); 6243 } 6244 return a; 6245 } 6246 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6247 if ( aExp == 0 ) { 6248 if (status->flush_to_zero) { 6249 if (zSig0 | zSig1) { 6250 float_raise(float_flag_output_denormal, status); 6251 } 6252 return packFloat128(zSign, 0, 0, 0); 6253 } 6254 return packFloat128( zSign, 0, zSig0, zSig1 ); 6255 } 6256 zSig2 = 0; 6257 zSig0 |= LIT64( 0x0002000000000000 ); 6258 zExp = aExp; 6259 goto shiftRight1; 6260 } 6261 aSig0 |= LIT64( 0x0001000000000000 ); 6262 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6263 --zExp; 6264 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6265 ++zExp; 6266 shiftRight1: 6267 shift128ExtraRightJamming( 6268 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6269 roundAndPack: 6270 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6271 6272 } 6273 6274 /*---------------------------------------------------------------------------- 6275 | Returns the result of subtracting the absolute values of the quadruple- 6276 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6277 | difference is negated before being returned. `zSign' is ignored if the 6278 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6279 | Standard for Binary Floating-Point Arithmetic. 6280 *----------------------------------------------------------------------------*/ 6281 6282 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6283 float_status *status) 6284 { 6285 int32_t aExp, bExp, zExp; 6286 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6287 int32_t expDiff; 6288 6289 aSig1 = extractFloat128Frac1( a ); 6290 aSig0 = extractFloat128Frac0( a ); 6291 aExp = extractFloat128Exp( a ); 6292 bSig1 = extractFloat128Frac1( b ); 6293 bSig0 = extractFloat128Frac0( b ); 6294 bExp = extractFloat128Exp( b ); 6295 expDiff = aExp - bExp; 6296 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6297 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6298 if ( 0 < expDiff ) goto aExpBigger; 6299 if ( expDiff < 0 ) goto bExpBigger; 6300 if ( aExp == 0x7FFF ) { 6301 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6302 return propagateFloat128NaN(a, b, status); 6303 } 6304 float_raise(float_flag_invalid, status); 6305 return float128_default_nan(status); 6306 } 6307 if ( aExp == 0 ) { 6308 aExp = 1; 6309 bExp = 1; 6310 } 6311 if ( bSig0 < aSig0 ) goto aBigger; 6312 if ( aSig0 < bSig0 ) goto bBigger; 6313 if ( bSig1 < aSig1 ) goto aBigger; 6314 if ( aSig1 < bSig1 ) goto bBigger; 6315 return packFloat128(status->float_rounding_mode == float_round_down, 6316 0, 0, 0); 6317 bExpBigger: 6318 if ( bExp == 0x7FFF ) { 6319 if (bSig0 | bSig1) { 6320 return propagateFloat128NaN(a, b, status); 6321 } 6322 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6323 } 6324 if ( aExp == 0 ) { 6325 ++expDiff; 6326 } 6327 else { 6328 aSig0 |= LIT64( 0x4000000000000000 ); 6329 } 6330 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6331 bSig0 |= LIT64( 0x4000000000000000 ); 6332 bBigger: 6333 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6334 zExp = bExp; 6335 zSign ^= 1; 6336 goto normalizeRoundAndPack; 6337 aExpBigger: 6338 if ( aExp == 0x7FFF ) { 6339 if (aSig0 | aSig1) { 6340 return propagateFloat128NaN(a, b, status); 6341 } 6342 return a; 6343 } 6344 if ( bExp == 0 ) { 6345 --expDiff; 6346 } 6347 else { 6348 bSig0 |= LIT64( 0x4000000000000000 ); 6349 } 6350 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6351 aSig0 |= LIT64( 0x4000000000000000 ); 6352 aBigger: 6353 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6354 zExp = aExp; 6355 normalizeRoundAndPack: 6356 --zExp; 6357 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6358 status); 6359 6360 } 6361 6362 /*---------------------------------------------------------------------------- 6363 | Returns the result of adding the quadruple-precision floating-point values 6364 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6365 | for Binary Floating-Point Arithmetic. 6366 *----------------------------------------------------------------------------*/ 6367 6368 float128 float128_add(float128 a, float128 b, float_status *status) 6369 { 6370 flag aSign, bSign; 6371 6372 aSign = extractFloat128Sign( a ); 6373 bSign = extractFloat128Sign( b ); 6374 if ( aSign == bSign ) { 6375 return addFloat128Sigs(a, b, aSign, status); 6376 } 6377 else { 6378 return subFloat128Sigs(a, b, aSign, status); 6379 } 6380 6381 } 6382 6383 /*---------------------------------------------------------------------------- 6384 | Returns the result of subtracting the quadruple-precision floating-point 6385 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6386 | Standard for Binary Floating-Point Arithmetic. 6387 *----------------------------------------------------------------------------*/ 6388 6389 float128 float128_sub(float128 a, float128 b, float_status *status) 6390 { 6391 flag aSign, bSign; 6392 6393 aSign = extractFloat128Sign( a ); 6394 bSign = extractFloat128Sign( b ); 6395 if ( aSign == bSign ) { 6396 return subFloat128Sigs(a, b, aSign, status); 6397 } 6398 else { 6399 return addFloat128Sigs(a, b, aSign, status); 6400 } 6401 6402 } 6403 6404 /*---------------------------------------------------------------------------- 6405 | Returns the result of multiplying the quadruple-precision floating-point 6406 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6407 | Standard for Binary Floating-Point Arithmetic. 6408 *----------------------------------------------------------------------------*/ 6409 6410 float128 float128_mul(float128 a, float128 b, float_status *status) 6411 { 6412 flag aSign, bSign, zSign; 6413 int32_t aExp, bExp, zExp; 6414 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6415 6416 aSig1 = extractFloat128Frac1( a ); 6417 aSig0 = extractFloat128Frac0( a ); 6418 aExp = extractFloat128Exp( a ); 6419 aSign = extractFloat128Sign( a ); 6420 bSig1 = extractFloat128Frac1( b ); 6421 bSig0 = extractFloat128Frac0( b ); 6422 bExp = extractFloat128Exp( b ); 6423 bSign = extractFloat128Sign( b ); 6424 zSign = aSign ^ bSign; 6425 if ( aExp == 0x7FFF ) { 6426 if ( ( aSig0 | aSig1 ) 6427 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6428 return propagateFloat128NaN(a, b, status); 6429 } 6430 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6431 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6432 } 6433 if ( bExp == 0x7FFF ) { 6434 if (bSig0 | bSig1) { 6435 return propagateFloat128NaN(a, b, status); 6436 } 6437 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6438 invalid: 6439 float_raise(float_flag_invalid, status); 6440 return float128_default_nan(status); 6441 } 6442 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6443 } 6444 if ( aExp == 0 ) { 6445 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6446 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6447 } 6448 if ( bExp == 0 ) { 6449 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6450 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6451 } 6452 zExp = aExp + bExp - 0x4000; 6453 aSig0 |= LIT64( 0x0001000000000000 ); 6454 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6455 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6456 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6457 zSig2 |= ( zSig3 != 0 ); 6458 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6459 shift128ExtraRightJamming( 6460 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6461 ++zExp; 6462 } 6463 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6464 6465 } 6466 6467 /*---------------------------------------------------------------------------- 6468 | Returns the result of dividing the quadruple-precision floating-point value 6469 | `a' by the corresponding value `b'. The operation is performed according to 6470 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6471 *----------------------------------------------------------------------------*/ 6472 6473 float128 float128_div(float128 a, float128 b, float_status *status) 6474 { 6475 flag aSign, bSign, zSign; 6476 int32_t aExp, bExp, zExp; 6477 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6478 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6479 6480 aSig1 = extractFloat128Frac1( a ); 6481 aSig0 = extractFloat128Frac0( a ); 6482 aExp = extractFloat128Exp( a ); 6483 aSign = extractFloat128Sign( a ); 6484 bSig1 = extractFloat128Frac1( b ); 6485 bSig0 = extractFloat128Frac0( b ); 6486 bExp = extractFloat128Exp( b ); 6487 bSign = extractFloat128Sign( b ); 6488 zSign = aSign ^ bSign; 6489 if ( aExp == 0x7FFF ) { 6490 if (aSig0 | aSig1) { 6491 return propagateFloat128NaN(a, b, status); 6492 } 6493 if ( bExp == 0x7FFF ) { 6494 if (bSig0 | bSig1) { 6495 return propagateFloat128NaN(a, b, status); 6496 } 6497 goto invalid; 6498 } 6499 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6500 } 6501 if ( bExp == 0x7FFF ) { 6502 if (bSig0 | bSig1) { 6503 return propagateFloat128NaN(a, b, status); 6504 } 6505 return packFloat128( zSign, 0, 0, 0 ); 6506 } 6507 if ( bExp == 0 ) { 6508 if ( ( bSig0 | bSig1 ) == 0 ) { 6509 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6510 invalid: 6511 float_raise(float_flag_invalid, status); 6512 return float128_default_nan(status); 6513 } 6514 float_raise(float_flag_divbyzero, status); 6515 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6516 } 6517 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6518 } 6519 if ( aExp == 0 ) { 6520 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6521 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6522 } 6523 zExp = aExp - bExp + 0x3FFD; 6524 shortShift128Left( 6525 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6526 shortShift128Left( 6527 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6528 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6529 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6530 ++zExp; 6531 } 6532 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6533 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6534 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6535 while ( (int64_t) rem0 < 0 ) { 6536 --zSig0; 6537 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6538 } 6539 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6540 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6541 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6542 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6543 while ( (int64_t) rem1 < 0 ) { 6544 --zSig1; 6545 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6546 } 6547 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6548 } 6549 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6550 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6551 6552 } 6553 6554 /*---------------------------------------------------------------------------- 6555 | Returns the remainder of the quadruple-precision floating-point value `a' 6556 | with respect to the corresponding value `b'. The operation is performed 6557 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6558 *----------------------------------------------------------------------------*/ 6559 6560 float128 float128_rem(float128 a, float128 b, float_status *status) 6561 { 6562 flag aSign, zSign; 6563 int32_t aExp, bExp, expDiff; 6564 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6565 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6566 int64_t sigMean0; 6567 6568 aSig1 = extractFloat128Frac1( a ); 6569 aSig0 = extractFloat128Frac0( a ); 6570 aExp = extractFloat128Exp( a ); 6571 aSign = extractFloat128Sign( a ); 6572 bSig1 = extractFloat128Frac1( b ); 6573 bSig0 = extractFloat128Frac0( b ); 6574 bExp = extractFloat128Exp( b ); 6575 if ( aExp == 0x7FFF ) { 6576 if ( ( aSig0 | aSig1 ) 6577 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6578 return propagateFloat128NaN(a, b, status); 6579 } 6580 goto invalid; 6581 } 6582 if ( bExp == 0x7FFF ) { 6583 if (bSig0 | bSig1) { 6584 return propagateFloat128NaN(a, b, status); 6585 } 6586 return a; 6587 } 6588 if ( bExp == 0 ) { 6589 if ( ( bSig0 | bSig1 ) == 0 ) { 6590 invalid: 6591 float_raise(float_flag_invalid, status); 6592 return float128_default_nan(status); 6593 } 6594 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6595 } 6596 if ( aExp == 0 ) { 6597 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6598 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6599 } 6600 expDiff = aExp - bExp; 6601 if ( expDiff < -1 ) return a; 6602 shortShift128Left( 6603 aSig0 | LIT64( 0x0001000000000000 ), 6604 aSig1, 6605 15 - ( expDiff < 0 ), 6606 &aSig0, 6607 &aSig1 6608 ); 6609 shortShift128Left( 6610 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6611 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6612 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6613 expDiff -= 64; 6614 while ( 0 < expDiff ) { 6615 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6616 q = ( 4 < q ) ? q - 4 : 0; 6617 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6618 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6619 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6620 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6621 expDiff -= 61; 6622 } 6623 if ( -64 < expDiff ) { 6624 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6625 q = ( 4 < q ) ? q - 4 : 0; 6626 q >>= - expDiff; 6627 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6628 expDiff += 52; 6629 if ( expDiff < 0 ) { 6630 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6631 } 6632 else { 6633 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6634 } 6635 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6636 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6637 } 6638 else { 6639 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6640 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6641 } 6642 do { 6643 alternateASig0 = aSig0; 6644 alternateASig1 = aSig1; 6645 ++q; 6646 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6647 } while ( 0 <= (int64_t) aSig0 ); 6648 add128( 6649 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6650 if ( ( sigMean0 < 0 ) 6651 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6652 aSig0 = alternateASig0; 6653 aSig1 = alternateASig1; 6654 } 6655 zSign = ( (int64_t) aSig0 < 0 ); 6656 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6657 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6658 status); 6659 } 6660 6661 /*---------------------------------------------------------------------------- 6662 | Returns the square root of the quadruple-precision floating-point value `a'. 6663 | The operation is performed according to the IEC/IEEE Standard for Binary 6664 | Floating-Point Arithmetic. 6665 *----------------------------------------------------------------------------*/ 6666 6667 float128 float128_sqrt(float128 a, float_status *status) 6668 { 6669 flag aSign; 6670 int32_t aExp, zExp; 6671 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6672 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6673 6674 aSig1 = extractFloat128Frac1( a ); 6675 aSig0 = extractFloat128Frac0( a ); 6676 aExp = extractFloat128Exp( a ); 6677 aSign = extractFloat128Sign( a ); 6678 if ( aExp == 0x7FFF ) { 6679 if (aSig0 | aSig1) { 6680 return propagateFloat128NaN(a, a, status); 6681 } 6682 if ( ! aSign ) return a; 6683 goto invalid; 6684 } 6685 if ( aSign ) { 6686 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6687 invalid: 6688 float_raise(float_flag_invalid, status); 6689 return float128_default_nan(status); 6690 } 6691 if ( aExp == 0 ) { 6692 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6693 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6694 } 6695 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6696 aSig0 |= LIT64( 0x0001000000000000 ); 6697 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6698 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6699 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6700 doubleZSig0 = zSig0<<1; 6701 mul64To128( zSig0, zSig0, &term0, &term1 ); 6702 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6703 while ( (int64_t) rem0 < 0 ) { 6704 --zSig0; 6705 doubleZSig0 -= 2; 6706 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6707 } 6708 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6709 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6710 if ( zSig1 == 0 ) zSig1 = 1; 6711 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6712 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6713 mul64To128( zSig1, zSig1, &term2, &term3 ); 6714 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6715 while ( (int64_t) rem1 < 0 ) { 6716 --zSig1; 6717 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6718 term3 |= 1; 6719 term2 |= doubleZSig0; 6720 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6721 } 6722 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6723 } 6724 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6725 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6726 6727 } 6728 6729 /*---------------------------------------------------------------------------- 6730 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6731 | the corresponding value `b', and 0 otherwise. The invalid exception is 6732 | raised if either operand is a NaN. Otherwise, the comparison is performed 6733 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6734 *----------------------------------------------------------------------------*/ 6735 6736 int float128_eq(float128 a, float128 b, float_status *status) 6737 { 6738 6739 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6740 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6741 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6742 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6743 ) { 6744 float_raise(float_flag_invalid, status); 6745 return 0; 6746 } 6747 return 6748 ( a.low == b.low ) 6749 && ( ( a.high == b.high ) 6750 || ( ( a.low == 0 ) 6751 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6752 ); 6753 6754 } 6755 6756 /*---------------------------------------------------------------------------- 6757 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6758 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6759 | exception is raised if either operand is a NaN. The comparison is performed 6760 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6761 *----------------------------------------------------------------------------*/ 6762 6763 int float128_le(float128 a, float128 b, float_status *status) 6764 { 6765 flag aSign, bSign; 6766 6767 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6768 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6769 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6770 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6771 ) { 6772 float_raise(float_flag_invalid, status); 6773 return 0; 6774 } 6775 aSign = extractFloat128Sign( a ); 6776 bSign = extractFloat128Sign( b ); 6777 if ( aSign != bSign ) { 6778 return 6779 aSign 6780 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6781 == 0 ); 6782 } 6783 return 6784 aSign ? le128( b.high, b.low, a.high, a.low ) 6785 : le128( a.high, a.low, b.high, b.low ); 6786 6787 } 6788 6789 /*---------------------------------------------------------------------------- 6790 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6791 | the corresponding value `b', and 0 otherwise. The invalid exception is 6792 | raised if either operand is a NaN. The comparison is performed according 6793 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6794 *----------------------------------------------------------------------------*/ 6795 6796 int float128_lt(float128 a, float128 b, float_status *status) 6797 { 6798 flag aSign, bSign; 6799 6800 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6801 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6802 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6803 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6804 ) { 6805 float_raise(float_flag_invalid, status); 6806 return 0; 6807 } 6808 aSign = extractFloat128Sign( a ); 6809 bSign = extractFloat128Sign( b ); 6810 if ( aSign != bSign ) { 6811 return 6812 aSign 6813 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6814 != 0 ); 6815 } 6816 return 6817 aSign ? lt128( b.high, b.low, a.high, a.low ) 6818 : lt128( a.high, a.low, b.high, b.low ); 6819 6820 } 6821 6822 /*---------------------------------------------------------------------------- 6823 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6824 | be compared, and 0 otherwise. The invalid exception is raised if either 6825 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6826 | Standard for Binary Floating-Point Arithmetic. 6827 *----------------------------------------------------------------------------*/ 6828 6829 int float128_unordered(float128 a, float128 b, float_status *status) 6830 { 6831 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6832 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6833 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6834 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6835 ) { 6836 float_raise(float_flag_invalid, status); 6837 return 1; 6838 } 6839 return 0; 6840 } 6841 6842 /*---------------------------------------------------------------------------- 6843 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6844 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6845 | exception. The comparison is performed according to the IEC/IEEE Standard 6846 | for Binary Floating-Point Arithmetic. 6847 *----------------------------------------------------------------------------*/ 6848 6849 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6850 { 6851 6852 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6853 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6854 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6855 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6856 ) { 6857 if (float128_is_signaling_nan(a, status) 6858 || float128_is_signaling_nan(b, status)) { 6859 float_raise(float_flag_invalid, status); 6860 } 6861 return 0; 6862 } 6863 return 6864 ( a.low == b.low ) 6865 && ( ( a.high == b.high ) 6866 || ( ( a.low == 0 ) 6867 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6868 ); 6869 6870 } 6871 6872 /*---------------------------------------------------------------------------- 6873 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6874 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6875 | cause an exception. Otherwise, the comparison is performed according to the 6876 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6877 *----------------------------------------------------------------------------*/ 6878 6879 int float128_le_quiet(float128 a, float128 b, float_status *status) 6880 { 6881 flag aSign, bSign; 6882 6883 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6884 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6885 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6886 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6887 ) { 6888 if (float128_is_signaling_nan(a, status) 6889 || float128_is_signaling_nan(b, status)) { 6890 float_raise(float_flag_invalid, status); 6891 } 6892 return 0; 6893 } 6894 aSign = extractFloat128Sign( a ); 6895 bSign = extractFloat128Sign( b ); 6896 if ( aSign != bSign ) { 6897 return 6898 aSign 6899 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6900 == 0 ); 6901 } 6902 return 6903 aSign ? le128( b.high, b.low, a.high, a.low ) 6904 : le128( a.high, a.low, b.high, b.low ); 6905 6906 } 6907 6908 /*---------------------------------------------------------------------------- 6909 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6910 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6911 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6912 | Standard for Binary Floating-Point Arithmetic. 6913 *----------------------------------------------------------------------------*/ 6914 6915 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6916 { 6917 flag aSign, bSign; 6918 6919 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6920 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6921 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6922 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6923 ) { 6924 if (float128_is_signaling_nan(a, status) 6925 || float128_is_signaling_nan(b, status)) { 6926 float_raise(float_flag_invalid, status); 6927 } 6928 return 0; 6929 } 6930 aSign = extractFloat128Sign( a ); 6931 bSign = extractFloat128Sign( b ); 6932 if ( aSign != bSign ) { 6933 return 6934 aSign 6935 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6936 != 0 ); 6937 } 6938 return 6939 aSign ? lt128( b.high, b.low, a.high, a.low ) 6940 : lt128( a.high, a.low, b.high, b.low ); 6941 6942 } 6943 6944 /*---------------------------------------------------------------------------- 6945 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6946 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6947 | comparison is performed according to the IEC/IEEE Standard for Binary 6948 | Floating-Point Arithmetic. 6949 *----------------------------------------------------------------------------*/ 6950 6951 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6952 { 6953 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6954 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6955 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6956 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6957 ) { 6958 if (float128_is_signaling_nan(a, status) 6959 || float128_is_signaling_nan(b, status)) { 6960 float_raise(float_flag_invalid, status); 6961 } 6962 return 1; 6963 } 6964 return 0; 6965 } 6966 6967 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6968 int is_quiet, float_status *status) 6969 { 6970 flag aSign, bSign; 6971 6972 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6973 float_raise(float_flag_invalid, status); 6974 return float_relation_unordered; 6975 } 6976 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6977 ( extractFloatx80Frac( a )<<1 ) ) || 6978 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6979 ( extractFloatx80Frac( b )<<1 ) )) { 6980 if (!is_quiet || 6981 floatx80_is_signaling_nan(a, status) || 6982 floatx80_is_signaling_nan(b, status)) { 6983 float_raise(float_flag_invalid, status); 6984 } 6985 return float_relation_unordered; 6986 } 6987 aSign = extractFloatx80Sign( a ); 6988 bSign = extractFloatx80Sign( b ); 6989 if ( aSign != bSign ) { 6990 6991 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6992 ( ( a.low | b.low ) == 0 ) ) { 6993 /* zero case */ 6994 return float_relation_equal; 6995 } else { 6996 return 1 - (2 * aSign); 6997 } 6998 } else { 6999 if (a.low == b.low && a.high == b.high) { 7000 return float_relation_equal; 7001 } else { 7002 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7003 } 7004 } 7005 } 7006 7007 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 7008 { 7009 return floatx80_compare_internal(a, b, 0, status); 7010 } 7011 7012 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 7013 { 7014 return floatx80_compare_internal(a, b, 1, status); 7015 } 7016 7017 static inline int float128_compare_internal(float128 a, float128 b, 7018 int is_quiet, float_status *status) 7019 { 7020 flag aSign, bSign; 7021 7022 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 7023 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 7024 ( ( extractFloat128Exp( b ) == 0x7fff ) && 7025 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 7026 if (!is_quiet || 7027 float128_is_signaling_nan(a, status) || 7028 float128_is_signaling_nan(b, status)) { 7029 float_raise(float_flag_invalid, status); 7030 } 7031 return float_relation_unordered; 7032 } 7033 aSign = extractFloat128Sign( a ); 7034 bSign = extractFloat128Sign( b ); 7035 if ( aSign != bSign ) { 7036 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 7037 /* zero case */ 7038 return float_relation_equal; 7039 } else { 7040 return 1 - (2 * aSign); 7041 } 7042 } else { 7043 if (a.low == b.low && a.high == b.high) { 7044 return float_relation_equal; 7045 } else { 7046 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 7047 } 7048 } 7049 } 7050 7051 int float128_compare(float128 a, float128 b, float_status *status) 7052 { 7053 return float128_compare_internal(a, b, 0, status); 7054 } 7055 7056 int float128_compare_quiet(float128 a, float128 b, float_status *status) 7057 { 7058 return float128_compare_internal(a, b, 1, status); 7059 } 7060 7061 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 7062 { 7063 flag aSign; 7064 int32_t aExp; 7065 uint64_t aSig; 7066 7067 if (floatx80_invalid_encoding(a)) { 7068 float_raise(float_flag_invalid, status); 7069 return floatx80_default_nan(status); 7070 } 7071 aSig = extractFloatx80Frac( a ); 7072 aExp = extractFloatx80Exp( a ); 7073 aSign = extractFloatx80Sign( a ); 7074 7075 if ( aExp == 0x7FFF ) { 7076 if ( aSig<<1 ) { 7077 return propagateFloatx80NaN(a, a, status); 7078 } 7079 return a; 7080 } 7081 7082 if (aExp == 0) { 7083 if (aSig == 0) { 7084 return a; 7085 } 7086 aExp++; 7087 } 7088 7089 if (n > 0x10000) { 7090 n = 0x10000; 7091 } else if (n < -0x10000) { 7092 n = -0x10000; 7093 } 7094 7095 aExp += n; 7096 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7097 aSign, aExp, aSig, 0, status); 7098 } 7099 7100 float128 float128_scalbn(float128 a, int n, float_status *status) 7101 { 7102 flag aSign; 7103 int32_t aExp; 7104 uint64_t aSig0, aSig1; 7105 7106 aSig1 = extractFloat128Frac1( a ); 7107 aSig0 = extractFloat128Frac0( a ); 7108 aExp = extractFloat128Exp( a ); 7109 aSign = extractFloat128Sign( a ); 7110 if ( aExp == 0x7FFF ) { 7111 if ( aSig0 | aSig1 ) { 7112 return propagateFloat128NaN(a, a, status); 7113 } 7114 return a; 7115 } 7116 if (aExp != 0) { 7117 aSig0 |= LIT64( 0x0001000000000000 ); 7118 } else if (aSig0 == 0 && aSig1 == 0) { 7119 return a; 7120 } else { 7121 aExp++; 7122 } 7123 7124 if (n > 0x10000) { 7125 n = 0x10000; 7126 } else if (n < -0x10000) { 7127 n = -0x10000; 7128 } 7129 7130 aExp += n - 1; 7131 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7132 , status); 7133 7134 } 7135