1 /* 2 * QEMU float support 3 * 4 * The code in this source file is derived from release 2a of the SoftFloat 5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and 6 * some later contributions) are provided under that license, as detailed below. 7 * It has subsequently been modified by contributors to the QEMU Project, 8 * so some portions are provided under: 9 * the SoftFloat-2a license 10 * the BSD license 11 * GPL-v2-or-later 12 * 13 * Any future contributions to this file after December 1st 2014 will be 14 * taken to be licensed under the Softfloat-2a license unless specifically 15 * indicated otherwise. 16 */ 17 18 /* 19 =============================================================================== 20 This C source file is part of the SoftFloat IEC/IEEE Floating-point 21 Arithmetic Package, Release 2a. 22 23 Written by John R. Hauser. This work was made possible in part by the 24 International Computer Science Institute, located at Suite 600, 1947 Center 25 Street, Berkeley, California 94704. Funding was partially provided by the 26 National Science Foundation under grant MIP-9311980. The original version 27 of this code was written as part of a project to build a fixed-point vector 28 processor in collaboration with the University of California at Berkeley, 29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information 30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ 31 arithmetic/SoftFloat.html'. 32 33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort 34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT 35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO 36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY 37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. 38 39 Derivative works are acceptable, even for commercial purposes, so long as 40 (1) they include prominent notice that the work is derivative, and (2) they 41 include prominent notice akin to these four paragraphs for those parts of 42 this code that are retained. 43 44 =============================================================================== 45 */ 46 47 /* BSD licensing: 48 * Copyright (c) 2006, Fabrice Bellard 49 * All rights reserved. 50 * 51 * Redistribution and use in source and binary forms, with or without 52 * modification, are permitted provided that the following conditions are met: 53 * 54 * 1. Redistributions of source code must retain the above copyright notice, 55 * this list of conditions and the following disclaimer. 56 * 57 * 2. Redistributions in binary form must reproduce the above copyright notice, 58 * this list of conditions and the following disclaimer in the documentation 59 * and/or other materials provided with the distribution. 60 * 61 * 3. Neither the name of the copyright holder nor the names of its contributors 62 * may be used to endorse or promote products derived from this software without 63 * specific prior written permission. 64 * 65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 75 * THE POSSIBILITY OF SUCH DAMAGE. 76 */ 77 78 /* Portions of this work are licensed under the terms of the GNU GPL, 79 * version 2 or later. See the COPYING file in the top-level directory. 80 */ 81 82 /* softfloat (and in particular the code in softfloat-specialize.h) is 83 * target-dependent and needs the TARGET_* macros. 84 */ 85 #include "qemu/osdep.h" 86 #include "qemu/bitops.h" 87 #include "fpu/softfloat.h" 88 89 /* We only need stdlib for abort() */ 90 91 /*---------------------------------------------------------------------------- 92 | Primitive arithmetic functions, including multi-word arithmetic, and 93 | division and square root approximations. (Can be specialized to target if 94 | desired.) 95 *----------------------------------------------------------------------------*/ 96 #include "fpu/softfloat-macros.h" 97 98 /*---------------------------------------------------------------------------- 99 | Functions and definitions to determine: (1) whether tininess for underflow 100 | is detected before or after rounding by default, (2) what (if anything) 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs 103 | are propagated from function inputs to output. These details are target- 104 | specific. 105 *----------------------------------------------------------------------------*/ 106 #include "softfloat-specialize.h" 107 108 /*---------------------------------------------------------------------------- 109 | Returns the fraction bits of the half-precision floating-point value `a'. 110 *----------------------------------------------------------------------------*/ 111 112 static inline uint32_t extractFloat16Frac(float16 a) 113 { 114 return float16_val(a) & 0x3ff; 115 } 116 117 /*---------------------------------------------------------------------------- 118 | Returns the exponent bits of the half-precision floating-point value `a'. 119 *----------------------------------------------------------------------------*/ 120 121 static inline int extractFloat16Exp(float16 a) 122 { 123 return (float16_val(a) >> 10) & 0x1f; 124 } 125 126 /*---------------------------------------------------------------------------- 127 | Returns the sign bit of the single-precision floating-point value `a'. 128 *----------------------------------------------------------------------------*/ 129 130 static inline flag extractFloat16Sign(float16 a) 131 { 132 return float16_val(a)>>15; 133 } 134 135 /*---------------------------------------------------------------------------- 136 | Returns the fraction bits of the single-precision floating-point value `a'. 137 *----------------------------------------------------------------------------*/ 138 139 static inline uint32_t extractFloat32Frac(float32 a) 140 { 141 return float32_val(a) & 0x007FFFFF; 142 } 143 144 /*---------------------------------------------------------------------------- 145 | Returns the exponent bits of the single-precision floating-point value `a'. 146 *----------------------------------------------------------------------------*/ 147 148 static inline int extractFloat32Exp(float32 a) 149 { 150 return (float32_val(a) >> 23) & 0xFF; 151 } 152 153 /*---------------------------------------------------------------------------- 154 | Returns the sign bit of the single-precision floating-point value `a'. 155 *----------------------------------------------------------------------------*/ 156 157 static inline flag extractFloat32Sign(float32 a) 158 { 159 return float32_val(a) >> 31; 160 } 161 162 /*---------------------------------------------------------------------------- 163 | Returns the fraction bits of the double-precision floating-point value `a'. 164 *----------------------------------------------------------------------------*/ 165 166 static inline uint64_t extractFloat64Frac(float64 a) 167 { 168 return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF); 169 } 170 171 /*---------------------------------------------------------------------------- 172 | Returns the exponent bits of the double-precision floating-point value `a'. 173 *----------------------------------------------------------------------------*/ 174 175 static inline int extractFloat64Exp(float64 a) 176 { 177 return (float64_val(a) >> 52) & 0x7FF; 178 } 179 180 /*---------------------------------------------------------------------------- 181 | Returns the sign bit of the double-precision floating-point value `a'. 182 *----------------------------------------------------------------------------*/ 183 184 static inline flag extractFloat64Sign(float64 a) 185 { 186 return float64_val(a) >> 63; 187 } 188 189 /* 190 * Classify a floating point number. Everything above float_class_qnan 191 * is a NaN so cls >= float_class_qnan is any NaN. 192 */ 193 194 typedef enum __attribute__ ((__packed__)) { 195 float_class_unclassified, 196 float_class_zero, 197 float_class_normal, 198 float_class_inf, 199 float_class_qnan, /* all NaNs from here */ 200 float_class_snan, 201 float_class_dnan, 202 float_class_msnan, /* maybe silenced */ 203 } FloatClass; 204 205 /* 206 * Structure holding all of the decomposed parts of a float. The 207 * exponent is unbiased and the fraction is normalized. All 208 * calculations are done with a 64 bit fraction and then rounded as 209 * appropriate for the final format. 210 * 211 * Thanks to the packed FloatClass a decent compiler should be able to 212 * fit the whole structure into registers and avoid using the stack 213 * for parameter passing. 214 */ 215 216 typedef struct { 217 uint64_t frac; 218 int32_t exp; 219 FloatClass cls; 220 bool sign; 221 } FloatParts; 222 223 #define DECOMPOSED_BINARY_POINT (64 - 2) 224 #define DECOMPOSED_IMPLICIT_BIT (1ull << DECOMPOSED_BINARY_POINT) 225 #define DECOMPOSED_OVERFLOW_BIT (DECOMPOSED_IMPLICIT_BIT << 1) 226 227 /* Structure holding all of the relevant parameters for a format. 228 * exp_size: the size of the exponent field 229 * exp_bias: the offset applied to the exponent field 230 * exp_max: the maximum normalised exponent 231 * frac_size: the size of the fraction field 232 * frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT 233 * The following are computed based the size of fraction 234 * frac_lsb: least significant bit of fraction 235 * fram_lsbm1: the bit bellow the least significant bit (for rounding) 236 * round_mask/roundeven_mask: masks used for rounding 237 */ 238 typedef struct { 239 int exp_size; 240 int exp_bias; 241 int exp_max; 242 int frac_size; 243 int frac_shift; 244 uint64_t frac_lsb; 245 uint64_t frac_lsbm1; 246 uint64_t round_mask; 247 uint64_t roundeven_mask; 248 } FloatFmt; 249 250 /* Expand fields based on the size of exponent and fraction */ 251 #define FLOAT_PARAMS(E, F) \ 252 .exp_size = E, \ 253 .exp_bias = ((1 << E) - 1) >> 1, \ 254 .exp_max = (1 << E) - 1, \ 255 .frac_size = F, \ 256 .frac_shift = DECOMPOSED_BINARY_POINT - F, \ 257 .frac_lsb = 1ull << (DECOMPOSED_BINARY_POINT - F), \ 258 .frac_lsbm1 = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1), \ 259 .round_mask = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1, \ 260 .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1 261 262 static const FloatFmt float16_params = { 263 FLOAT_PARAMS(5, 10) 264 }; 265 266 static const FloatFmt float32_params = { 267 FLOAT_PARAMS(8, 23) 268 }; 269 270 static const FloatFmt float64_params = { 271 FLOAT_PARAMS(11, 52) 272 }; 273 274 /* Unpack a float to parts, but do not canonicalize. */ 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw) 276 { 277 const int sign_pos = fmt.frac_size + fmt.exp_size; 278 279 return (FloatParts) { 280 .cls = float_class_unclassified, 281 .sign = extract64(raw, sign_pos, 1), 282 .exp = extract64(raw, fmt.frac_size, fmt.exp_size), 283 .frac = extract64(raw, 0, fmt.frac_size), 284 }; 285 } 286 287 static inline FloatParts float16_unpack_raw(float16 f) 288 { 289 return unpack_raw(float16_params, f); 290 } 291 292 static inline FloatParts float32_unpack_raw(float32 f) 293 { 294 return unpack_raw(float32_params, f); 295 } 296 297 static inline FloatParts float64_unpack_raw(float64 f) 298 { 299 return unpack_raw(float64_params, f); 300 } 301 302 /* Pack a float from parts, but do not canonicalize. */ 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p) 304 { 305 const int sign_pos = fmt.frac_size + fmt.exp_size; 306 uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp); 307 return deposit64(ret, sign_pos, 1, p.sign); 308 } 309 310 static inline float16 float16_pack_raw(FloatParts p) 311 { 312 return make_float16(pack_raw(float16_params, p)); 313 } 314 315 static inline float32 float32_pack_raw(FloatParts p) 316 { 317 return make_float32(pack_raw(float32_params, p)); 318 } 319 320 static inline float64 float64_pack_raw(FloatParts p) 321 { 322 return make_float64(pack_raw(float64_params, p)); 323 } 324 325 /* Canonicalize EXP and FRAC, setting CLS. */ 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, 327 float_status *status) 328 { 329 if (part.exp == parm->exp_max) { 330 if (part.frac == 0) { 331 part.cls = float_class_inf; 332 } else { 333 #ifdef NO_SIGNALING_NANS 334 part.cls = float_class_qnan; 335 #else 336 int64_t msb = part.frac << (parm->frac_shift + 2); 337 if ((msb < 0) == status->snan_bit_is_one) { 338 part.cls = float_class_snan; 339 } else { 340 part.cls = float_class_qnan; 341 } 342 #endif 343 } 344 } else if (part.exp == 0) { 345 if (likely(part.frac == 0)) { 346 part.cls = float_class_zero; 347 } else if (status->flush_inputs_to_zero) { 348 float_raise(float_flag_input_denormal, status); 349 part.cls = float_class_zero; 350 part.frac = 0; 351 } else { 352 int shift = clz64(part.frac) - 1; 353 part.cls = float_class_normal; 354 part.exp = parm->frac_shift - parm->exp_bias - shift + 1; 355 part.frac <<= shift; 356 } 357 } else { 358 part.cls = float_class_normal; 359 part.exp -= parm->exp_bias; 360 part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift); 361 } 362 return part; 363 } 364 365 /* Round and uncanonicalize a floating-point number by parts. There 366 * are FRAC_SHIFT bits that may require rounding at the bottom of the 367 * fraction; these bits will be removed. The exponent will be biased 368 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0]. 369 */ 370 371 static FloatParts round_canonical(FloatParts p, float_status *s, 372 const FloatFmt *parm) 373 { 374 const uint64_t frac_lsbm1 = parm->frac_lsbm1; 375 const uint64_t round_mask = parm->round_mask; 376 const uint64_t roundeven_mask = parm->roundeven_mask; 377 const int exp_max = parm->exp_max; 378 const int frac_shift = parm->frac_shift; 379 uint64_t frac, inc; 380 int exp, flags = 0; 381 bool overflow_norm; 382 383 frac = p.frac; 384 exp = p.exp; 385 386 switch (p.cls) { 387 case float_class_normal: 388 switch (s->float_rounding_mode) { 389 case float_round_nearest_even: 390 overflow_norm = false; 391 inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 392 break; 393 case float_round_ties_away: 394 overflow_norm = false; 395 inc = frac_lsbm1; 396 break; 397 case float_round_to_zero: 398 overflow_norm = true; 399 inc = 0; 400 break; 401 case float_round_up: 402 inc = p.sign ? 0 : round_mask; 403 overflow_norm = p.sign; 404 break; 405 case float_round_down: 406 inc = p.sign ? round_mask : 0; 407 overflow_norm = !p.sign; 408 break; 409 default: 410 g_assert_not_reached(); 411 } 412 413 exp += parm->exp_bias; 414 if (likely(exp > 0)) { 415 if (frac & round_mask) { 416 flags |= float_flag_inexact; 417 frac += inc; 418 if (frac & DECOMPOSED_OVERFLOW_BIT) { 419 frac >>= 1; 420 exp++; 421 } 422 } 423 frac >>= frac_shift; 424 425 if (unlikely(exp >= exp_max)) { 426 flags |= float_flag_overflow | float_flag_inexact; 427 if (overflow_norm) { 428 exp = exp_max - 1; 429 frac = -1; 430 } else { 431 p.cls = float_class_inf; 432 goto do_inf; 433 } 434 } 435 } else if (s->flush_to_zero) { 436 flags |= float_flag_output_denormal; 437 p.cls = float_class_zero; 438 goto do_zero; 439 } else { 440 bool is_tiny = (s->float_detect_tininess 441 == float_tininess_before_rounding) 442 || (exp < 0) 443 || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT); 444 445 shift64RightJamming(frac, 1 - exp, &frac); 446 if (frac & round_mask) { 447 /* Need to recompute round-to-even. */ 448 if (s->float_rounding_mode == float_round_nearest_even) { 449 inc = ((frac & roundeven_mask) != frac_lsbm1 450 ? frac_lsbm1 : 0); 451 } 452 flags |= float_flag_inexact; 453 frac += inc; 454 } 455 456 exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0); 457 frac >>= frac_shift; 458 459 if (is_tiny && (flags & float_flag_inexact)) { 460 flags |= float_flag_underflow; 461 } 462 if (exp == 0 && frac == 0) { 463 p.cls = float_class_zero; 464 } 465 } 466 break; 467 468 case float_class_zero: 469 do_zero: 470 exp = 0; 471 frac = 0; 472 break; 473 474 case float_class_inf: 475 do_inf: 476 exp = exp_max; 477 frac = 0; 478 break; 479 480 case float_class_qnan: 481 case float_class_snan: 482 exp = exp_max; 483 break; 484 485 default: 486 g_assert_not_reached(); 487 } 488 489 float_raise(flags, s); 490 p.exp = exp; 491 p.frac = frac; 492 return p; 493 } 494 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s) 496 { 497 return canonicalize(float16_unpack_raw(f), &float16_params, s); 498 } 499 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s) 501 { 502 switch (p.cls) { 503 case float_class_dnan: 504 return float16_default_nan(s); 505 case float_class_msnan: 506 return float16_maybe_silence_nan(float16_pack_raw(p), s); 507 default: 508 p = round_canonical(p, s, &float16_params); 509 return float16_pack_raw(p); 510 } 511 } 512 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s) 514 { 515 return canonicalize(float32_unpack_raw(f), &float32_params, s); 516 } 517 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s) 519 { 520 switch (p.cls) { 521 case float_class_dnan: 522 return float32_default_nan(s); 523 case float_class_msnan: 524 return float32_maybe_silence_nan(float32_pack_raw(p), s); 525 default: 526 p = round_canonical(p, s, &float32_params); 527 return float32_pack_raw(p); 528 } 529 } 530 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s) 532 { 533 return canonicalize(float64_unpack_raw(f), &float64_params, s); 534 } 535 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s) 537 { 538 switch (p.cls) { 539 case float_class_dnan: 540 return float64_default_nan(s); 541 case float_class_msnan: 542 return float64_maybe_silence_nan(float64_pack_raw(p), s); 543 default: 544 p = round_canonical(p, s, &float64_params); 545 return float64_pack_raw(p); 546 } 547 } 548 549 /* Simple helpers for checking if what NaN we have */ 550 static bool is_nan(FloatClass c) 551 { 552 return unlikely(c >= float_class_qnan); 553 } 554 static bool is_snan(FloatClass c) 555 { 556 return c == float_class_snan; 557 } 558 static bool is_qnan(FloatClass c) 559 { 560 return c == float_class_qnan; 561 } 562 563 static FloatParts return_nan(FloatParts a, float_status *s) 564 { 565 switch (a.cls) { 566 case float_class_snan: 567 s->float_exception_flags |= float_flag_invalid; 568 a.cls = float_class_msnan; 569 /* fall through */ 570 case float_class_qnan: 571 if (s->default_nan_mode) { 572 a.cls = float_class_dnan; 573 } 574 break; 575 576 default: 577 g_assert_not_reached(); 578 } 579 return a; 580 } 581 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s) 583 { 584 if (is_snan(a.cls) || is_snan(b.cls)) { 585 s->float_exception_flags |= float_flag_invalid; 586 } 587 588 if (s->default_nan_mode) { 589 a.cls = float_class_dnan; 590 } else { 591 if (pickNaN(is_qnan(a.cls), is_snan(a.cls), 592 is_qnan(b.cls), is_snan(b.cls), 593 a.frac > b.frac || 594 (a.frac == b.frac && a.sign < b.sign))) { 595 a = b; 596 } 597 a.cls = float_class_msnan; 598 } 599 return a; 600 } 601 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c, 603 bool inf_zero, float_status *s) 604 { 605 if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) { 606 s->float_exception_flags |= float_flag_invalid; 607 } 608 609 if (s->default_nan_mode) { 610 a.cls = float_class_dnan; 611 } else { 612 switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls), 613 is_qnan(b.cls), is_snan(b.cls), 614 is_qnan(c.cls), is_snan(c.cls), 615 inf_zero, s)) { 616 case 0: 617 break; 618 case 1: 619 a = b; 620 break; 621 case 2: 622 a = c; 623 break; 624 case 3: 625 a.cls = float_class_dnan; 626 return a; 627 default: 628 g_assert_not_reached(); 629 } 630 631 a.cls = float_class_msnan; 632 } 633 return a; 634 } 635 636 /* 637 * Returns the result of adding or subtracting the values of the 638 * floating-point values `a' and `b'. The operation is performed 639 * according to the IEC/IEEE Standard for Binary Floating-Point 640 * Arithmetic. 641 */ 642 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract, 644 float_status *s) 645 { 646 bool a_sign = a.sign; 647 bool b_sign = b.sign ^ subtract; 648 649 if (a_sign != b_sign) { 650 /* Subtraction */ 651 652 if (a.cls == float_class_normal && b.cls == float_class_normal) { 653 if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) { 654 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 655 a.frac = a.frac - b.frac; 656 } else { 657 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 658 a.frac = b.frac - a.frac; 659 a.exp = b.exp; 660 a_sign ^= 1; 661 } 662 663 if (a.frac == 0) { 664 a.cls = float_class_zero; 665 a.sign = s->float_rounding_mode == float_round_down; 666 } else { 667 int shift = clz64(a.frac) - 1; 668 a.frac = a.frac << shift; 669 a.exp = a.exp - shift; 670 a.sign = a_sign; 671 } 672 return a; 673 } 674 if (is_nan(a.cls) || is_nan(b.cls)) { 675 return pick_nan(a, b, s); 676 } 677 if (a.cls == float_class_inf) { 678 if (b.cls == float_class_inf) { 679 float_raise(float_flag_invalid, s); 680 a.cls = float_class_dnan; 681 } 682 return a; 683 } 684 if (a.cls == float_class_zero && b.cls == float_class_zero) { 685 a.sign = s->float_rounding_mode == float_round_down; 686 return a; 687 } 688 if (a.cls == float_class_zero || b.cls == float_class_inf) { 689 b.sign = a_sign ^ 1; 690 return b; 691 } 692 if (b.cls == float_class_zero) { 693 return a; 694 } 695 } else { 696 /* Addition */ 697 if (a.cls == float_class_normal && b.cls == float_class_normal) { 698 if (a.exp > b.exp) { 699 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac); 700 } else if (a.exp < b.exp) { 701 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac); 702 a.exp = b.exp; 703 } 704 a.frac += b.frac; 705 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 706 a.frac >>= 1; 707 a.exp += 1; 708 } 709 return a; 710 } 711 if (is_nan(a.cls) || is_nan(b.cls)) { 712 return pick_nan(a, b, s); 713 } 714 if (a.cls == float_class_inf || b.cls == float_class_zero) { 715 return a; 716 } 717 if (b.cls == float_class_inf || a.cls == float_class_zero) { 718 b.sign = b_sign; 719 return b; 720 } 721 } 722 g_assert_not_reached(); 723 } 724 725 /* 726 * Returns the result of adding or subtracting the floating-point 727 * values `a' and `b'. The operation is performed according to the 728 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 729 */ 730 731 float16 __attribute__((flatten)) float16_add(float16 a, float16 b, 732 float_status *status) 733 { 734 FloatParts pa = float16_unpack_canonical(a, status); 735 FloatParts pb = float16_unpack_canonical(b, status); 736 FloatParts pr = addsub_floats(pa, pb, false, status); 737 738 return float16_round_pack_canonical(pr, status); 739 } 740 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b, 742 float_status *status) 743 { 744 FloatParts pa = float32_unpack_canonical(a, status); 745 FloatParts pb = float32_unpack_canonical(b, status); 746 FloatParts pr = addsub_floats(pa, pb, false, status); 747 748 return float32_round_pack_canonical(pr, status); 749 } 750 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b, 752 float_status *status) 753 { 754 FloatParts pa = float64_unpack_canonical(a, status); 755 FloatParts pb = float64_unpack_canonical(b, status); 756 FloatParts pr = addsub_floats(pa, pb, false, status); 757 758 return float64_round_pack_canonical(pr, status); 759 } 760 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b, 762 float_status *status) 763 { 764 FloatParts pa = float16_unpack_canonical(a, status); 765 FloatParts pb = float16_unpack_canonical(b, status); 766 FloatParts pr = addsub_floats(pa, pb, true, status); 767 768 return float16_round_pack_canonical(pr, status); 769 } 770 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b, 772 float_status *status) 773 { 774 FloatParts pa = float32_unpack_canonical(a, status); 775 FloatParts pb = float32_unpack_canonical(b, status); 776 FloatParts pr = addsub_floats(pa, pb, true, status); 777 778 return float32_round_pack_canonical(pr, status); 779 } 780 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b, 782 float_status *status) 783 { 784 FloatParts pa = float64_unpack_canonical(a, status); 785 FloatParts pb = float64_unpack_canonical(b, status); 786 FloatParts pr = addsub_floats(pa, pb, true, status); 787 788 return float64_round_pack_canonical(pr, status); 789 } 790 791 /* 792 * Returns the result of multiplying the floating-point values `a' and 793 * `b'. The operation is performed according to the IEC/IEEE Standard 794 * for Binary Floating-Point Arithmetic. 795 */ 796 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s) 798 { 799 bool sign = a.sign ^ b.sign; 800 801 if (a.cls == float_class_normal && b.cls == float_class_normal) { 802 uint64_t hi, lo; 803 int exp = a.exp + b.exp; 804 805 mul64To128(a.frac, b.frac, &hi, &lo); 806 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 807 if (lo & DECOMPOSED_OVERFLOW_BIT) { 808 shift64RightJamming(lo, 1, &lo); 809 exp += 1; 810 } 811 812 /* Re-use a */ 813 a.exp = exp; 814 a.sign = sign; 815 a.frac = lo; 816 return a; 817 } 818 /* handle all the NaN cases */ 819 if (is_nan(a.cls) || is_nan(b.cls)) { 820 return pick_nan(a, b, s); 821 } 822 /* Inf * Zero == NaN */ 823 if ((a.cls == float_class_inf && b.cls == float_class_zero) || 824 (a.cls == float_class_zero && b.cls == float_class_inf)) { 825 s->float_exception_flags |= float_flag_invalid; 826 a.cls = float_class_dnan; 827 a.sign = sign; 828 return a; 829 } 830 /* Multiply by 0 or Inf */ 831 if (a.cls == float_class_inf || a.cls == float_class_zero) { 832 a.sign = sign; 833 return a; 834 } 835 if (b.cls == float_class_inf || b.cls == float_class_zero) { 836 b.sign = sign; 837 return b; 838 } 839 g_assert_not_reached(); 840 } 841 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b, 843 float_status *status) 844 { 845 FloatParts pa = float16_unpack_canonical(a, status); 846 FloatParts pb = float16_unpack_canonical(b, status); 847 FloatParts pr = mul_floats(pa, pb, status); 848 849 return float16_round_pack_canonical(pr, status); 850 } 851 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b, 853 float_status *status) 854 { 855 FloatParts pa = float32_unpack_canonical(a, status); 856 FloatParts pb = float32_unpack_canonical(b, status); 857 FloatParts pr = mul_floats(pa, pb, status); 858 859 return float32_round_pack_canonical(pr, status); 860 } 861 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b, 863 float_status *status) 864 { 865 FloatParts pa = float64_unpack_canonical(a, status); 866 FloatParts pb = float64_unpack_canonical(b, status); 867 FloatParts pr = mul_floats(pa, pb, status); 868 869 return float64_round_pack_canonical(pr, status); 870 } 871 872 /* 873 * Returns the result of multiplying the floating-point values `a' and 874 * `b' then adding 'c', with no intermediate rounding step after the 875 * multiplication. The operation is performed according to the 876 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008. 877 * The flags argument allows the caller to select negation of the 878 * addend, the intermediate product, or the final result. (The 879 * difference between this and having the caller do a separate 880 * negation is that negating externally will flip the sign bit on 881 * NaNs.) 882 */ 883 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c, 885 int flags, float_status *s) 886 { 887 bool inf_zero = ((1 << a.cls) | (1 << b.cls)) == 888 ((1 << float_class_inf) | (1 << float_class_zero)); 889 bool p_sign; 890 bool sign_flip = flags & float_muladd_negate_result; 891 FloatClass p_class; 892 uint64_t hi, lo; 893 int p_exp; 894 895 /* It is implementation-defined whether the cases of (0,inf,qnan) 896 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN 897 * they return if they do), so we have to hand this information 898 * off to the target-specific pick-a-NaN routine. 899 */ 900 if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) { 901 return pick_nan_muladd(a, b, c, inf_zero, s); 902 } 903 904 if (inf_zero) { 905 s->float_exception_flags |= float_flag_invalid; 906 a.cls = float_class_dnan; 907 return a; 908 } 909 910 if (flags & float_muladd_negate_c) { 911 c.sign ^= 1; 912 } 913 914 p_sign = a.sign ^ b.sign; 915 916 if (flags & float_muladd_negate_product) { 917 p_sign ^= 1; 918 } 919 920 if (a.cls == float_class_inf || b.cls == float_class_inf) { 921 p_class = float_class_inf; 922 } else if (a.cls == float_class_zero || b.cls == float_class_zero) { 923 p_class = float_class_zero; 924 } else { 925 p_class = float_class_normal; 926 } 927 928 if (c.cls == float_class_inf) { 929 if (p_class == float_class_inf && p_sign != c.sign) { 930 s->float_exception_flags |= float_flag_invalid; 931 a.cls = float_class_dnan; 932 } else { 933 a.cls = float_class_inf; 934 a.sign = c.sign ^ sign_flip; 935 } 936 return a; 937 } 938 939 if (p_class == float_class_inf) { 940 a.cls = float_class_inf; 941 a.sign = p_sign ^ sign_flip; 942 return a; 943 } 944 945 if (p_class == float_class_zero) { 946 if (c.cls == float_class_zero) { 947 if (p_sign != c.sign) { 948 p_sign = s->float_rounding_mode == float_round_down; 949 } 950 c.sign = p_sign; 951 } else if (flags & float_muladd_halve_result) { 952 c.exp -= 1; 953 } 954 c.sign ^= sign_flip; 955 return c; 956 } 957 958 /* a & b should be normals now... */ 959 assert(a.cls == float_class_normal && 960 b.cls == float_class_normal); 961 962 p_exp = a.exp + b.exp; 963 964 /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit 965 * result. 966 */ 967 mul64To128(a.frac, b.frac, &hi, &lo); 968 /* binary point now at bit 124 */ 969 970 /* check for overflow */ 971 if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) { 972 shift128RightJamming(hi, lo, 1, &hi, &lo); 973 p_exp += 1; 974 } 975 976 /* + add/sub */ 977 if (c.cls == float_class_zero) { 978 /* move binary point back to 62 */ 979 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 980 } else { 981 int exp_diff = p_exp - c.exp; 982 if (p_sign == c.sign) { 983 /* Addition */ 984 if (exp_diff <= 0) { 985 shift128RightJamming(hi, lo, 986 DECOMPOSED_BINARY_POINT - exp_diff, 987 &hi, &lo); 988 lo += c.frac; 989 p_exp = c.exp; 990 } else { 991 uint64_t c_hi, c_lo; 992 /* shift c to the same binary point as the product (124) */ 993 c_hi = c.frac >> 2; 994 c_lo = 0; 995 shift128RightJamming(c_hi, c_lo, 996 exp_diff, 997 &c_hi, &c_lo); 998 add128(hi, lo, c_hi, c_lo, &hi, &lo); 999 /* move binary point back to 62 */ 1000 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo); 1001 } 1002 1003 if (lo & DECOMPOSED_OVERFLOW_BIT) { 1004 shift64RightJamming(lo, 1, &lo); 1005 p_exp += 1; 1006 } 1007 1008 } else { 1009 /* Subtraction */ 1010 uint64_t c_hi, c_lo; 1011 /* make C binary point match product at bit 124 */ 1012 c_hi = c.frac >> 2; 1013 c_lo = 0; 1014 1015 if (exp_diff <= 0) { 1016 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo); 1017 if (exp_diff == 0 1018 && 1019 (hi > c_hi || (hi == c_hi && lo >= c_lo))) { 1020 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1021 } else { 1022 sub128(c_hi, c_lo, hi, lo, &hi, &lo); 1023 p_sign ^= 1; 1024 p_exp = c.exp; 1025 } 1026 } else { 1027 shift128RightJamming(c_hi, c_lo, 1028 exp_diff, 1029 &c_hi, &c_lo); 1030 sub128(hi, lo, c_hi, c_lo, &hi, &lo); 1031 } 1032 1033 if (hi == 0 && lo == 0) { 1034 a.cls = float_class_zero; 1035 a.sign = s->float_rounding_mode == float_round_down; 1036 a.sign ^= sign_flip; 1037 return a; 1038 } else { 1039 int shift; 1040 if (hi != 0) { 1041 shift = clz64(hi); 1042 } else { 1043 shift = clz64(lo) + 64; 1044 } 1045 /* Normalizing to a binary point of 124 is the 1046 correct adjust for the exponent. However since we're 1047 shifting, we might as well put the binary point back 1048 at 62 where we really want it. Therefore shift as 1049 if we're leaving 1 bit at the top of the word, but 1050 adjust the exponent as if we're leaving 3 bits. */ 1051 shift -= 1; 1052 if (shift >= 64) { 1053 lo = lo << (shift - 64); 1054 } else { 1055 hi = (hi << shift) | (lo >> (64 - shift)); 1056 lo = hi | ((lo << shift) != 0); 1057 } 1058 p_exp -= shift - 2; 1059 } 1060 } 1061 } 1062 1063 if (flags & float_muladd_halve_result) { 1064 p_exp -= 1; 1065 } 1066 1067 /* finally prepare our result */ 1068 a.cls = float_class_normal; 1069 a.sign = p_sign ^ sign_flip; 1070 a.exp = p_exp; 1071 a.frac = lo; 1072 1073 return a; 1074 } 1075 1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c, 1077 int flags, float_status *status) 1078 { 1079 FloatParts pa = float16_unpack_canonical(a, status); 1080 FloatParts pb = float16_unpack_canonical(b, status); 1081 FloatParts pc = float16_unpack_canonical(c, status); 1082 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1083 1084 return float16_round_pack_canonical(pr, status); 1085 } 1086 1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c, 1088 int flags, float_status *status) 1089 { 1090 FloatParts pa = float32_unpack_canonical(a, status); 1091 FloatParts pb = float32_unpack_canonical(b, status); 1092 FloatParts pc = float32_unpack_canonical(c, status); 1093 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1094 1095 return float32_round_pack_canonical(pr, status); 1096 } 1097 1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c, 1099 int flags, float_status *status) 1100 { 1101 FloatParts pa = float64_unpack_canonical(a, status); 1102 FloatParts pb = float64_unpack_canonical(b, status); 1103 FloatParts pc = float64_unpack_canonical(c, status); 1104 FloatParts pr = muladd_floats(pa, pb, pc, flags, status); 1105 1106 return float64_round_pack_canonical(pr, status); 1107 } 1108 1109 /* 1110 * Returns the result of dividing the floating-point value `a' by the 1111 * corresponding value `b'. The operation is performed according to 1112 * the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1113 */ 1114 1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s) 1116 { 1117 bool sign = a.sign ^ b.sign; 1118 1119 if (a.cls == float_class_normal && b.cls == float_class_normal) { 1120 uint64_t temp_lo, temp_hi; 1121 int exp = a.exp - b.exp; 1122 if (a.frac < b.frac) { 1123 exp -= 1; 1124 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, 1125 &temp_hi, &temp_lo); 1126 } else { 1127 shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, 1128 &temp_hi, &temp_lo); 1129 } 1130 /* LSB of quot is set if inexact which roundandpack will use 1131 * to set flags. Yet again we re-use a for the result */ 1132 a.frac = div128To64(temp_lo, temp_hi, b.frac); 1133 a.sign = sign; 1134 a.exp = exp; 1135 return a; 1136 } 1137 /* handle all the NaN cases */ 1138 if (is_nan(a.cls) || is_nan(b.cls)) { 1139 return pick_nan(a, b, s); 1140 } 1141 /* 0/0 or Inf/Inf */ 1142 if (a.cls == b.cls 1143 && 1144 (a.cls == float_class_inf || a.cls == float_class_zero)) { 1145 s->float_exception_flags |= float_flag_invalid; 1146 a.cls = float_class_dnan; 1147 return a; 1148 } 1149 /* Div 0 => Inf */ 1150 if (b.cls == float_class_zero) { 1151 s->float_exception_flags |= float_flag_divbyzero; 1152 a.cls = float_class_inf; 1153 a.sign = sign; 1154 return a; 1155 } 1156 /* Inf / x or 0 / x */ 1157 if (a.cls == float_class_inf || a.cls == float_class_zero) { 1158 a.sign = sign; 1159 return a; 1160 } 1161 /* Div by Inf */ 1162 if (b.cls == float_class_inf) { 1163 a.cls = float_class_zero; 1164 a.sign = sign; 1165 return a; 1166 } 1167 g_assert_not_reached(); 1168 } 1169 1170 float16 float16_div(float16 a, float16 b, float_status *status) 1171 { 1172 FloatParts pa = float16_unpack_canonical(a, status); 1173 FloatParts pb = float16_unpack_canonical(b, status); 1174 FloatParts pr = div_floats(pa, pb, status); 1175 1176 return float16_round_pack_canonical(pr, status); 1177 } 1178 1179 float32 float32_div(float32 a, float32 b, float_status *status) 1180 { 1181 FloatParts pa = float32_unpack_canonical(a, status); 1182 FloatParts pb = float32_unpack_canonical(b, status); 1183 FloatParts pr = div_floats(pa, pb, status); 1184 1185 return float32_round_pack_canonical(pr, status); 1186 } 1187 1188 float64 float64_div(float64 a, float64 b, float_status *status) 1189 { 1190 FloatParts pa = float64_unpack_canonical(a, status); 1191 FloatParts pb = float64_unpack_canonical(b, status); 1192 FloatParts pr = div_floats(pa, pb, status); 1193 1194 return float64_round_pack_canonical(pr, status); 1195 } 1196 1197 /* 1198 * Rounds the floating-point value `a' to an integer, and returns the 1199 * result as a floating-point value. The operation is performed 1200 * according to the IEC/IEEE Standard for Binary Floating-Point 1201 * Arithmetic. 1202 */ 1203 1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s) 1205 { 1206 if (is_nan(a.cls)) { 1207 return return_nan(a, s); 1208 } 1209 1210 switch (a.cls) { 1211 case float_class_zero: 1212 case float_class_inf: 1213 case float_class_qnan: 1214 /* already "integral" */ 1215 break; 1216 case float_class_normal: 1217 if (a.exp >= DECOMPOSED_BINARY_POINT) { 1218 /* already integral */ 1219 break; 1220 } 1221 if (a.exp < 0) { 1222 bool one; 1223 /* all fractional */ 1224 s->float_exception_flags |= float_flag_inexact; 1225 switch (rounding_mode) { 1226 case float_round_nearest_even: 1227 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT; 1228 break; 1229 case float_round_ties_away: 1230 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT; 1231 break; 1232 case float_round_to_zero: 1233 one = false; 1234 break; 1235 case float_round_up: 1236 one = !a.sign; 1237 break; 1238 case float_round_down: 1239 one = a.sign; 1240 break; 1241 default: 1242 g_assert_not_reached(); 1243 } 1244 1245 if (one) { 1246 a.frac = DECOMPOSED_IMPLICIT_BIT; 1247 a.exp = 0; 1248 } else { 1249 a.cls = float_class_zero; 1250 } 1251 } else { 1252 uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp; 1253 uint64_t frac_lsbm1 = frac_lsb >> 1; 1254 uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb; 1255 uint64_t rnd_mask = rnd_even_mask >> 1; 1256 uint64_t inc; 1257 1258 switch (rounding_mode) { 1259 case float_round_nearest_even: 1260 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0); 1261 break; 1262 case float_round_ties_away: 1263 inc = frac_lsbm1; 1264 break; 1265 case float_round_to_zero: 1266 inc = 0; 1267 break; 1268 case float_round_up: 1269 inc = a.sign ? 0 : rnd_mask; 1270 break; 1271 case float_round_down: 1272 inc = a.sign ? rnd_mask : 0; 1273 break; 1274 default: 1275 g_assert_not_reached(); 1276 } 1277 1278 if (a.frac & rnd_mask) { 1279 s->float_exception_flags |= float_flag_inexact; 1280 a.frac += inc; 1281 a.frac &= ~rnd_mask; 1282 if (a.frac & DECOMPOSED_OVERFLOW_BIT) { 1283 a.frac >>= 1; 1284 a.exp++; 1285 } 1286 } 1287 } 1288 break; 1289 default: 1290 g_assert_not_reached(); 1291 } 1292 return a; 1293 } 1294 1295 float16 float16_round_to_int(float16 a, float_status *s) 1296 { 1297 FloatParts pa = float16_unpack_canonical(a, s); 1298 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1299 return float16_round_pack_canonical(pr, s); 1300 } 1301 1302 float32 float32_round_to_int(float32 a, float_status *s) 1303 { 1304 FloatParts pa = float32_unpack_canonical(a, s); 1305 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1306 return float32_round_pack_canonical(pr, s); 1307 } 1308 1309 float64 float64_round_to_int(float64 a, float_status *s) 1310 { 1311 FloatParts pa = float64_unpack_canonical(a, s); 1312 FloatParts pr = round_to_int(pa, s->float_rounding_mode, s); 1313 return float64_round_pack_canonical(pr, s); 1314 } 1315 1316 float64 float64_trunc_to_int(float64 a, float_status *s) 1317 { 1318 FloatParts pa = float64_unpack_canonical(a, s); 1319 FloatParts pr = round_to_int(pa, float_round_to_zero, s); 1320 return float64_round_pack_canonical(pr, s); 1321 } 1322 1323 /* 1324 * Returns the result of converting the floating-point value `a' to 1325 * the two's complement integer format. The conversion is performed 1326 * according to the IEC/IEEE Standard for Binary Floating-Point 1327 * Arithmetic---which means in particular that the conversion is 1328 * rounded according to the current rounding mode. If `a' is a NaN, 1329 * the largest positive integer is returned. Otherwise, if the 1330 * conversion overflows, the largest integer with the same sign as `a' 1331 * is returned. 1332 */ 1333 1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode, 1335 int64_t min, int64_t max, 1336 float_status *s) 1337 { 1338 uint64_t r; 1339 int orig_flags = get_float_exception_flags(s); 1340 FloatParts p = round_to_int(in, rmode, s); 1341 1342 switch (p.cls) { 1343 case float_class_snan: 1344 case float_class_qnan: 1345 return max; 1346 case float_class_inf: 1347 return p.sign ? min : max; 1348 case float_class_zero: 1349 return 0; 1350 case float_class_normal: 1351 if (p.exp < DECOMPOSED_BINARY_POINT) { 1352 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1353 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1354 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1355 } else { 1356 r = UINT64_MAX; 1357 } 1358 if (p.sign) { 1359 if (r < -(uint64_t) min) { 1360 return -r; 1361 } else { 1362 s->float_exception_flags = orig_flags | float_flag_invalid; 1363 return min; 1364 } 1365 } else { 1366 if (r < max) { 1367 return r; 1368 } else { 1369 s->float_exception_flags = orig_flags | float_flag_invalid; 1370 return max; 1371 } 1372 } 1373 default: 1374 g_assert_not_reached(); 1375 } 1376 } 1377 1378 #define FLOAT_TO_INT(fsz, isz) \ 1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a, \ 1380 float_status *s) \ 1381 { \ 1382 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1383 return round_to_int_and_pack(p, s->float_rounding_mode, \ 1384 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1385 s); \ 1386 } \ 1387 \ 1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero \ 1389 (float ## fsz a, float_status *s) \ 1390 { \ 1391 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1392 return round_to_int_and_pack(p, float_round_to_zero, \ 1393 INT ## isz ## _MIN, INT ## isz ## _MAX,\ 1394 s); \ 1395 } 1396 1397 FLOAT_TO_INT(16, 16) 1398 FLOAT_TO_INT(16, 32) 1399 FLOAT_TO_INT(16, 64) 1400 1401 FLOAT_TO_INT(32, 16) 1402 FLOAT_TO_INT(32, 32) 1403 FLOAT_TO_INT(32, 64) 1404 1405 FLOAT_TO_INT(64, 16) 1406 FLOAT_TO_INT(64, 32) 1407 FLOAT_TO_INT(64, 64) 1408 1409 #undef FLOAT_TO_INT 1410 1411 /* 1412 * Returns the result of converting the floating-point value `a' to 1413 * the unsigned integer format. The conversion is performed according 1414 * to the IEC/IEEE Standard for Binary Floating-Point 1415 * Arithmetic---which means in particular that the conversion is 1416 * rounded according to the current rounding mode. If `a' is a NaN, 1417 * the largest unsigned integer is returned. Otherwise, if the 1418 * conversion overflows, the largest unsigned integer is returned. If 1419 * the 'a' is negative, the result is rounded and zero is returned; 1420 * values that do not round to zero will raise the inexact exception 1421 * flag. 1422 */ 1423 1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max, 1425 float_status *s) 1426 { 1427 int orig_flags = get_float_exception_flags(s); 1428 FloatParts p = round_to_int(in, rmode, s); 1429 1430 switch (p.cls) { 1431 case float_class_snan: 1432 case float_class_qnan: 1433 s->float_exception_flags = orig_flags | float_flag_invalid; 1434 return max; 1435 case float_class_inf: 1436 return p.sign ? 0 : max; 1437 case float_class_zero: 1438 return 0; 1439 case float_class_normal: 1440 { 1441 uint64_t r; 1442 if (p.sign) { 1443 s->float_exception_flags = orig_flags | float_flag_invalid; 1444 return 0; 1445 } 1446 1447 if (p.exp < DECOMPOSED_BINARY_POINT) { 1448 r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp); 1449 } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) { 1450 r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT); 1451 } else { 1452 s->float_exception_flags = orig_flags | float_flag_invalid; 1453 return max; 1454 } 1455 1456 /* For uint64 this will never trip, but if p.exp is too large 1457 * to shift a decomposed fraction we shall have exited via the 1458 * 3rd leg above. 1459 */ 1460 if (r > max) { 1461 s->float_exception_flags = orig_flags | float_flag_invalid; 1462 return max; 1463 } else { 1464 return r; 1465 } 1466 } 1467 default: 1468 g_assert_not_reached(); 1469 } 1470 } 1471 1472 #define FLOAT_TO_UINT(fsz, isz) \ 1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a, \ 1474 float_status *s) \ 1475 { \ 1476 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1477 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1478 UINT ## isz ## _MAX, s); \ 1479 } \ 1480 \ 1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero \ 1482 (float ## fsz a, float_status *s) \ 1483 { \ 1484 FloatParts p = float ## fsz ## _unpack_canonical(a, s); \ 1485 return round_to_uint_and_pack(p, s->float_rounding_mode, \ 1486 UINT ## isz ## _MAX, s); \ 1487 } 1488 1489 FLOAT_TO_UINT(16, 16) 1490 FLOAT_TO_UINT(16, 32) 1491 FLOAT_TO_UINT(16, 64) 1492 1493 FLOAT_TO_UINT(32, 16) 1494 FLOAT_TO_UINT(32, 32) 1495 FLOAT_TO_UINT(32, 64) 1496 1497 FLOAT_TO_UINT(64, 16) 1498 FLOAT_TO_UINT(64, 32) 1499 FLOAT_TO_UINT(64, 64) 1500 1501 #undef FLOAT_TO_UINT 1502 1503 /* 1504 * Integer to float conversions 1505 * 1506 * Returns the result of converting the two's complement integer `a' 1507 * to the floating-point format. The conversion is performed according 1508 * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1509 */ 1510 1511 static FloatParts int_to_float(int64_t a, float_status *status) 1512 { 1513 FloatParts r; 1514 if (a == 0) { 1515 r.cls = float_class_zero; 1516 r.sign = false; 1517 } else if (a == (1ULL << 63)) { 1518 r.cls = float_class_normal; 1519 r.sign = true; 1520 r.frac = DECOMPOSED_IMPLICIT_BIT; 1521 r.exp = 63; 1522 } else { 1523 uint64_t f; 1524 if (a < 0) { 1525 f = -a; 1526 r.sign = true; 1527 } else { 1528 f = a; 1529 r.sign = false; 1530 } 1531 int shift = clz64(f) - 1; 1532 r.cls = float_class_normal; 1533 r.exp = (DECOMPOSED_BINARY_POINT - shift); 1534 r.frac = f << shift; 1535 } 1536 1537 return r; 1538 } 1539 1540 float16 int64_to_float16(int64_t a, float_status *status) 1541 { 1542 FloatParts pa = int_to_float(a, status); 1543 return float16_round_pack_canonical(pa, status); 1544 } 1545 1546 float16 int32_to_float16(int32_t a, float_status *status) 1547 { 1548 return int64_to_float16(a, status); 1549 } 1550 1551 float16 int16_to_float16(int16_t a, float_status *status) 1552 { 1553 return int64_to_float16(a, status); 1554 } 1555 1556 float32 int64_to_float32(int64_t a, float_status *status) 1557 { 1558 FloatParts pa = int_to_float(a, status); 1559 return float32_round_pack_canonical(pa, status); 1560 } 1561 1562 float32 int32_to_float32(int32_t a, float_status *status) 1563 { 1564 return int64_to_float32(a, status); 1565 } 1566 1567 float32 int16_to_float32(int16_t a, float_status *status) 1568 { 1569 return int64_to_float32(a, status); 1570 } 1571 1572 float64 int64_to_float64(int64_t a, float_status *status) 1573 { 1574 FloatParts pa = int_to_float(a, status); 1575 return float64_round_pack_canonical(pa, status); 1576 } 1577 1578 float64 int32_to_float64(int32_t a, float_status *status) 1579 { 1580 return int64_to_float64(a, status); 1581 } 1582 1583 float64 int16_to_float64(int16_t a, float_status *status) 1584 { 1585 return int64_to_float64(a, status); 1586 } 1587 1588 1589 /* 1590 * Unsigned Integer to float conversions 1591 * 1592 * Returns the result of converting the unsigned integer `a' to the 1593 * floating-point format. The conversion is performed according to the 1594 * IEC/IEEE Standard for Binary Floating-Point Arithmetic. 1595 */ 1596 1597 static FloatParts uint_to_float(uint64_t a, float_status *status) 1598 { 1599 FloatParts r = { .sign = false}; 1600 1601 if (a == 0) { 1602 r.cls = float_class_zero; 1603 } else { 1604 int spare_bits = clz64(a) - 1; 1605 r.cls = float_class_normal; 1606 r.exp = DECOMPOSED_BINARY_POINT - spare_bits; 1607 if (spare_bits < 0) { 1608 shift64RightJamming(a, -spare_bits, &a); 1609 r.frac = a; 1610 } else { 1611 r.frac = a << spare_bits; 1612 } 1613 } 1614 1615 return r; 1616 } 1617 1618 float16 uint64_to_float16(uint64_t a, float_status *status) 1619 { 1620 FloatParts pa = uint_to_float(a, status); 1621 return float16_round_pack_canonical(pa, status); 1622 } 1623 1624 float16 uint32_to_float16(uint32_t a, float_status *status) 1625 { 1626 return uint64_to_float16(a, status); 1627 } 1628 1629 float16 uint16_to_float16(uint16_t a, float_status *status) 1630 { 1631 return uint64_to_float16(a, status); 1632 } 1633 1634 float32 uint64_to_float32(uint64_t a, float_status *status) 1635 { 1636 FloatParts pa = uint_to_float(a, status); 1637 return float32_round_pack_canonical(pa, status); 1638 } 1639 1640 float32 uint32_to_float32(uint32_t a, float_status *status) 1641 { 1642 return uint64_to_float32(a, status); 1643 } 1644 1645 float32 uint16_to_float32(uint16_t a, float_status *status) 1646 { 1647 return uint64_to_float32(a, status); 1648 } 1649 1650 float64 uint64_to_float64(uint64_t a, float_status *status) 1651 { 1652 FloatParts pa = uint_to_float(a, status); 1653 return float64_round_pack_canonical(pa, status); 1654 } 1655 1656 float64 uint32_to_float64(uint32_t a, float_status *status) 1657 { 1658 return uint64_to_float64(a, status); 1659 } 1660 1661 float64 uint16_to_float64(uint16_t a, float_status *status) 1662 { 1663 return uint64_to_float64(a, status); 1664 } 1665 1666 /* Float Min/Max */ 1667 /* min() and max() functions. These can't be implemented as 1668 * 'compare and pick one input' because that would mishandle 1669 * NaNs and +0 vs -0. 1670 * 1671 * minnum() and maxnum() functions. These are similar to the min() 1672 * and max() functions but if one of the arguments is a QNaN and 1673 * the other is numerical then the numerical argument is returned. 1674 * SNaNs will get quietened before being returned. 1675 * minnum() and maxnum correspond to the IEEE 754-2008 minNum() 1676 * and maxNum() operations. min() and max() are the typical min/max 1677 * semantics provided by many CPUs which predate that specification. 1678 * 1679 * minnummag() and maxnummag() functions correspond to minNumMag() 1680 * and minNumMag() from the IEEE-754 2008. 1681 */ 1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin, 1683 bool ieee, bool ismag, float_status *s) 1684 { 1685 if (unlikely(is_nan(a.cls) || is_nan(b.cls))) { 1686 if (ieee) { 1687 /* Takes two floating-point values `a' and `b', one of 1688 * which is a NaN, and returns the appropriate NaN 1689 * result. If either `a' or `b' is a signaling NaN, 1690 * the invalid exception is raised. 1691 */ 1692 if (is_snan(a.cls) || is_snan(b.cls)) { 1693 return pick_nan(a, b, s); 1694 } else if (is_nan(a.cls) && !is_nan(b.cls)) { 1695 return b; 1696 } else if (is_nan(b.cls) && !is_nan(a.cls)) { 1697 return a; 1698 } 1699 } 1700 return pick_nan(a, b, s); 1701 } else { 1702 int a_exp, b_exp; 1703 bool a_sign, b_sign; 1704 1705 switch (a.cls) { 1706 case float_class_normal: 1707 a_exp = a.exp; 1708 break; 1709 case float_class_inf: 1710 a_exp = INT_MAX; 1711 break; 1712 case float_class_zero: 1713 a_exp = INT_MIN; 1714 break; 1715 default: 1716 g_assert_not_reached(); 1717 break; 1718 } 1719 switch (b.cls) { 1720 case float_class_normal: 1721 b_exp = b.exp; 1722 break; 1723 case float_class_inf: 1724 b_exp = INT_MAX; 1725 break; 1726 case float_class_zero: 1727 b_exp = INT_MIN; 1728 break; 1729 default: 1730 g_assert_not_reached(); 1731 break; 1732 } 1733 1734 a_sign = a.sign; 1735 b_sign = b.sign; 1736 if (ismag) { 1737 a_sign = b_sign = 0; 1738 } 1739 1740 if (a_sign == b_sign) { 1741 bool a_less = a_exp < b_exp; 1742 if (a_exp == b_exp) { 1743 a_less = a.frac < b.frac; 1744 } 1745 return a_sign ^ a_less ^ ismin ? b : a; 1746 } else { 1747 return a_sign ^ ismin ? b : a; 1748 } 1749 } 1750 } 1751 1752 #define MINMAX(sz, name, ismin, isiee, ismag) \ 1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b, \ 1754 float_status *s) \ 1755 { \ 1756 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1757 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1758 FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s); \ 1759 \ 1760 return float ## sz ## _round_pack_canonical(pr, s); \ 1761 } 1762 1763 MINMAX(16, min, true, false, false) 1764 MINMAX(16, minnum, true, true, false) 1765 MINMAX(16, minnummag, true, true, true) 1766 MINMAX(16, max, false, false, false) 1767 MINMAX(16, maxnum, false, true, false) 1768 MINMAX(16, maxnummag, false, true, true) 1769 1770 MINMAX(32, min, true, false, false) 1771 MINMAX(32, minnum, true, true, false) 1772 MINMAX(32, minnummag, true, true, true) 1773 MINMAX(32, max, false, false, false) 1774 MINMAX(32, maxnum, false, true, false) 1775 MINMAX(32, maxnummag, false, true, true) 1776 1777 MINMAX(64, min, true, false, false) 1778 MINMAX(64, minnum, true, true, false) 1779 MINMAX(64, minnummag, true, true, true) 1780 MINMAX(64, max, false, false, false) 1781 MINMAX(64, maxnum, false, true, false) 1782 MINMAX(64, maxnummag, false, true, true) 1783 1784 #undef MINMAX 1785 1786 /* Floating point compare */ 1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, 1788 float_status *s) 1789 { 1790 if (is_nan(a.cls) || is_nan(b.cls)) { 1791 if (!is_quiet || 1792 a.cls == float_class_snan || 1793 b.cls == float_class_snan) { 1794 s->float_exception_flags |= float_flag_invalid; 1795 } 1796 return float_relation_unordered; 1797 } 1798 1799 if (a.cls == float_class_zero) { 1800 if (b.cls == float_class_zero) { 1801 return float_relation_equal; 1802 } 1803 return b.sign ? float_relation_greater : float_relation_less; 1804 } else if (b.cls == float_class_zero) { 1805 return a.sign ? float_relation_less : float_relation_greater; 1806 } 1807 1808 /* The only really important thing about infinity is its sign. If 1809 * both are infinities the sign marks the smallest of the two. 1810 */ 1811 if (a.cls == float_class_inf) { 1812 if ((b.cls == float_class_inf) && (a.sign == b.sign)) { 1813 return float_relation_equal; 1814 } 1815 return a.sign ? float_relation_less : float_relation_greater; 1816 } else if (b.cls == float_class_inf) { 1817 return b.sign ? float_relation_greater : float_relation_less; 1818 } 1819 1820 if (a.sign != b.sign) { 1821 return a.sign ? float_relation_less : float_relation_greater; 1822 } 1823 1824 if (a.exp == b.exp) { 1825 if (a.frac == b.frac) { 1826 return float_relation_equal; 1827 } 1828 if (a.sign) { 1829 return a.frac > b.frac ? 1830 float_relation_less : float_relation_greater; 1831 } else { 1832 return a.frac > b.frac ? 1833 float_relation_greater : float_relation_less; 1834 } 1835 } else { 1836 if (a.sign) { 1837 return a.exp > b.exp ? float_relation_less : float_relation_greater; 1838 } else { 1839 return a.exp > b.exp ? float_relation_greater : float_relation_less; 1840 } 1841 } 1842 } 1843 1844 #define COMPARE(sz) \ 1845 int float ## sz ## _compare(float ## sz a, float ## sz b, \ 1846 float_status *s) \ 1847 { \ 1848 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1849 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1850 return compare_floats(pa, pb, false, s); \ 1851 } \ 1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ 1853 float_status *s) \ 1854 { \ 1855 FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ 1856 FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ 1857 return compare_floats(pa, pb, true, s); \ 1858 } 1859 1860 COMPARE(16) 1861 COMPARE(32) 1862 COMPARE(64) 1863 1864 #undef COMPARE 1865 1866 /* Multiply A by 2 raised to the power N. */ 1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s) 1868 { 1869 if (unlikely(is_nan(a.cls))) { 1870 return return_nan(a, s); 1871 } 1872 if (a.cls == float_class_normal) { 1873 a.exp += n; 1874 } 1875 return a; 1876 } 1877 1878 float16 float16_scalbn(float16 a, int n, float_status *status) 1879 { 1880 FloatParts pa = float16_unpack_canonical(a, status); 1881 FloatParts pr = scalbn_decomposed(pa, n, status); 1882 return float16_round_pack_canonical(pr, status); 1883 } 1884 1885 float32 float32_scalbn(float32 a, int n, float_status *status) 1886 { 1887 FloatParts pa = float32_unpack_canonical(a, status); 1888 FloatParts pr = scalbn_decomposed(pa, n, status); 1889 return float32_round_pack_canonical(pr, status); 1890 } 1891 1892 float64 float64_scalbn(float64 a, int n, float_status *status) 1893 { 1894 FloatParts pa = float64_unpack_canonical(a, status); 1895 FloatParts pr = scalbn_decomposed(pa, n, status); 1896 return float64_round_pack_canonical(pr, status); 1897 } 1898 1899 /* 1900 * Square Root 1901 * 1902 * The old softfloat code did an approximation step before zeroing in 1903 * on the final result. However for simpleness we just compute the 1904 * square root by iterating down from the implicit bit to enough extra 1905 * bits to ensure we get a correctly rounded result. 1906 * 1907 * This does mean however the calculation is slower than before, 1908 * especially for 64 bit floats. 1909 */ 1910 1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) 1912 { 1913 uint64_t a_frac, r_frac, s_frac; 1914 int bit, last_bit; 1915 1916 if (is_nan(a.cls)) { 1917 return return_nan(a, s); 1918 } 1919 if (a.cls == float_class_zero) { 1920 return a; /* sqrt(+-0) = +-0 */ 1921 } 1922 if (a.sign) { 1923 s->float_exception_flags |= float_flag_invalid; 1924 a.cls = float_class_dnan; 1925 return a; 1926 } 1927 if (a.cls == float_class_inf) { 1928 return a; /* sqrt(+inf) = +inf */ 1929 } 1930 1931 assert(a.cls == float_class_normal); 1932 1933 /* We need two overflow bits at the top. Adding room for that is a 1934 * right shift. If the exponent is odd, we can discard the low bit 1935 * by multiplying the fraction by 2; that's a left shift. Combine 1936 * those and we shift right if the exponent is even. 1937 */ 1938 a_frac = a.frac; 1939 if (!(a.exp & 1)) { 1940 a_frac >>= 1; 1941 } 1942 a.exp >>= 1; 1943 1944 /* Bit-by-bit computation of sqrt. */ 1945 r_frac = 0; 1946 s_frac = 0; 1947 1948 /* Iterate from implicit bit down to the 3 extra bits to compute a 1949 * properly rounded result. Remember we've inserted one more bit 1950 * at the top, so these positions are one less. 1951 */ 1952 bit = DECOMPOSED_BINARY_POINT - 1; 1953 last_bit = MAX(p->frac_shift - 4, 0); 1954 do { 1955 uint64_t q = 1ULL << bit; 1956 uint64_t t_frac = s_frac + q; 1957 if (t_frac <= a_frac) { 1958 s_frac = t_frac + q; 1959 a_frac -= t_frac; 1960 r_frac += q; 1961 } 1962 a_frac <<= 1; 1963 } while (--bit >= last_bit); 1964 1965 /* Undo the right shift done above. If there is any remaining 1966 * fraction, the result is inexact. Set the sticky bit. 1967 */ 1968 a.frac = (r_frac << 1) + (a_frac != 0); 1969 1970 return a; 1971 } 1972 1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status) 1974 { 1975 FloatParts pa = float16_unpack_canonical(a, status); 1976 FloatParts pr = sqrt_float(pa, status, &float16_params); 1977 return float16_round_pack_canonical(pr, status); 1978 } 1979 1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status) 1981 { 1982 FloatParts pa = float32_unpack_canonical(a, status); 1983 FloatParts pr = sqrt_float(pa, status, &float32_params); 1984 return float32_round_pack_canonical(pr, status); 1985 } 1986 1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status) 1988 { 1989 FloatParts pa = float64_unpack_canonical(a, status); 1990 FloatParts pr = sqrt_float(pa, status, &float64_params); 1991 return float64_round_pack_canonical(pr, status); 1992 } 1993 1994 1995 /*---------------------------------------------------------------------------- 1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the 1998 | input. If `zSign' is 1, the input is negated before being converted to an 1999 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input 2000 | is simply rounded to an integer, with the inexact exception raised if the 2001 | input cannot be represented exactly as an integer. However, if the fixed- 2002 | point input is too large, the invalid exception is raised and the largest 2003 | positive or negative integer is returned. 2004 *----------------------------------------------------------------------------*/ 2005 2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status) 2007 { 2008 int8_t roundingMode; 2009 flag roundNearestEven; 2010 int8_t roundIncrement, roundBits; 2011 int32_t z; 2012 2013 roundingMode = status->float_rounding_mode; 2014 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2015 switch (roundingMode) { 2016 case float_round_nearest_even: 2017 case float_round_ties_away: 2018 roundIncrement = 0x40; 2019 break; 2020 case float_round_to_zero: 2021 roundIncrement = 0; 2022 break; 2023 case float_round_up: 2024 roundIncrement = zSign ? 0 : 0x7f; 2025 break; 2026 case float_round_down: 2027 roundIncrement = zSign ? 0x7f : 0; 2028 break; 2029 default: 2030 abort(); 2031 } 2032 roundBits = absZ & 0x7F; 2033 absZ = ( absZ + roundIncrement )>>7; 2034 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2035 z = absZ; 2036 if ( zSign ) z = - z; 2037 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { 2038 float_raise(float_flag_invalid, status); 2039 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 2040 } 2041 if (roundBits) { 2042 status->float_exception_flags |= float_flag_inexact; 2043 } 2044 return z; 2045 2046 } 2047 2048 /*---------------------------------------------------------------------------- 2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2050 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2051 | and returns the properly rounded 64-bit integer corresponding to the input. 2052 | If `zSign' is 1, the input is negated before being converted to an integer. 2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with 2054 | the inexact exception raised if the input cannot be represented exactly as 2055 | an integer. However, if the fixed-point input is too large, the invalid 2056 | exception is raised and the largest positive or negative integer is 2057 | returned. 2058 *----------------------------------------------------------------------------*/ 2059 2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1, 2061 float_status *status) 2062 { 2063 int8_t roundingMode; 2064 flag roundNearestEven, increment; 2065 int64_t z; 2066 2067 roundingMode = status->float_rounding_mode; 2068 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2069 switch (roundingMode) { 2070 case float_round_nearest_even: 2071 case float_round_ties_away: 2072 increment = ((int64_t) absZ1 < 0); 2073 break; 2074 case float_round_to_zero: 2075 increment = 0; 2076 break; 2077 case float_round_up: 2078 increment = !zSign && absZ1; 2079 break; 2080 case float_round_down: 2081 increment = zSign && absZ1; 2082 break; 2083 default: 2084 abort(); 2085 } 2086 if ( increment ) { 2087 ++absZ0; 2088 if ( absZ0 == 0 ) goto overflow; 2089 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven ); 2090 } 2091 z = absZ0; 2092 if ( zSign ) z = - z; 2093 if ( z && ( ( z < 0 ) ^ zSign ) ) { 2094 overflow: 2095 float_raise(float_flag_invalid, status); 2096 return 2097 zSign ? (int64_t) LIT64( 0x8000000000000000 ) 2098 : LIT64( 0x7FFFFFFFFFFFFFFF ); 2099 } 2100 if (absZ1) { 2101 status->float_exception_flags |= float_flag_inexact; 2102 } 2103 return z; 2104 2105 } 2106 2107 /*---------------------------------------------------------------------------- 2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and 2109 | `absZ1', with binary point between bits 63 and 64 (between the input words), 2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the 2111 | input. Ordinarily, the fixed-point input is simply rounded to an integer, 2112 | with the inexact exception raised if the input cannot be represented exactly 2113 | as an integer. However, if the fixed-point input is too large, the invalid 2114 | exception is raised and the largest unsigned integer is returned. 2115 *----------------------------------------------------------------------------*/ 2116 2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0, 2118 uint64_t absZ1, float_status *status) 2119 { 2120 int8_t roundingMode; 2121 flag roundNearestEven, increment; 2122 2123 roundingMode = status->float_rounding_mode; 2124 roundNearestEven = (roundingMode == float_round_nearest_even); 2125 switch (roundingMode) { 2126 case float_round_nearest_even: 2127 case float_round_ties_away: 2128 increment = ((int64_t)absZ1 < 0); 2129 break; 2130 case float_round_to_zero: 2131 increment = 0; 2132 break; 2133 case float_round_up: 2134 increment = !zSign && absZ1; 2135 break; 2136 case float_round_down: 2137 increment = zSign && absZ1; 2138 break; 2139 default: 2140 abort(); 2141 } 2142 if (increment) { 2143 ++absZ0; 2144 if (absZ0 == 0) { 2145 float_raise(float_flag_invalid, status); 2146 return LIT64(0xFFFFFFFFFFFFFFFF); 2147 } 2148 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven); 2149 } 2150 2151 if (zSign && absZ0) { 2152 float_raise(float_flag_invalid, status); 2153 return 0; 2154 } 2155 2156 if (absZ1) { 2157 status->float_exception_flags |= float_flag_inexact; 2158 } 2159 return absZ0; 2160 } 2161 2162 /*---------------------------------------------------------------------------- 2163 | If `a' is denormal and we are in flush-to-zero mode then set the 2164 | input-denormal exception and return zero. Otherwise just return the value. 2165 *----------------------------------------------------------------------------*/ 2166 float32 float32_squash_input_denormal(float32 a, float_status *status) 2167 { 2168 if (status->flush_inputs_to_zero) { 2169 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { 2170 float_raise(float_flag_input_denormal, status); 2171 return make_float32(float32_val(a) & 0x80000000); 2172 } 2173 } 2174 return a; 2175 } 2176 2177 /*---------------------------------------------------------------------------- 2178 | Normalizes the subnormal single-precision floating-point value represented 2179 | by the denormalized significand `aSig'. The normalized exponent and 2180 | significand are stored at the locations pointed to by `zExpPtr' and 2181 | `zSigPtr', respectively. 2182 *----------------------------------------------------------------------------*/ 2183 2184 static void 2185 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr) 2186 { 2187 int8_t shiftCount; 2188 2189 shiftCount = countLeadingZeros32( aSig ) - 8; 2190 *zSigPtr = aSig<<shiftCount; 2191 *zExpPtr = 1 - shiftCount; 2192 2193 } 2194 2195 /*---------------------------------------------------------------------------- 2196 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2197 | and significand `zSig', and returns the proper single-precision floating- 2198 | point value corresponding to the abstract input. Ordinarily, the abstract 2199 | value is simply rounded and packed into the single-precision format, with 2200 | the inexact exception raised if the abstract input cannot be represented 2201 | exactly. However, if the abstract value is too large, the overflow and 2202 | inexact exceptions are raised and an infinity or maximal finite value is 2203 | returned. If the abstract value is too small, the input value is rounded to 2204 | a subnormal number, and the underflow and inexact exceptions are raised if 2205 | the abstract input cannot be represented exactly as a subnormal single- 2206 | precision floating-point number. 2207 | The input significand `zSig' has its binary point between bits 30 2208 | and 29, which is 7 bits to the left of the usual location. This shifted 2209 | significand must be normalized or smaller. If `zSig' is not normalized, 2210 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2211 | and it must not require rounding. In the usual case that `zSig' is 2212 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2213 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2214 | Binary Floating-Point Arithmetic. 2215 *----------------------------------------------------------------------------*/ 2216 2217 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2218 float_status *status) 2219 { 2220 int8_t roundingMode; 2221 flag roundNearestEven; 2222 int8_t roundIncrement, roundBits; 2223 flag isTiny; 2224 2225 roundingMode = status->float_rounding_mode; 2226 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2227 switch (roundingMode) { 2228 case float_round_nearest_even: 2229 case float_round_ties_away: 2230 roundIncrement = 0x40; 2231 break; 2232 case float_round_to_zero: 2233 roundIncrement = 0; 2234 break; 2235 case float_round_up: 2236 roundIncrement = zSign ? 0 : 0x7f; 2237 break; 2238 case float_round_down: 2239 roundIncrement = zSign ? 0x7f : 0; 2240 break; 2241 default: 2242 abort(); 2243 break; 2244 } 2245 roundBits = zSig & 0x7F; 2246 if ( 0xFD <= (uint16_t) zExp ) { 2247 if ( ( 0xFD < zExp ) 2248 || ( ( zExp == 0xFD ) 2249 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) ) 2250 ) { 2251 float_raise(float_flag_overflow | float_flag_inexact, status); 2252 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); 2253 } 2254 if ( zExp < 0 ) { 2255 if (status->flush_to_zero) { 2256 float_raise(float_flag_output_denormal, status); 2257 return packFloat32(zSign, 0, 0); 2258 } 2259 isTiny = 2260 (status->float_detect_tininess 2261 == float_tininess_before_rounding) 2262 || ( zExp < -1 ) 2263 || ( zSig + roundIncrement < 0x80000000 ); 2264 shift32RightJamming( zSig, - zExp, &zSig ); 2265 zExp = 0; 2266 roundBits = zSig & 0x7F; 2267 if (isTiny && roundBits) { 2268 float_raise(float_flag_underflow, status); 2269 } 2270 } 2271 } 2272 if (roundBits) { 2273 status->float_exception_flags |= float_flag_inexact; 2274 } 2275 zSig = ( zSig + roundIncrement )>>7; 2276 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); 2277 if ( zSig == 0 ) zExp = 0; 2278 return packFloat32( zSign, zExp, zSig ); 2279 2280 } 2281 2282 /*---------------------------------------------------------------------------- 2283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2284 | and significand `zSig', and returns the proper single-precision floating- 2285 | point value corresponding to the abstract input. This routine is just like 2286 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. 2287 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2288 | floating-point exponent. 2289 *----------------------------------------------------------------------------*/ 2290 2291 static float32 2292 normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig, 2293 float_status *status) 2294 { 2295 int8_t shiftCount; 2296 2297 shiftCount = countLeadingZeros32( zSig ) - 1; 2298 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount, 2299 status); 2300 2301 } 2302 2303 /*---------------------------------------------------------------------------- 2304 | If `a' is denormal and we are in flush-to-zero mode then set the 2305 | input-denormal exception and return zero. Otherwise just return the value. 2306 *----------------------------------------------------------------------------*/ 2307 float64 float64_squash_input_denormal(float64 a, float_status *status) 2308 { 2309 if (status->flush_inputs_to_zero) { 2310 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { 2311 float_raise(float_flag_input_denormal, status); 2312 return make_float64(float64_val(a) & (1ULL << 63)); 2313 } 2314 } 2315 return a; 2316 } 2317 2318 /*---------------------------------------------------------------------------- 2319 | Normalizes the subnormal double-precision floating-point value represented 2320 | by the denormalized significand `aSig'. The normalized exponent and 2321 | significand are stored at the locations pointed to by `zExpPtr' and 2322 | `zSigPtr', respectively. 2323 *----------------------------------------------------------------------------*/ 2324 2325 static void 2326 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr) 2327 { 2328 int8_t shiftCount; 2329 2330 shiftCount = countLeadingZeros64( aSig ) - 11; 2331 *zSigPtr = aSig<<shiftCount; 2332 *zExpPtr = 1 - shiftCount; 2333 2334 } 2335 2336 /*---------------------------------------------------------------------------- 2337 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 2338 | double-precision floating-point value, returning the result. After being 2339 | shifted into the proper positions, the three fields are simply added 2340 | together to form the result. This means that any integer portion of `zSig' 2341 | will be added into the exponent. Since a properly normalized significand 2342 | will have an integer portion equal to 1, the `zExp' input should be 1 less 2343 | than the desired result exponent whenever `zSig' is a complete, normalized 2344 | significand. 2345 *----------------------------------------------------------------------------*/ 2346 2347 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig) 2348 { 2349 2350 return make_float64( 2351 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig); 2352 2353 } 2354 2355 /*---------------------------------------------------------------------------- 2356 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2357 | and significand `zSig', and returns the proper double-precision floating- 2358 | point value corresponding to the abstract input. Ordinarily, the abstract 2359 | value is simply rounded and packed into the double-precision format, with 2360 | the inexact exception raised if the abstract input cannot be represented 2361 | exactly. However, if the abstract value is too large, the overflow and 2362 | inexact exceptions are raised and an infinity or maximal finite value is 2363 | returned. If the abstract value is too small, the input value is rounded to 2364 | a subnormal number, and the underflow and inexact exceptions are raised if 2365 | the abstract input cannot be represented exactly as a subnormal double- 2366 | precision floating-point number. 2367 | The input significand `zSig' has its binary point between bits 62 2368 | and 61, which is 10 bits to the left of the usual location. This shifted 2369 | significand must be normalized or smaller. If `zSig' is not normalized, 2370 | `zExp' must be 0; in that case, the result returned is a subnormal number, 2371 | and it must not require rounding. In the usual case that `zSig' is 2372 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 2373 | The handling of underflow and overflow follows the IEC/IEEE Standard for 2374 | Binary Floating-Point Arithmetic. 2375 *----------------------------------------------------------------------------*/ 2376 2377 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2378 float_status *status) 2379 { 2380 int8_t roundingMode; 2381 flag roundNearestEven; 2382 int roundIncrement, roundBits; 2383 flag isTiny; 2384 2385 roundingMode = status->float_rounding_mode; 2386 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2387 switch (roundingMode) { 2388 case float_round_nearest_even: 2389 case float_round_ties_away: 2390 roundIncrement = 0x200; 2391 break; 2392 case float_round_to_zero: 2393 roundIncrement = 0; 2394 break; 2395 case float_round_up: 2396 roundIncrement = zSign ? 0 : 0x3ff; 2397 break; 2398 case float_round_down: 2399 roundIncrement = zSign ? 0x3ff : 0; 2400 break; 2401 case float_round_to_odd: 2402 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2403 break; 2404 default: 2405 abort(); 2406 } 2407 roundBits = zSig & 0x3FF; 2408 if ( 0x7FD <= (uint16_t) zExp ) { 2409 if ( ( 0x7FD < zExp ) 2410 || ( ( zExp == 0x7FD ) 2411 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) ) 2412 ) { 2413 bool overflow_to_inf = roundingMode != float_round_to_odd && 2414 roundIncrement != 0; 2415 float_raise(float_flag_overflow | float_flag_inexact, status); 2416 return packFloat64(zSign, 0x7FF, -(!overflow_to_inf)); 2417 } 2418 if ( zExp < 0 ) { 2419 if (status->flush_to_zero) { 2420 float_raise(float_flag_output_denormal, status); 2421 return packFloat64(zSign, 0, 0); 2422 } 2423 isTiny = 2424 (status->float_detect_tininess 2425 == float_tininess_before_rounding) 2426 || ( zExp < -1 ) 2427 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); 2428 shift64RightJamming( zSig, - zExp, &zSig ); 2429 zExp = 0; 2430 roundBits = zSig & 0x3FF; 2431 if (isTiny && roundBits) { 2432 float_raise(float_flag_underflow, status); 2433 } 2434 if (roundingMode == float_round_to_odd) { 2435 /* 2436 * For round-to-odd case, the roundIncrement depends on 2437 * zSig which just changed. 2438 */ 2439 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff; 2440 } 2441 } 2442 } 2443 if (roundBits) { 2444 status->float_exception_flags |= float_flag_inexact; 2445 } 2446 zSig = ( zSig + roundIncrement )>>10; 2447 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); 2448 if ( zSig == 0 ) zExp = 0; 2449 return packFloat64( zSign, zExp, zSig ); 2450 2451 } 2452 2453 /*---------------------------------------------------------------------------- 2454 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2455 | and significand `zSig', and returns the proper double-precision floating- 2456 | point value corresponding to the abstract input. This routine is just like 2457 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. 2458 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' 2459 | floating-point exponent. 2460 *----------------------------------------------------------------------------*/ 2461 2462 static float64 2463 normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig, 2464 float_status *status) 2465 { 2466 int8_t shiftCount; 2467 2468 shiftCount = countLeadingZeros64( zSig ) - 1; 2469 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount, 2470 status); 2471 2472 } 2473 2474 /*---------------------------------------------------------------------------- 2475 | Normalizes the subnormal extended double-precision floating-point value 2476 | represented by the denormalized significand `aSig'. The normalized exponent 2477 | and significand are stored at the locations pointed to by `zExpPtr' and 2478 | `zSigPtr', respectively. 2479 *----------------------------------------------------------------------------*/ 2480 2481 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, 2482 uint64_t *zSigPtr) 2483 { 2484 int8_t shiftCount; 2485 2486 shiftCount = countLeadingZeros64( aSig ); 2487 *zSigPtr = aSig<<shiftCount; 2488 *zExpPtr = 1 - shiftCount; 2489 } 2490 2491 /*---------------------------------------------------------------------------- 2492 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2493 | and extended significand formed by the concatenation of `zSig0' and `zSig1', 2494 | and returns the proper extended double-precision floating-point value 2495 | corresponding to the abstract input. Ordinarily, the abstract value is 2496 | rounded and packed into the extended double-precision format, with the 2497 | inexact exception raised if the abstract input cannot be represented 2498 | exactly. However, if the abstract value is too large, the overflow and 2499 | inexact exceptions are raised and an infinity or maximal finite value is 2500 | returned. If the abstract value is too small, the input value is rounded to 2501 | a subnormal number, and the underflow and inexact exceptions are raised if 2502 | the abstract input cannot be represented exactly as a subnormal extended 2503 | double-precision floating-point number. 2504 | If `roundingPrecision' is 32 or 64, the result is rounded to the same 2505 | number of bits as single or double precision, respectively. Otherwise, the 2506 | result is rounded to the full precision of the extended double-precision 2507 | format. 2508 | The input significand must be normalized or smaller. If the input 2509 | significand is not normalized, `zExp' must be 0; in that case, the result 2510 | returned is a subnormal number, and it must not require rounding. The 2511 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary 2512 | Floating-Point Arithmetic. 2513 *----------------------------------------------------------------------------*/ 2514 2515 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign, 2516 int32_t zExp, uint64_t zSig0, uint64_t zSig1, 2517 float_status *status) 2518 { 2519 int8_t roundingMode; 2520 flag roundNearestEven, increment, isTiny; 2521 int64_t roundIncrement, roundMask, roundBits; 2522 2523 roundingMode = status->float_rounding_mode; 2524 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2525 if ( roundingPrecision == 80 ) goto precision80; 2526 if ( roundingPrecision == 64 ) { 2527 roundIncrement = LIT64( 0x0000000000000400 ); 2528 roundMask = LIT64( 0x00000000000007FF ); 2529 } 2530 else if ( roundingPrecision == 32 ) { 2531 roundIncrement = LIT64( 0x0000008000000000 ); 2532 roundMask = LIT64( 0x000000FFFFFFFFFF ); 2533 } 2534 else { 2535 goto precision80; 2536 } 2537 zSig0 |= ( zSig1 != 0 ); 2538 switch (roundingMode) { 2539 case float_round_nearest_even: 2540 case float_round_ties_away: 2541 break; 2542 case float_round_to_zero: 2543 roundIncrement = 0; 2544 break; 2545 case float_round_up: 2546 roundIncrement = zSign ? 0 : roundMask; 2547 break; 2548 case float_round_down: 2549 roundIncrement = zSign ? roundMask : 0; 2550 break; 2551 default: 2552 abort(); 2553 } 2554 roundBits = zSig0 & roundMask; 2555 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2556 if ( ( 0x7FFE < zExp ) 2557 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) 2558 ) { 2559 goto overflow; 2560 } 2561 if ( zExp <= 0 ) { 2562 if (status->flush_to_zero) { 2563 float_raise(float_flag_output_denormal, status); 2564 return packFloatx80(zSign, 0, 0); 2565 } 2566 isTiny = 2567 (status->float_detect_tininess 2568 == float_tininess_before_rounding) 2569 || ( zExp < 0 ) 2570 || ( zSig0 <= zSig0 + roundIncrement ); 2571 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); 2572 zExp = 0; 2573 roundBits = zSig0 & roundMask; 2574 if (isTiny && roundBits) { 2575 float_raise(float_flag_underflow, status); 2576 } 2577 if (roundBits) { 2578 status->float_exception_flags |= float_flag_inexact; 2579 } 2580 zSig0 += roundIncrement; 2581 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2582 roundIncrement = roundMask + 1; 2583 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2584 roundMask |= roundIncrement; 2585 } 2586 zSig0 &= ~ roundMask; 2587 return packFloatx80( zSign, zExp, zSig0 ); 2588 } 2589 } 2590 if (roundBits) { 2591 status->float_exception_flags |= float_flag_inexact; 2592 } 2593 zSig0 += roundIncrement; 2594 if ( zSig0 < roundIncrement ) { 2595 ++zExp; 2596 zSig0 = LIT64( 0x8000000000000000 ); 2597 } 2598 roundIncrement = roundMask + 1; 2599 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { 2600 roundMask |= roundIncrement; 2601 } 2602 zSig0 &= ~ roundMask; 2603 if ( zSig0 == 0 ) zExp = 0; 2604 return packFloatx80( zSign, zExp, zSig0 ); 2605 precision80: 2606 switch (roundingMode) { 2607 case float_round_nearest_even: 2608 case float_round_ties_away: 2609 increment = ((int64_t)zSig1 < 0); 2610 break; 2611 case float_round_to_zero: 2612 increment = 0; 2613 break; 2614 case float_round_up: 2615 increment = !zSign && zSig1; 2616 break; 2617 case float_round_down: 2618 increment = zSign && zSig1; 2619 break; 2620 default: 2621 abort(); 2622 } 2623 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) { 2624 if ( ( 0x7FFE < zExp ) 2625 || ( ( zExp == 0x7FFE ) 2626 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) 2627 && increment 2628 ) 2629 ) { 2630 roundMask = 0; 2631 overflow: 2632 float_raise(float_flag_overflow | float_flag_inexact, status); 2633 if ( ( roundingMode == float_round_to_zero ) 2634 || ( zSign && ( roundingMode == float_round_up ) ) 2635 || ( ! zSign && ( roundingMode == float_round_down ) ) 2636 ) { 2637 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); 2638 } 2639 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 2640 } 2641 if ( zExp <= 0 ) { 2642 isTiny = 2643 (status->float_detect_tininess 2644 == float_tininess_before_rounding) 2645 || ( zExp < 0 ) 2646 || ! increment 2647 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); 2648 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); 2649 zExp = 0; 2650 if (isTiny && zSig1) { 2651 float_raise(float_flag_underflow, status); 2652 } 2653 if (zSig1) { 2654 status->float_exception_flags |= float_flag_inexact; 2655 } 2656 switch (roundingMode) { 2657 case float_round_nearest_even: 2658 case float_round_ties_away: 2659 increment = ((int64_t)zSig1 < 0); 2660 break; 2661 case float_round_to_zero: 2662 increment = 0; 2663 break; 2664 case float_round_up: 2665 increment = !zSign && zSig1; 2666 break; 2667 case float_round_down: 2668 increment = zSign && zSig1; 2669 break; 2670 default: 2671 abort(); 2672 } 2673 if ( increment ) { 2674 ++zSig0; 2675 zSig0 &= 2676 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2677 if ( (int64_t) zSig0 < 0 ) zExp = 1; 2678 } 2679 return packFloatx80( zSign, zExp, zSig0 ); 2680 } 2681 } 2682 if (zSig1) { 2683 status->float_exception_flags |= float_flag_inexact; 2684 } 2685 if ( increment ) { 2686 ++zSig0; 2687 if ( zSig0 == 0 ) { 2688 ++zExp; 2689 zSig0 = LIT64( 0x8000000000000000 ); 2690 } 2691 else { 2692 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven ); 2693 } 2694 } 2695 else { 2696 if ( zSig0 == 0 ) zExp = 0; 2697 } 2698 return packFloatx80( zSign, zExp, zSig0 ); 2699 2700 } 2701 2702 /*---------------------------------------------------------------------------- 2703 | Takes an abstract floating-point value having sign `zSign', exponent 2704 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', 2705 | and returns the proper extended double-precision floating-point value 2706 | corresponding to the abstract input. This routine is just like 2707 | `roundAndPackFloatx80' except that the input significand does not have to be 2708 | normalized. 2709 *----------------------------------------------------------------------------*/ 2710 2711 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision, 2712 flag zSign, int32_t zExp, 2713 uint64_t zSig0, uint64_t zSig1, 2714 float_status *status) 2715 { 2716 int8_t shiftCount; 2717 2718 if ( zSig0 == 0 ) { 2719 zSig0 = zSig1; 2720 zSig1 = 0; 2721 zExp -= 64; 2722 } 2723 shiftCount = countLeadingZeros64( zSig0 ); 2724 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 2725 zExp -= shiftCount; 2726 return roundAndPackFloatx80(roundingPrecision, zSign, zExp, 2727 zSig0, zSig1, status); 2728 2729 } 2730 2731 /*---------------------------------------------------------------------------- 2732 | Returns the least-significant 64 fraction bits of the quadruple-precision 2733 | floating-point value `a'. 2734 *----------------------------------------------------------------------------*/ 2735 2736 static inline uint64_t extractFloat128Frac1( float128 a ) 2737 { 2738 2739 return a.low; 2740 2741 } 2742 2743 /*---------------------------------------------------------------------------- 2744 | Returns the most-significant 48 fraction bits of the quadruple-precision 2745 | floating-point value `a'. 2746 *----------------------------------------------------------------------------*/ 2747 2748 static inline uint64_t extractFloat128Frac0( float128 a ) 2749 { 2750 2751 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); 2752 2753 } 2754 2755 /*---------------------------------------------------------------------------- 2756 | Returns the exponent bits of the quadruple-precision floating-point value 2757 | `a'. 2758 *----------------------------------------------------------------------------*/ 2759 2760 static inline int32_t extractFloat128Exp( float128 a ) 2761 { 2762 2763 return ( a.high>>48 ) & 0x7FFF; 2764 2765 } 2766 2767 /*---------------------------------------------------------------------------- 2768 | Returns the sign bit of the quadruple-precision floating-point value `a'. 2769 *----------------------------------------------------------------------------*/ 2770 2771 static inline flag extractFloat128Sign( float128 a ) 2772 { 2773 2774 return a.high>>63; 2775 2776 } 2777 2778 /*---------------------------------------------------------------------------- 2779 | Normalizes the subnormal quadruple-precision floating-point value 2780 | represented by the denormalized significand formed by the concatenation of 2781 | `aSig0' and `aSig1'. The normalized exponent is stored at the location 2782 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized 2783 | significand are stored at the location pointed to by `zSig0Ptr', and the 2784 | least significant 64 bits of the normalized significand are stored at the 2785 | location pointed to by `zSig1Ptr'. 2786 *----------------------------------------------------------------------------*/ 2787 2788 static void 2789 normalizeFloat128Subnormal( 2790 uint64_t aSig0, 2791 uint64_t aSig1, 2792 int32_t *zExpPtr, 2793 uint64_t *zSig0Ptr, 2794 uint64_t *zSig1Ptr 2795 ) 2796 { 2797 int8_t shiftCount; 2798 2799 if ( aSig0 == 0 ) { 2800 shiftCount = countLeadingZeros64( aSig1 ) - 15; 2801 if ( shiftCount < 0 ) { 2802 *zSig0Ptr = aSig1>>( - shiftCount ); 2803 *zSig1Ptr = aSig1<<( shiftCount & 63 ); 2804 } 2805 else { 2806 *zSig0Ptr = aSig1<<shiftCount; 2807 *zSig1Ptr = 0; 2808 } 2809 *zExpPtr = - shiftCount - 63; 2810 } 2811 else { 2812 shiftCount = countLeadingZeros64( aSig0 ) - 15; 2813 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); 2814 *zExpPtr = 1 - shiftCount; 2815 } 2816 2817 } 2818 2819 /*---------------------------------------------------------------------------- 2820 | Packs the sign `zSign', the exponent `zExp', and the significand formed 2821 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision 2822 | floating-point value, returning the result. After being shifted into the 2823 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply 2824 | added together to form the most significant 32 bits of the result. This 2825 | means that any integer portion of `zSig0' will be added into the exponent. 2826 | Since a properly normalized significand will have an integer portion equal 2827 | to 1, the `zExp' input should be 1 less than the desired result exponent 2828 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized 2829 | significand. 2830 *----------------------------------------------------------------------------*/ 2831 2832 static inline float128 2833 packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 ) 2834 { 2835 float128 z; 2836 2837 z.low = zSig1; 2838 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0; 2839 return z; 2840 2841 } 2842 2843 /*---------------------------------------------------------------------------- 2844 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2845 | and extended significand formed by the concatenation of `zSig0', `zSig1', 2846 | and `zSig2', and returns the proper quadruple-precision floating-point value 2847 | corresponding to the abstract input. Ordinarily, the abstract value is 2848 | simply rounded and packed into the quadruple-precision format, with the 2849 | inexact exception raised if the abstract input cannot be represented 2850 | exactly. However, if the abstract value is too large, the overflow and 2851 | inexact exceptions are raised and an infinity or maximal finite value is 2852 | returned. If the abstract value is too small, the input value is rounded to 2853 | a subnormal number, and the underflow and inexact exceptions are raised if 2854 | the abstract input cannot be represented exactly as a subnormal quadruple- 2855 | precision floating-point number. 2856 | The input significand must be normalized or smaller. If the input 2857 | significand is not normalized, `zExp' must be 0; in that case, the result 2858 | returned is a subnormal number, and it must not require rounding. In the 2859 | usual case that the input significand is normalized, `zExp' must be 1 less 2860 | than the ``true'' floating-point exponent. The handling of underflow and 2861 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 2862 *----------------------------------------------------------------------------*/ 2863 2864 static float128 roundAndPackFloat128(flag zSign, int32_t zExp, 2865 uint64_t zSig0, uint64_t zSig1, 2866 uint64_t zSig2, float_status *status) 2867 { 2868 int8_t roundingMode; 2869 flag roundNearestEven, increment, isTiny; 2870 2871 roundingMode = status->float_rounding_mode; 2872 roundNearestEven = ( roundingMode == float_round_nearest_even ); 2873 switch (roundingMode) { 2874 case float_round_nearest_even: 2875 case float_round_ties_away: 2876 increment = ((int64_t)zSig2 < 0); 2877 break; 2878 case float_round_to_zero: 2879 increment = 0; 2880 break; 2881 case float_round_up: 2882 increment = !zSign && zSig2; 2883 break; 2884 case float_round_down: 2885 increment = zSign && zSig2; 2886 break; 2887 case float_round_to_odd: 2888 increment = !(zSig1 & 0x1) && zSig2; 2889 break; 2890 default: 2891 abort(); 2892 } 2893 if ( 0x7FFD <= (uint32_t) zExp ) { 2894 if ( ( 0x7FFD < zExp ) 2895 || ( ( zExp == 0x7FFD ) 2896 && eq128( 2897 LIT64( 0x0001FFFFFFFFFFFF ), 2898 LIT64( 0xFFFFFFFFFFFFFFFF ), 2899 zSig0, 2900 zSig1 2901 ) 2902 && increment 2903 ) 2904 ) { 2905 float_raise(float_flag_overflow | float_flag_inexact, status); 2906 if ( ( roundingMode == float_round_to_zero ) 2907 || ( zSign && ( roundingMode == float_round_up ) ) 2908 || ( ! zSign && ( roundingMode == float_round_down ) ) 2909 || (roundingMode == float_round_to_odd) 2910 ) { 2911 return 2912 packFloat128( 2913 zSign, 2914 0x7FFE, 2915 LIT64( 0x0000FFFFFFFFFFFF ), 2916 LIT64( 0xFFFFFFFFFFFFFFFF ) 2917 ); 2918 } 2919 return packFloat128( zSign, 0x7FFF, 0, 0 ); 2920 } 2921 if ( zExp < 0 ) { 2922 if (status->flush_to_zero) { 2923 float_raise(float_flag_output_denormal, status); 2924 return packFloat128(zSign, 0, 0, 0); 2925 } 2926 isTiny = 2927 (status->float_detect_tininess 2928 == float_tininess_before_rounding) 2929 || ( zExp < -1 ) 2930 || ! increment 2931 || lt128( 2932 zSig0, 2933 zSig1, 2934 LIT64( 0x0001FFFFFFFFFFFF ), 2935 LIT64( 0xFFFFFFFFFFFFFFFF ) 2936 ); 2937 shift128ExtraRightJamming( 2938 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); 2939 zExp = 0; 2940 if (isTiny && zSig2) { 2941 float_raise(float_flag_underflow, status); 2942 } 2943 switch (roundingMode) { 2944 case float_round_nearest_even: 2945 case float_round_ties_away: 2946 increment = ((int64_t)zSig2 < 0); 2947 break; 2948 case float_round_to_zero: 2949 increment = 0; 2950 break; 2951 case float_round_up: 2952 increment = !zSign && zSig2; 2953 break; 2954 case float_round_down: 2955 increment = zSign && zSig2; 2956 break; 2957 case float_round_to_odd: 2958 increment = !(zSig1 & 0x1) && zSig2; 2959 break; 2960 default: 2961 abort(); 2962 } 2963 } 2964 } 2965 if (zSig2) { 2966 status->float_exception_flags |= float_flag_inexact; 2967 } 2968 if ( increment ) { 2969 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); 2970 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); 2971 } 2972 else { 2973 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; 2974 } 2975 return packFloat128( zSign, zExp, zSig0, zSig1 ); 2976 2977 } 2978 2979 /*---------------------------------------------------------------------------- 2980 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 2981 | and significand formed by the concatenation of `zSig0' and `zSig1', and 2982 | returns the proper quadruple-precision floating-point value corresponding 2983 | to the abstract input. This routine is just like `roundAndPackFloat128' 2984 | except that the input significand has fewer bits and does not have to be 2985 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- 2986 | point exponent. 2987 *----------------------------------------------------------------------------*/ 2988 2989 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp, 2990 uint64_t zSig0, uint64_t zSig1, 2991 float_status *status) 2992 { 2993 int8_t shiftCount; 2994 uint64_t zSig2; 2995 2996 if ( zSig0 == 0 ) { 2997 zSig0 = zSig1; 2998 zSig1 = 0; 2999 zExp -= 64; 3000 } 3001 shiftCount = countLeadingZeros64( zSig0 ) - 15; 3002 if ( 0 <= shiftCount ) { 3003 zSig2 = 0; 3004 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3005 } 3006 else { 3007 shift128ExtraRightJamming( 3008 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); 3009 } 3010 zExp -= shiftCount; 3011 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 3012 3013 } 3014 3015 3016 /*---------------------------------------------------------------------------- 3017 | Returns the result of converting the 32-bit two's complement integer `a' 3018 | to the extended double-precision floating-point format. The conversion 3019 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3020 | Arithmetic. 3021 *----------------------------------------------------------------------------*/ 3022 3023 floatx80 int32_to_floatx80(int32_t a, float_status *status) 3024 { 3025 flag zSign; 3026 uint32_t absA; 3027 int8_t shiftCount; 3028 uint64_t zSig; 3029 3030 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3031 zSign = ( a < 0 ); 3032 absA = zSign ? - a : a; 3033 shiftCount = countLeadingZeros32( absA ) + 32; 3034 zSig = absA; 3035 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); 3036 3037 } 3038 3039 /*---------------------------------------------------------------------------- 3040 | Returns the result of converting the 32-bit two's complement integer `a' to 3041 | the quadruple-precision floating-point format. The conversion is performed 3042 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3043 *----------------------------------------------------------------------------*/ 3044 3045 float128 int32_to_float128(int32_t a, float_status *status) 3046 { 3047 flag zSign; 3048 uint32_t absA; 3049 int8_t shiftCount; 3050 uint64_t zSig0; 3051 3052 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3053 zSign = ( a < 0 ); 3054 absA = zSign ? - a : a; 3055 shiftCount = countLeadingZeros32( absA ) + 17; 3056 zSig0 = absA; 3057 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); 3058 3059 } 3060 3061 /*---------------------------------------------------------------------------- 3062 | Returns the result of converting the 64-bit two's complement integer `a' 3063 | to the extended double-precision floating-point format. The conversion 3064 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3065 | Arithmetic. 3066 *----------------------------------------------------------------------------*/ 3067 3068 floatx80 int64_to_floatx80(int64_t a, float_status *status) 3069 { 3070 flag zSign; 3071 uint64_t absA; 3072 int8_t shiftCount; 3073 3074 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); 3075 zSign = ( a < 0 ); 3076 absA = zSign ? - a : a; 3077 shiftCount = countLeadingZeros64( absA ); 3078 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); 3079 3080 } 3081 3082 /*---------------------------------------------------------------------------- 3083 | Returns the result of converting the 64-bit two's complement integer `a' to 3084 | the quadruple-precision floating-point format. The conversion is performed 3085 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3086 *----------------------------------------------------------------------------*/ 3087 3088 float128 int64_to_float128(int64_t a, float_status *status) 3089 { 3090 flag zSign; 3091 uint64_t absA; 3092 int8_t shiftCount; 3093 int32_t zExp; 3094 uint64_t zSig0, zSig1; 3095 3096 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); 3097 zSign = ( a < 0 ); 3098 absA = zSign ? - a : a; 3099 shiftCount = countLeadingZeros64( absA ) + 49; 3100 zExp = 0x406E - shiftCount; 3101 if ( 64 <= shiftCount ) { 3102 zSig1 = 0; 3103 zSig0 = absA; 3104 shiftCount -= 64; 3105 } 3106 else { 3107 zSig1 = absA; 3108 zSig0 = 0; 3109 } 3110 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); 3111 return packFloat128( zSign, zExp, zSig0, zSig1 ); 3112 3113 } 3114 3115 /*---------------------------------------------------------------------------- 3116 | Returns the result of converting the 64-bit unsigned integer `a' 3117 | to the quadruple-precision floating-point format. The conversion is performed 3118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3119 *----------------------------------------------------------------------------*/ 3120 3121 float128 uint64_to_float128(uint64_t a, float_status *status) 3122 { 3123 if (a == 0) { 3124 return float128_zero; 3125 } 3126 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status); 3127 } 3128 3129 3130 3131 3132 /*---------------------------------------------------------------------------- 3133 | Returns the result of converting the single-precision floating-point value 3134 | `a' to the double-precision floating-point format. The conversion is 3135 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3136 | Arithmetic. 3137 *----------------------------------------------------------------------------*/ 3138 3139 float64 float32_to_float64(float32 a, float_status *status) 3140 { 3141 flag aSign; 3142 int aExp; 3143 uint32_t aSig; 3144 a = float32_squash_input_denormal(a, status); 3145 3146 aSig = extractFloat32Frac( a ); 3147 aExp = extractFloat32Exp( a ); 3148 aSign = extractFloat32Sign( a ); 3149 if ( aExp == 0xFF ) { 3150 if (aSig) { 3151 return commonNaNToFloat64(float32ToCommonNaN(a, status), status); 3152 } 3153 return packFloat64( aSign, 0x7FF, 0 ); 3154 } 3155 if ( aExp == 0 ) { 3156 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); 3157 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3158 --aExp; 3159 } 3160 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 ); 3161 3162 } 3163 3164 /*---------------------------------------------------------------------------- 3165 | Returns the result of converting the single-precision floating-point value 3166 | `a' to the extended double-precision floating-point format. The conversion 3167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 3168 | Arithmetic. 3169 *----------------------------------------------------------------------------*/ 3170 3171 floatx80 float32_to_floatx80(float32 a, float_status *status) 3172 { 3173 flag aSign; 3174 int aExp; 3175 uint32_t aSig; 3176 3177 a = float32_squash_input_denormal(a, status); 3178 aSig = extractFloat32Frac( a ); 3179 aExp = extractFloat32Exp( a ); 3180 aSign = extractFloat32Sign( a ); 3181 if ( aExp == 0xFF ) { 3182 if (aSig) { 3183 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status); 3184 } 3185 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 3186 } 3187 if ( aExp == 0 ) { 3188 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 3189 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3190 } 3191 aSig |= 0x00800000; 3192 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 ); 3193 3194 } 3195 3196 /*---------------------------------------------------------------------------- 3197 | Returns the result of converting the single-precision floating-point value 3198 | `a' to the double-precision floating-point format. The conversion is 3199 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3200 | Arithmetic. 3201 *----------------------------------------------------------------------------*/ 3202 3203 float128 float32_to_float128(float32 a, float_status *status) 3204 { 3205 flag aSign; 3206 int aExp; 3207 uint32_t aSig; 3208 3209 a = float32_squash_input_denormal(a, status); 3210 aSig = extractFloat32Frac( a ); 3211 aExp = extractFloat32Exp( a ); 3212 aSign = extractFloat32Sign( a ); 3213 if ( aExp == 0xFF ) { 3214 if (aSig) { 3215 return commonNaNToFloat128(float32ToCommonNaN(a, status), status); 3216 } 3217 return packFloat128( aSign, 0x7FFF, 0, 0 ); 3218 } 3219 if ( aExp == 0 ) { 3220 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 3221 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3222 --aExp; 3223 } 3224 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 ); 3225 3226 } 3227 3228 /*---------------------------------------------------------------------------- 3229 | Returns the remainder of the single-precision floating-point value `a' 3230 | with respect to the corresponding value `b'. The operation is performed 3231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3232 *----------------------------------------------------------------------------*/ 3233 3234 float32 float32_rem(float32 a, float32 b, float_status *status) 3235 { 3236 flag aSign, zSign; 3237 int aExp, bExp, expDiff; 3238 uint32_t aSig, bSig; 3239 uint32_t q; 3240 uint64_t aSig64, bSig64, q64; 3241 uint32_t alternateASig; 3242 int32_t sigMean; 3243 a = float32_squash_input_denormal(a, status); 3244 b = float32_squash_input_denormal(b, status); 3245 3246 aSig = extractFloat32Frac( a ); 3247 aExp = extractFloat32Exp( a ); 3248 aSign = extractFloat32Sign( a ); 3249 bSig = extractFloat32Frac( b ); 3250 bExp = extractFloat32Exp( b ); 3251 if ( aExp == 0xFF ) { 3252 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { 3253 return propagateFloat32NaN(a, b, status); 3254 } 3255 float_raise(float_flag_invalid, status); 3256 return float32_default_nan(status); 3257 } 3258 if ( bExp == 0xFF ) { 3259 if (bSig) { 3260 return propagateFloat32NaN(a, b, status); 3261 } 3262 return a; 3263 } 3264 if ( bExp == 0 ) { 3265 if ( bSig == 0 ) { 3266 float_raise(float_flag_invalid, status); 3267 return float32_default_nan(status); 3268 } 3269 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); 3270 } 3271 if ( aExp == 0 ) { 3272 if ( aSig == 0 ) return a; 3273 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3274 } 3275 expDiff = aExp - bExp; 3276 aSig |= 0x00800000; 3277 bSig |= 0x00800000; 3278 if ( expDiff < 32 ) { 3279 aSig <<= 8; 3280 bSig <<= 8; 3281 if ( expDiff < 0 ) { 3282 if ( expDiff < -1 ) return a; 3283 aSig >>= 1; 3284 } 3285 q = ( bSig <= aSig ); 3286 if ( q ) aSig -= bSig; 3287 if ( 0 < expDiff ) { 3288 q = ( ( (uint64_t) aSig )<<32 ) / bSig; 3289 q >>= 32 - expDiff; 3290 bSig >>= 2; 3291 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 3292 } 3293 else { 3294 aSig >>= 2; 3295 bSig >>= 2; 3296 } 3297 } 3298 else { 3299 if ( bSig <= aSig ) aSig -= bSig; 3300 aSig64 = ( (uint64_t) aSig )<<40; 3301 bSig64 = ( (uint64_t) bSig )<<40; 3302 expDiff -= 64; 3303 while ( 0 < expDiff ) { 3304 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3305 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3306 aSig64 = - ( ( bSig * q64 )<<38 ); 3307 expDiff -= 62; 3308 } 3309 expDiff += 64; 3310 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); 3311 q64 = ( 2 < q64 ) ? q64 - 2 : 0; 3312 q = q64>>( 64 - expDiff ); 3313 bSig <<= 6; 3314 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; 3315 } 3316 do { 3317 alternateASig = aSig; 3318 ++q; 3319 aSig -= bSig; 3320 } while ( 0 <= (int32_t) aSig ); 3321 sigMean = aSig + alternateASig; 3322 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 3323 aSig = alternateASig; 3324 } 3325 zSign = ( (int32_t) aSig < 0 ); 3326 if ( zSign ) aSig = - aSig; 3327 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status); 3328 } 3329 3330 3331 3332 /*---------------------------------------------------------------------------- 3333 | Returns the binary exponential of the single-precision floating-point value 3334 | `a'. The operation is performed according to the IEC/IEEE Standard for 3335 | Binary Floating-Point Arithmetic. 3336 | 3337 | Uses the following identities: 3338 | 3339 | 1. ------------------------------------------------------------------------- 3340 | x x*ln(2) 3341 | 2 = e 3342 | 3343 | 2. ------------------------------------------------------------------------- 3344 | 2 3 4 5 n 3345 | x x x x x x x 3346 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ... 3347 | 1! 2! 3! 4! 5! n! 3348 *----------------------------------------------------------------------------*/ 3349 3350 static const float64 float32_exp2_coefficients[15] = 3351 { 3352 const_float64( 0x3ff0000000000000ll ), /* 1 */ 3353 const_float64( 0x3fe0000000000000ll ), /* 2 */ 3354 const_float64( 0x3fc5555555555555ll ), /* 3 */ 3355 const_float64( 0x3fa5555555555555ll ), /* 4 */ 3356 const_float64( 0x3f81111111111111ll ), /* 5 */ 3357 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */ 3358 const_float64( 0x3f2a01a01a01a01all ), /* 7 */ 3359 const_float64( 0x3efa01a01a01a01all ), /* 8 */ 3360 const_float64( 0x3ec71de3a556c734ll ), /* 9 */ 3361 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */ 3362 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */ 3363 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */ 3364 const_float64( 0x3de6124613a86d09ll ), /* 13 */ 3365 const_float64( 0x3da93974a8c07c9dll ), /* 14 */ 3366 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */ 3367 }; 3368 3369 float32 float32_exp2(float32 a, float_status *status) 3370 { 3371 flag aSign; 3372 int aExp; 3373 uint32_t aSig; 3374 float64 r, x, xn; 3375 int i; 3376 a = float32_squash_input_denormal(a, status); 3377 3378 aSig = extractFloat32Frac( a ); 3379 aExp = extractFloat32Exp( a ); 3380 aSign = extractFloat32Sign( a ); 3381 3382 if ( aExp == 0xFF) { 3383 if (aSig) { 3384 return propagateFloat32NaN(a, float32_zero, status); 3385 } 3386 return (aSign) ? float32_zero : a; 3387 } 3388 if (aExp == 0) { 3389 if (aSig == 0) return float32_one; 3390 } 3391 3392 float_raise(float_flag_inexact, status); 3393 3394 /* ******************************* */ 3395 /* using float64 for approximation */ 3396 /* ******************************* */ 3397 x = float32_to_float64(a, status); 3398 x = float64_mul(x, float64_ln2, status); 3399 3400 xn = x; 3401 r = float64_one; 3402 for (i = 0 ; i < 15 ; i++) { 3403 float64 f; 3404 3405 f = float64_mul(xn, float32_exp2_coefficients[i], status); 3406 r = float64_add(r, f, status); 3407 3408 xn = float64_mul(xn, x, status); 3409 } 3410 3411 return float64_to_float32(r, status); 3412 } 3413 3414 /*---------------------------------------------------------------------------- 3415 | Returns the binary log of the single-precision floating-point value `a'. 3416 | The operation is performed according to the IEC/IEEE Standard for Binary 3417 | Floating-Point Arithmetic. 3418 *----------------------------------------------------------------------------*/ 3419 float32 float32_log2(float32 a, float_status *status) 3420 { 3421 flag aSign, zSign; 3422 int aExp; 3423 uint32_t aSig, zSig, i; 3424 3425 a = float32_squash_input_denormal(a, status); 3426 aSig = extractFloat32Frac( a ); 3427 aExp = extractFloat32Exp( a ); 3428 aSign = extractFloat32Sign( a ); 3429 3430 if ( aExp == 0 ) { 3431 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 ); 3432 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); 3433 } 3434 if ( aSign ) { 3435 float_raise(float_flag_invalid, status); 3436 return float32_default_nan(status); 3437 } 3438 if ( aExp == 0xFF ) { 3439 if (aSig) { 3440 return propagateFloat32NaN(a, float32_zero, status); 3441 } 3442 return a; 3443 } 3444 3445 aExp -= 0x7F; 3446 aSig |= 0x00800000; 3447 zSign = aExp < 0; 3448 zSig = aExp << 23; 3449 3450 for (i = 1 << 22; i > 0; i >>= 1) { 3451 aSig = ( (uint64_t)aSig * aSig ) >> 23; 3452 if ( aSig & 0x01000000 ) { 3453 aSig >>= 1; 3454 zSig |= i; 3455 } 3456 } 3457 3458 if ( zSign ) 3459 zSig = -zSig; 3460 3461 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status); 3462 } 3463 3464 /*---------------------------------------------------------------------------- 3465 | Returns 1 if the single-precision floating-point value `a' is equal to 3466 | the corresponding value `b', and 0 otherwise. The invalid exception is 3467 | raised if either operand is a NaN. Otherwise, the comparison is performed 3468 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3469 *----------------------------------------------------------------------------*/ 3470 3471 int float32_eq(float32 a, float32 b, float_status *status) 3472 { 3473 uint32_t av, bv; 3474 a = float32_squash_input_denormal(a, status); 3475 b = float32_squash_input_denormal(b, status); 3476 3477 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3478 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3479 ) { 3480 float_raise(float_flag_invalid, status); 3481 return 0; 3482 } 3483 av = float32_val(a); 3484 bv = float32_val(b); 3485 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3486 } 3487 3488 /*---------------------------------------------------------------------------- 3489 | Returns 1 if the single-precision floating-point value `a' is less than 3490 | or equal to the corresponding value `b', and 0 otherwise. The invalid 3491 | exception is raised if either operand is a NaN. The comparison is performed 3492 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3493 *----------------------------------------------------------------------------*/ 3494 3495 int float32_le(float32 a, float32 b, float_status *status) 3496 { 3497 flag aSign, bSign; 3498 uint32_t av, bv; 3499 a = float32_squash_input_denormal(a, status); 3500 b = float32_squash_input_denormal(b, status); 3501 3502 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3503 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3504 ) { 3505 float_raise(float_flag_invalid, status); 3506 return 0; 3507 } 3508 aSign = extractFloat32Sign( a ); 3509 bSign = extractFloat32Sign( b ); 3510 av = float32_val(a); 3511 bv = float32_val(b); 3512 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3513 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3514 3515 } 3516 3517 /*---------------------------------------------------------------------------- 3518 | Returns 1 if the single-precision floating-point value `a' is less than 3519 | the corresponding value `b', and 0 otherwise. The invalid exception is 3520 | raised if either operand is a NaN. The comparison is performed according 3521 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3522 *----------------------------------------------------------------------------*/ 3523 3524 int float32_lt(float32 a, float32 b, float_status *status) 3525 { 3526 flag aSign, bSign; 3527 uint32_t av, bv; 3528 a = float32_squash_input_denormal(a, status); 3529 b = float32_squash_input_denormal(b, status); 3530 3531 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3532 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3533 ) { 3534 float_raise(float_flag_invalid, status); 3535 return 0; 3536 } 3537 aSign = extractFloat32Sign( a ); 3538 bSign = extractFloat32Sign( b ); 3539 av = float32_val(a); 3540 bv = float32_val(b); 3541 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3542 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3543 3544 } 3545 3546 /*---------------------------------------------------------------------------- 3547 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3548 | be compared, and 0 otherwise. The invalid exception is raised if either 3549 | operand is a NaN. The comparison is performed according to the IEC/IEEE 3550 | Standard for Binary Floating-Point Arithmetic. 3551 *----------------------------------------------------------------------------*/ 3552 3553 int float32_unordered(float32 a, float32 b, float_status *status) 3554 { 3555 a = float32_squash_input_denormal(a, status); 3556 b = float32_squash_input_denormal(b, status); 3557 3558 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3559 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3560 ) { 3561 float_raise(float_flag_invalid, status); 3562 return 1; 3563 } 3564 return 0; 3565 } 3566 3567 /*---------------------------------------------------------------------------- 3568 | Returns 1 if the single-precision floating-point value `a' is equal to 3569 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3570 | exception. The comparison is performed according to the IEC/IEEE Standard 3571 | for Binary Floating-Point Arithmetic. 3572 *----------------------------------------------------------------------------*/ 3573 3574 int float32_eq_quiet(float32 a, float32 b, float_status *status) 3575 { 3576 a = float32_squash_input_denormal(a, status); 3577 b = float32_squash_input_denormal(b, status); 3578 3579 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3580 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3581 ) { 3582 if (float32_is_signaling_nan(a, status) 3583 || float32_is_signaling_nan(b, status)) { 3584 float_raise(float_flag_invalid, status); 3585 } 3586 return 0; 3587 } 3588 return ( float32_val(a) == float32_val(b) ) || 3589 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); 3590 } 3591 3592 /*---------------------------------------------------------------------------- 3593 | Returns 1 if the single-precision floating-point value `a' is less than or 3594 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 3595 | cause an exception. Otherwise, the comparison is performed according to the 3596 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 3597 *----------------------------------------------------------------------------*/ 3598 3599 int float32_le_quiet(float32 a, float32 b, float_status *status) 3600 { 3601 flag aSign, bSign; 3602 uint32_t av, bv; 3603 a = float32_squash_input_denormal(a, status); 3604 b = float32_squash_input_denormal(b, status); 3605 3606 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3607 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3608 ) { 3609 if (float32_is_signaling_nan(a, status) 3610 || float32_is_signaling_nan(b, status)) { 3611 float_raise(float_flag_invalid, status); 3612 } 3613 return 0; 3614 } 3615 aSign = extractFloat32Sign( a ); 3616 bSign = extractFloat32Sign( b ); 3617 av = float32_val(a); 3618 bv = float32_val(b); 3619 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 ); 3620 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 3621 3622 } 3623 3624 /*---------------------------------------------------------------------------- 3625 | Returns 1 if the single-precision floating-point value `a' is less than 3626 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 3627 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 3628 | Standard for Binary Floating-Point Arithmetic. 3629 *----------------------------------------------------------------------------*/ 3630 3631 int float32_lt_quiet(float32 a, float32 b, float_status *status) 3632 { 3633 flag aSign, bSign; 3634 uint32_t av, bv; 3635 a = float32_squash_input_denormal(a, status); 3636 b = float32_squash_input_denormal(b, status); 3637 3638 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3639 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3640 ) { 3641 if (float32_is_signaling_nan(a, status) 3642 || float32_is_signaling_nan(b, status)) { 3643 float_raise(float_flag_invalid, status); 3644 } 3645 return 0; 3646 } 3647 aSign = extractFloat32Sign( a ); 3648 bSign = extractFloat32Sign( b ); 3649 av = float32_val(a); 3650 bv = float32_val(b); 3651 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 ); 3652 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 3653 3654 } 3655 3656 /*---------------------------------------------------------------------------- 3657 | Returns 1 if the single-precision floating-point values `a' and `b' cannot 3658 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 3659 | comparison is performed according to the IEC/IEEE Standard for Binary 3660 | Floating-Point Arithmetic. 3661 *----------------------------------------------------------------------------*/ 3662 3663 int float32_unordered_quiet(float32 a, float32 b, float_status *status) 3664 { 3665 a = float32_squash_input_denormal(a, status); 3666 b = float32_squash_input_denormal(b, status); 3667 3668 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) 3669 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) 3670 ) { 3671 if (float32_is_signaling_nan(a, status) 3672 || float32_is_signaling_nan(b, status)) { 3673 float_raise(float_flag_invalid, status); 3674 } 3675 return 1; 3676 } 3677 return 0; 3678 } 3679 3680 3681 /*---------------------------------------------------------------------------- 3682 | Returns the result of converting the double-precision floating-point value 3683 | `a' to the single-precision floating-point format. The conversion is 3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point 3685 | Arithmetic. 3686 *----------------------------------------------------------------------------*/ 3687 3688 float32 float64_to_float32(float64 a, float_status *status) 3689 { 3690 flag aSign; 3691 int aExp; 3692 uint64_t aSig; 3693 uint32_t zSig; 3694 a = float64_squash_input_denormal(a, status); 3695 3696 aSig = extractFloat64Frac( a ); 3697 aExp = extractFloat64Exp( a ); 3698 aSign = extractFloat64Sign( a ); 3699 if ( aExp == 0x7FF ) { 3700 if (aSig) { 3701 return commonNaNToFloat32(float64ToCommonNaN(a, status), status); 3702 } 3703 return packFloat32( aSign, 0xFF, 0 ); 3704 } 3705 shift64RightJamming( aSig, 22, &aSig ); 3706 zSig = aSig; 3707 if ( aExp || zSig ) { 3708 zSig |= 0x40000000; 3709 aExp -= 0x381; 3710 } 3711 return roundAndPackFloat32(aSign, aExp, zSig, status); 3712 3713 } 3714 3715 3716 /*---------------------------------------------------------------------------- 3717 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a 3718 | half-precision floating-point value, returning the result. After being 3719 | shifted into the proper positions, the three fields are simply added 3720 | together to form the result. This means that any integer portion of `zSig' 3721 | will be added into the exponent. Since a properly normalized significand 3722 | will have an integer portion equal to 1, the `zExp' input should be 1 less 3723 | than the desired result exponent whenever `zSig' is a complete, normalized 3724 | significand. 3725 *----------------------------------------------------------------------------*/ 3726 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig) 3727 { 3728 return make_float16( 3729 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig); 3730 } 3731 3732 /*---------------------------------------------------------------------------- 3733 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', 3734 | and significand `zSig', and returns the proper half-precision floating- 3735 | point value corresponding to the abstract input. Ordinarily, the abstract 3736 | value is simply rounded and packed into the half-precision format, with 3737 | the inexact exception raised if the abstract input cannot be represented 3738 | exactly. However, if the abstract value is too large, the overflow and 3739 | inexact exceptions are raised and an infinity or maximal finite value is 3740 | returned. If the abstract value is too small, the input value is rounded to 3741 | a subnormal number, and the underflow and inexact exceptions are raised if 3742 | the abstract input cannot be represented exactly as a subnormal half- 3743 | precision floating-point number. 3744 | The `ieee' flag indicates whether to use IEEE standard half precision, or 3745 | ARM-style "alternative representation", which omits the NaN and Inf 3746 | encodings in order to raise the maximum representable exponent by one. 3747 | The input significand `zSig' has its binary point between bits 22 3748 | and 23, which is 13 bits to the left of the usual location. This shifted 3749 | significand must be normalized or smaller. If `zSig' is not normalized, 3750 | `zExp' must be 0; in that case, the result returned is a subnormal number, 3751 | and it must not require rounding. In the usual case that `zSig' is 3752 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. 3753 | Note the slightly odd position of the binary point in zSig compared with the 3754 | other roundAndPackFloat functions. This should probably be fixed if we 3755 | need to implement more float16 routines than just conversion. 3756 | The handling of underflow and overflow follows the IEC/IEEE Standard for 3757 | Binary Floating-Point Arithmetic. 3758 *----------------------------------------------------------------------------*/ 3759 3760 static float16 roundAndPackFloat16(flag zSign, int zExp, 3761 uint32_t zSig, flag ieee, 3762 float_status *status) 3763 { 3764 int maxexp = ieee ? 29 : 30; 3765 uint32_t mask; 3766 uint32_t increment; 3767 bool rounding_bumps_exp; 3768 bool is_tiny = false; 3769 3770 /* Calculate the mask of bits of the mantissa which are not 3771 * representable in half-precision and will be lost. 3772 */ 3773 if (zExp < 1) { 3774 /* Will be denormal in halfprec */ 3775 mask = 0x00ffffff; 3776 if (zExp >= -11) { 3777 mask >>= 11 + zExp; 3778 } 3779 } else { 3780 /* Normal number in halfprec */ 3781 mask = 0x00001fff; 3782 } 3783 3784 switch (status->float_rounding_mode) { 3785 case float_round_nearest_even: 3786 increment = (mask + 1) >> 1; 3787 if ((zSig & mask) == increment) { 3788 increment = zSig & (increment << 1); 3789 } 3790 break; 3791 case float_round_ties_away: 3792 increment = (mask + 1) >> 1; 3793 break; 3794 case float_round_up: 3795 increment = zSign ? 0 : mask; 3796 break; 3797 case float_round_down: 3798 increment = zSign ? mask : 0; 3799 break; 3800 default: /* round_to_zero */ 3801 increment = 0; 3802 break; 3803 } 3804 3805 rounding_bumps_exp = (zSig + increment >= 0x01000000); 3806 3807 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) { 3808 if (ieee) { 3809 float_raise(float_flag_overflow | float_flag_inexact, status); 3810 return packFloat16(zSign, 0x1f, 0); 3811 } else { 3812 float_raise(float_flag_invalid, status); 3813 return packFloat16(zSign, 0x1f, 0x3ff); 3814 } 3815 } 3816 3817 if (zExp < 0) { 3818 /* Note that flush-to-zero does not affect half-precision results */ 3819 is_tiny = 3820 (status->float_detect_tininess == float_tininess_before_rounding) 3821 || (zExp < -1) 3822 || (!rounding_bumps_exp); 3823 } 3824 if (zSig & mask) { 3825 float_raise(float_flag_inexact, status); 3826 if (is_tiny) { 3827 float_raise(float_flag_underflow, status); 3828 } 3829 } 3830 3831 zSig += increment; 3832 if (rounding_bumps_exp) { 3833 zSig >>= 1; 3834 zExp++; 3835 } 3836 3837 if (zExp < -10) { 3838 return packFloat16(zSign, 0, 0); 3839 } 3840 if (zExp < 0) { 3841 zSig >>= -zExp; 3842 zExp = 0; 3843 } 3844 return packFloat16(zSign, zExp, zSig >> 13); 3845 } 3846 3847 /*---------------------------------------------------------------------------- 3848 | If `a' is denormal and we are in flush-to-zero mode then set the 3849 | input-denormal exception and return zero. Otherwise just return the value. 3850 *----------------------------------------------------------------------------*/ 3851 float16 float16_squash_input_denormal(float16 a, float_status *status) 3852 { 3853 if (status->flush_inputs_to_zero) { 3854 if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) { 3855 float_raise(float_flag_input_denormal, status); 3856 return make_float16(float16_val(a) & 0x8000); 3857 } 3858 } 3859 return a; 3860 } 3861 3862 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr, 3863 uint32_t *zSigPtr) 3864 { 3865 int8_t shiftCount = countLeadingZeros32(aSig) - 21; 3866 *zSigPtr = aSig << shiftCount; 3867 *zExpPtr = 1 - shiftCount; 3868 } 3869 3870 /* Half precision floats come in two formats: standard IEEE and "ARM" format. 3871 The latter gains extra exponent range by omitting the NaN/Inf encodings. */ 3872 3873 float32 float16_to_float32(float16 a, flag ieee, float_status *status) 3874 { 3875 flag aSign; 3876 int aExp; 3877 uint32_t aSig; 3878 3879 aSign = extractFloat16Sign(a); 3880 aExp = extractFloat16Exp(a); 3881 aSig = extractFloat16Frac(a); 3882 3883 if (aExp == 0x1f && ieee) { 3884 if (aSig) { 3885 return commonNaNToFloat32(float16ToCommonNaN(a, status), status); 3886 } 3887 return packFloat32(aSign, 0xff, 0); 3888 } 3889 if (aExp == 0) { 3890 if (aSig == 0) { 3891 return packFloat32(aSign, 0, 0); 3892 } 3893 3894 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3895 aExp--; 3896 } 3897 return packFloat32( aSign, aExp + 0x70, aSig << 13); 3898 } 3899 3900 float16 float32_to_float16(float32 a, flag ieee, float_status *status) 3901 { 3902 flag aSign; 3903 int aExp; 3904 uint32_t aSig; 3905 3906 a = float32_squash_input_denormal(a, status); 3907 3908 aSig = extractFloat32Frac( a ); 3909 aExp = extractFloat32Exp( a ); 3910 aSign = extractFloat32Sign( a ); 3911 if ( aExp == 0xFF ) { 3912 if (aSig) { 3913 /* Input is a NaN */ 3914 if (!ieee) { 3915 float_raise(float_flag_invalid, status); 3916 return packFloat16(aSign, 0, 0); 3917 } 3918 return commonNaNToFloat16( 3919 float32ToCommonNaN(a, status), status); 3920 } 3921 /* Infinity */ 3922 if (!ieee) { 3923 float_raise(float_flag_invalid, status); 3924 return packFloat16(aSign, 0x1f, 0x3ff); 3925 } 3926 return packFloat16(aSign, 0x1f, 0); 3927 } 3928 if (aExp == 0 && aSig == 0) { 3929 return packFloat16(aSign, 0, 0); 3930 } 3931 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 3932 * even if the input is denormal; however this is harmless because 3933 * the largest possible single-precision denormal is still smaller 3934 * than the smallest representable half-precision denormal, and so we 3935 * will end up ignoring aSig and returning via the "always return zero" 3936 * codepath. 3937 */ 3938 aSig |= 0x00800000; 3939 aExp -= 0x71; 3940 3941 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status); 3942 } 3943 3944 float64 float16_to_float64(float16 a, flag ieee, float_status *status) 3945 { 3946 flag aSign; 3947 int aExp; 3948 uint32_t aSig; 3949 3950 aSign = extractFloat16Sign(a); 3951 aExp = extractFloat16Exp(a); 3952 aSig = extractFloat16Frac(a); 3953 3954 if (aExp == 0x1f && ieee) { 3955 if (aSig) { 3956 return commonNaNToFloat64( 3957 float16ToCommonNaN(a, status), status); 3958 } 3959 return packFloat64(aSign, 0x7ff, 0); 3960 } 3961 if (aExp == 0) { 3962 if (aSig == 0) { 3963 return packFloat64(aSign, 0, 0); 3964 } 3965 3966 normalizeFloat16Subnormal(aSig, &aExp, &aSig); 3967 aExp--; 3968 } 3969 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42); 3970 } 3971 3972 float16 float64_to_float16(float64 a, flag ieee, float_status *status) 3973 { 3974 flag aSign; 3975 int aExp; 3976 uint64_t aSig; 3977 uint32_t zSig; 3978 3979 a = float64_squash_input_denormal(a, status); 3980 3981 aSig = extractFloat64Frac(a); 3982 aExp = extractFloat64Exp(a); 3983 aSign = extractFloat64Sign(a); 3984 if (aExp == 0x7FF) { 3985 if (aSig) { 3986 /* Input is a NaN */ 3987 if (!ieee) { 3988 float_raise(float_flag_invalid, status); 3989 return packFloat16(aSign, 0, 0); 3990 } 3991 return commonNaNToFloat16( 3992 float64ToCommonNaN(a, status), status); 3993 } 3994 /* Infinity */ 3995 if (!ieee) { 3996 float_raise(float_flag_invalid, status); 3997 return packFloat16(aSign, 0x1f, 0x3ff); 3998 } 3999 return packFloat16(aSign, 0x1f, 0); 4000 } 4001 shift64RightJamming(aSig, 29, &aSig); 4002 zSig = aSig; 4003 if (aExp == 0 && zSig == 0) { 4004 return packFloat16(aSign, 0, 0); 4005 } 4006 /* Decimal point between bits 22 and 23. Note that we add the 1 bit 4007 * even if the input is denormal; however this is harmless because 4008 * the largest possible single-precision denormal is still smaller 4009 * than the smallest representable half-precision denormal, and so we 4010 * will end up ignoring aSig and returning via the "always return zero" 4011 * codepath. 4012 */ 4013 zSig |= 0x00800000; 4014 aExp -= 0x3F1; 4015 4016 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status); 4017 } 4018 4019 /*---------------------------------------------------------------------------- 4020 | Returns the result of converting the double-precision floating-point value 4021 | `a' to the extended double-precision floating-point format. The conversion 4022 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 4023 | Arithmetic. 4024 *----------------------------------------------------------------------------*/ 4025 4026 floatx80 float64_to_floatx80(float64 a, float_status *status) 4027 { 4028 flag aSign; 4029 int aExp; 4030 uint64_t aSig; 4031 4032 a = float64_squash_input_denormal(a, status); 4033 aSig = extractFloat64Frac( a ); 4034 aExp = extractFloat64Exp( a ); 4035 aSign = extractFloat64Sign( a ); 4036 if ( aExp == 0x7FF ) { 4037 if (aSig) { 4038 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status); 4039 } 4040 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4041 } 4042 if ( aExp == 0 ) { 4043 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); 4044 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4045 } 4046 return 4047 packFloatx80( 4048 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); 4049 4050 } 4051 4052 /*---------------------------------------------------------------------------- 4053 | Returns the result of converting the double-precision floating-point value 4054 | `a' to the quadruple-precision floating-point format. The conversion is 4055 | performed according to the IEC/IEEE Standard for Binary Floating-Point 4056 | Arithmetic. 4057 *----------------------------------------------------------------------------*/ 4058 4059 float128 float64_to_float128(float64 a, float_status *status) 4060 { 4061 flag aSign; 4062 int aExp; 4063 uint64_t aSig, zSig0, zSig1; 4064 4065 a = float64_squash_input_denormal(a, status); 4066 aSig = extractFloat64Frac( a ); 4067 aExp = extractFloat64Exp( a ); 4068 aSign = extractFloat64Sign( a ); 4069 if ( aExp == 0x7FF ) { 4070 if (aSig) { 4071 return commonNaNToFloat128(float64ToCommonNaN(a, status), status); 4072 } 4073 return packFloat128( aSign, 0x7FFF, 0, 0 ); 4074 } 4075 if ( aExp == 0 ) { 4076 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); 4077 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4078 --aExp; 4079 } 4080 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); 4081 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); 4082 4083 } 4084 4085 4086 /*---------------------------------------------------------------------------- 4087 | Returns the remainder of the double-precision floating-point value `a' 4088 | with respect to the corresponding value `b'. The operation is performed 4089 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4090 *----------------------------------------------------------------------------*/ 4091 4092 float64 float64_rem(float64 a, float64 b, float_status *status) 4093 { 4094 flag aSign, zSign; 4095 int aExp, bExp, expDiff; 4096 uint64_t aSig, bSig; 4097 uint64_t q, alternateASig; 4098 int64_t sigMean; 4099 4100 a = float64_squash_input_denormal(a, status); 4101 b = float64_squash_input_denormal(b, status); 4102 aSig = extractFloat64Frac( a ); 4103 aExp = extractFloat64Exp( a ); 4104 aSign = extractFloat64Sign( a ); 4105 bSig = extractFloat64Frac( b ); 4106 bExp = extractFloat64Exp( b ); 4107 if ( aExp == 0x7FF ) { 4108 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { 4109 return propagateFloat64NaN(a, b, status); 4110 } 4111 float_raise(float_flag_invalid, status); 4112 return float64_default_nan(status); 4113 } 4114 if ( bExp == 0x7FF ) { 4115 if (bSig) { 4116 return propagateFloat64NaN(a, b, status); 4117 } 4118 return a; 4119 } 4120 if ( bExp == 0 ) { 4121 if ( bSig == 0 ) { 4122 float_raise(float_flag_invalid, status); 4123 return float64_default_nan(status); 4124 } 4125 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); 4126 } 4127 if ( aExp == 0 ) { 4128 if ( aSig == 0 ) return a; 4129 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4130 } 4131 expDiff = aExp - bExp; 4132 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; 4133 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; 4134 if ( expDiff < 0 ) { 4135 if ( expDiff < -1 ) return a; 4136 aSig >>= 1; 4137 } 4138 q = ( bSig <= aSig ); 4139 if ( q ) aSig -= bSig; 4140 expDiff -= 64; 4141 while ( 0 < expDiff ) { 4142 q = estimateDiv128To64( aSig, 0, bSig ); 4143 q = ( 2 < q ) ? q - 2 : 0; 4144 aSig = - ( ( bSig>>2 ) * q ); 4145 expDiff -= 62; 4146 } 4147 expDiff += 64; 4148 if ( 0 < expDiff ) { 4149 q = estimateDiv128To64( aSig, 0, bSig ); 4150 q = ( 2 < q ) ? q - 2 : 0; 4151 q >>= 64 - expDiff; 4152 bSig >>= 2; 4153 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; 4154 } 4155 else { 4156 aSig >>= 2; 4157 bSig >>= 2; 4158 } 4159 do { 4160 alternateASig = aSig; 4161 ++q; 4162 aSig -= bSig; 4163 } while ( 0 <= (int64_t) aSig ); 4164 sigMean = aSig + alternateASig; 4165 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { 4166 aSig = alternateASig; 4167 } 4168 zSign = ( (int64_t) aSig < 0 ); 4169 if ( zSign ) aSig = - aSig; 4170 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status); 4171 4172 } 4173 4174 /*---------------------------------------------------------------------------- 4175 | Returns the binary log of the double-precision floating-point value `a'. 4176 | The operation is performed according to the IEC/IEEE Standard for Binary 4177 | Floating-Point Arithmetic. 4178 *----------------------------------------------------------------------------*/ 4179 float64 float64_log2(float64 a, float_status *status) 4180 { 4181 flag aSign, zSign; 4182 int aExp; 4183 uint64_t aSig, aSig0, aSig1, zSig, i; 4184 a = float64_squash_input_denormal(a, status); 4185 4186 aSig = extractFloat64Frac( a ); 4187 aExp = extractFloat64Exp( a ); 4188 aSign = extractFloat64Sign( a ); 4189 4190 if ( aExp == 0 ) { 4191 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 ); 4192 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); 4193 } 4194 if ( aSign ) { 4195 float_raise(float_flag_invalid, status); 4196 return float64_default_nan(status); 4197 } 4198 if ( aExp == 0x7FF ) { 4199 if (aSig) { 4200 return propagateFloat64NaN(a, float64_zero, status); 4201 } 4202 return a; 4203 } 4204 4205 aExp -= 0x3FF; 4206 aSig |= LIT64( 0x0010000000000000 ); 4207 zSign = aExp < 0; 4208 zSig = (uint64_t)aExp << 52; 4209 for (i = 1LL << 51; i > 0; i >>= 1) { 4210 mul64To128( aSig, aSig, &aSig0, &aSig1 ); 4211 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 ); 4212 if ( aSig & LIT64( 0x0020000000000000 ) ) { 4213 aSig >>= 1; 4214 zSig |= i; 4215 } 4216 } 4217 4218 if ( zSign ) 4219 zSig = -zSig; 4220 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status); 4221 } 4222 4223 /*---------------------------------------------------------------------------- 4224 | Returns 1 if the double-precision floating-point value `a' is equal to the 4225 | corresponding value `b', and 0 otherwise. The invalid exception is raised 4226 | if either operand is a NaN. Otherwise, the comparison is performed 4227 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4228 *----------------------------------------------------------------------------*/ 4229 4230 int float64_eq(float64 a, float64 b, float_status *status) 4231 { 4232 uint64_t av, bv; 4233 a = float64_squash_input_denormal(a, status); 4234 b = float64_squash_input_denormal(b, status); 4235 4236 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4237 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4238 ) { 4239 float_raise(float_flag_invalid, status); 4240 return 0; 4241 } 4242 av = float64_val(a); 4243 bv = float64_val(b); 4244 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4245 4246 } 4247 4248 /*---------------------------------------------------------------------------- 4249 | Returns 1 if the double-precision floating-point value `a' is less than or 4250 | equal to the corresponding value `b', and 0 otherwise. The invalid 4251 | exception is raised if either operand is a NaN. The comparison is performed 4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4253 *----------------------------------------------------------------------------*/ 4254 4255 int float64_le(float64 a, float64 b, float_status *status) 4256 { 4257 flag aSign, bSign; 4258 uint64_t av, bv; 4259 a = float64_squash_input_denormal(a, status); 4260 b = float64_squash_input_denormal(b, status); 4261 4262 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4263 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4264 ) { 4265 float_raise(float_flag_invalid, status); 4266 return 0; 4267 } 4268 aSign = extractFloat64Sign( a ); 4269 bSign = extractFloat64Sign( b ); 4270 av = float64_val(a); 4271 bv = float64_val(b); 4272 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4273 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4274 4275 } 4276 4277 /*---------------------------------------------------------------------------- 4278 | Returns 1 if the double-precision floating-point value `a' is less than 4279 | the corresponding value `b', and 0 otherwise. The invalid exception is 4280 | raised if either operand is a NaN. The comparison is performed according 4281 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4282 *----------------------------------------------------------------------------*/ 4283 4284 int float64_lt(float64 a, float64 b, float_status *status) 4285 { 4286 flag aSign, bSign; 4287 uint64_t av, bv; 4288 4289 a = float64_squash_input_denormal(a, status); 4290 b = float64_squash_input_denormal(b, status); 4291 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4292 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4293 ) { 4294 float_raise(float_flag_invalid, status); 4295 return 0; 4296 } 4297 aSign = extractFloat64Sign( a ); 4298 bSign = extractFloat64Sign( b ); 4299 av = float64_val(a); 4300 bv = float64_val(b); 4301 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4302 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4303 4304 } 4305 4306 /*---------------------------------------------------------------------------- 4307 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4308 | be compared, and 0 otherwise. The invalid exception is raised if either 4309 | operand is a NaN. The comparison is performed according to the IEC/IEEE 4310 | Standard for Binary Floating-Point Arithmetic. 4311 *----------------------------------------------------------------------------*/ 4312 4313 int float64_unordered(float64 a, float64 b, float_status *status) 4314 { 4315 a = float64_squash_input_denormal(a, status); 4316 b = float64_squash_input_denormal(b, status); 4317 4318 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4319 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4320 ) { 4321 float_raise(float_flag_invalid, status); 4322 return 1; 4323 } 4324 return 0; 4325 } 4326 4327 /*---------------------------------------------------------------------------- 4328 | Returns 1 if the double-precision floating-point value `a' is equal to the 4329 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4330 | exception.The comparison is performed according to the IEC/IEEE Standard 4331 | for Binary Floating-Point Arithmetic. 4332 *----------------------------------------------------------------------------*/ 4333 4334 int float64_eq_quiet(float64 a, float64 b, float_status *status) 4335 { 4336 uint64_t av, bv; 4337 a = float64_squash_input_denormal(a, status); 4338 b = float64_squash_input_denormal(b, status); 4339 4340 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4341 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4342 ) { 4343 if (float64_is_signaling_nan(a, status) 4344 || float64_is_signaling_nan(b, status)) { 4345 float_raise(float_flag_invalid, status); 4346 } 4347 return 0; 4348 } 4349 av = float64_val(a); 4350 bv = float64_val(b); 4351 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4352 4353 } 4354 4355 /*---------------------------------------------------------------------------- 4356 | Returns 1 if the double-precision floating-point value `a' is less than or 4357 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 4358 | cause an exception. Otherwise, the comparison is performed according to the 4359 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4360 *----------------------------------------------------------------------------*/ 4361 4362 int float64_le_quiet(float64 a, float64 b, float_status *status) 4363 { 4364 flag aSign, bSign; 4365 uint64_t av, bv; 4366 a = float64_squash_input_denormal(a, status); 4367 b = float64_squash_input_denormal(b, status); 4368 4369 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4370 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4371 ) { 4372 if (float64_is_signaling_nan(a, status) 4373 || float64_is_signaling_nan(b, status)) { 4374 float_raise(float_flag_invalid, status); 4375 } 4376 return 0; 4377 } 4378 aSign = extractFloat64Sign( a ); 4379 bSign = extractFloat64Sign( b ); 4380 av = float64_val(a); 4381 bv = float64_val(b); 4382 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 ); 4383 return ( av == bv ) || ( aSign ^ ( av < bv ) ); 4384 4385 } 4386 4387 /*---------------------------------------------------------------------------- 4388 | Returns 1 if the double-precision floating-point value `a' is less than 4389 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 4390 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 4391 | Standard for Binary Floating-Point Arithmetic. 4392 *----------------------------------------------------------------------------*/ 4393 4394 int float64_lt_quiet(float64 a, float64 b, float_status *status) 4395 { 4396 flag aSign, bSign; 4397 uint64_t av, bv; 4398 a = float64_squash_input_denormal(a, status); 4399 b = float64_squash_input_denormal(b, status); 4400 4401 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4402 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4403 ) { 4404 if (float64_is_signaling_nan(a, status) 4405 || float64_is_signaling_nan(b, status)) { 4406 float_raise(float_flag_invalid, status); 4407 } 4408 return 0; 4409 } 4410 aSign = extractFloat64Sign( a ); 4411 bSign = extractFloat64Sign( b ); 4412 av = float64_val(a); 4413 bv = float64_val(b); 4414 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 ); 4415 return ( av != bv ) && ( aSign ^ ( av < bv ) ); 4416 4417 } 4418 4419 /*---------------------------------------------------------------------------- 4420 | Returns 1 if the double-precision floating-point values `a' and `b' cannot 4421 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 4422 | comparison is performed according to the IEC/IEEE Standard for Binary 4423 | Floating-Point Arithmetic. 4424 *----------------------------------------------------------------------------*/ 4425 4426 int float64_unordered_quiet(float64 a, float64 b, float_status *status) 4427 { 4428 a = float64_squash_input_denormal(a, status); 4429 b = float64_squash_input_denormal(b, status); 4430 4431 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) 4432 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) 4433 ) { 4434 if (float64_is_signaling_nan(a, status) 4435 || float64_is_signaling_nan(b, status)) { 4436 float_raise(float_flag_invalid, status); 4437 } 4438 return 1; 4439 } 4440 return 0; 4441 } 4442 4443 /*---------------------------------------------------------------------------- 4444 | Returns the result of converting the extended double-precision floating- 4445 | point value `a' to the 32-bit two's complement integer format. The 4446 | conversion is performed according to the IEC/IEEE Standard for Binary 4447 | Floating-Point Arithmetic---which means in particular that the conversion 4448 | is rounded according to the current rounding mode. If `a' is a NaN, the 4449 | largest positive integer is returned. Otherwise, if the conversion 4450 | overflows, the largest integer with the same sign as `a' is returned. 4451 *----------------------------------------------------------------------------*/ 4452 4453 int32_t floatx80_to_int32(floatx80 a, float_status *status) 4454 { 4455 flag aSign; 4456 int32_t aExp, shiftCount; 4457 uint64_t aSig; 4458 4459 if (floatx80_invalid_encoding(a)) { 4460 float_raise(float_flag_invalid, status); 4461 return 1 << 31; 4462 } 4463 aSig = extractFloatx80Frac( a ); 4464 aExp = extractFloatx80Exp( a ); 4465 aSign = extractFloatx80Sign( a ); 4466 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4467 shiftCount = 0x4037 - aExp; 4468 if ( shiftCount <= 0 ) shiftCount = 1; 4469 shift64RightJamming( aSig, shiftCount, &aSig ); 4470 return roundAndPackInt32(aSign, aSig, status); 4471 4472 } 4473 4474 /*---------------------------------------------------------------------------- 4475 | Returns the result of converting the extended double-precision floating- 4476 | point value `a' to the 32-bit two's complement integer format. The 4477 | conversion is performed according to the IEC/IEEE Standard for Binary 4478 | Floating-Point Arithmetic, except that the conversion is always rounded 4479 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4480 | Otherwise, if the conversion overflows, the largest integer with the same 4481 | sign as `a' is returned. 4482 *----------------------------------------------------------------------------*/ 4483 4484 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status) 4485 { 4486 flag aSign; 4487 int32_t aExp, shiftCount; 4488 uint64_t aSig, savedASig; 4489 int32_t z; 4490 4491 if (floatx80_invalid_encoding(a)) { 4492 float_raise(float_flag_invalid, status); 4493 return 1 << 31; 4494 } 4495 aSig = extractFloatx80Frac( a ); 4496 aExp = extractFloatx80Exp( a ); 4497 aSign = extractFloatx80Sign( a ); 4498 if ( 0x401E < aExp ) { 4499 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0; 4500 goto invalid; 4501 } 4502 else if ( aExp < 0x3FFF ) { 4503 if (aExp || aSig) { 4504 status->float_exception_flags |= float_flag_inexact; 4505 } 4506 return 0; 4507 } 4508 shiftCount = 0x403E - aExp; 4509 savedASig = aSig; 4510 aSig >>= shiftCount; 4511 z = aSig; 4512 if ( aSign ) z = - z; 4513 if ( ( z < 0 ) ^ aSign ) { 4514 invalid: 4515 float_raise(float_flag_invalid, status); 4516 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 4517 } 4518 if ( ( aSig<<shiftCount ) != savedASig ) { 4519 status->float_exception_flags |= float_flag_inexact; 4520 } 4521 return z; 4522 4523 } 4524 4525 /*---------------------------------------------------------------------------- 4526 | Returns the result of converting the extended double-precision floating- 4527 | point value `a' to the 64-bit two's complement integer format. The 4528 | conversion is performed according to the IEC/IEEE Standard for Binary 4529 | Floating-Point Arithmetic---which means in particular that the conversion 4530 | is rounded according to the current rounding mode. If `a' is a NaN, 4531 | the largest positive integer is returned. Otherwise, if the conversion 4532 | overflows, the largest integer with the same sign as `a' is returned. 4533 *----------------------------------------------------------------------------*/ 4534 4535 int64_t floatx80_to_int64(floatx80 a, float_status *status) 4536 { 4537 flag aSign; 4538 int32_t aExp, shiftCount; 4539 uint64_t aSig, aSigExtra; 4540 4541 if (floatx80_invalid_encoding(a)) { 4542 float_raise(float_flag_invalid, status); 4543 return 1ULL << 63; 4544 } 4545 aSig = extractFloatx80Frac( a ); 4546 aExp = extractFloatx80Exp( a ); 4547 aSign = extractFloatx80Sign( a ); 4548 shiftCount = 0x403E - aExp; 4549 if ( shiftCount <= 0 ) { 4550 if ( shiftCount ) { 4551 float_raise(float_flag_invalid, status); 4552 if ( ! aSign 4553 || ( ( aExp == 0x7FFF ) 4554 && ( aSig != LIT64( 0x8000000000000000 ) ) ) 4555 ) { 4556 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4557 } 4558 return (int64_t) LIT64( 0x8000000000000000 ); 4559 } 4560 aSigExtra = 0; 4561 } 4562 else { 4563 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); 4564 } 4565 return roundAndPackInt64(aSign, aSig, aSigExtra, status); 4566 4567 } 4568 4569 /*---------------------------------------------------------------------------- 4570 | Returns the result of converting the extended double-precision floating- 4571 | point value `a' to the 64-bit two's complement integer format. The 4572 | conversion is performed according to the IEC/IEEE Standard for Binary 4573 | Floating-Point Arithmetic, except that the conversion is always rounded 4574 | toward zero. If `a' is a NaN, the largest positive integer is returned. 4575 | Otherwise, if the conversion overflows, the largest integer with the same 4576 | sign as `a' is returned. 4577 *----------------------------------------------------------------------------*/ 4578 4579 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status) 4580 { 4581 flag aSign; 4582 int32_t aExp, shiftCount; 4583 uint64_t aSig; 4584 int64_t z; 4585 4586 if (floatx80_invalid_encoding(a)) { 4587 float_raise(float_flag_invalid, status); 4588 return 1ULL << 63; 4589 } 4590 aSig = extractFloatx80Frac( a ); 4591 aExp = extractFloatx80Exp( a ); 4592 aSign = extractFloatx80Sign( a ); 4593 shiftCount = aExp - 0x403E; 4594 if ( 0 <= shiftCount ) { 4595 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); 4596 if ( ( a.high != 0xC03E ) || aSig ) { 4597 float_raise(float_flag_invalid, status); 4598 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { 4599 return LIT64( 0x7FFFFFFFFFFFFFFF ); 4600 } 4601 } 4602 return (int64_t) LIT64( 0x8000000000000000 ); 4603 } 4604 else if ( aExp < 0x3FFF ) { 4605 if (aExp | aSig) { 4606 status->float_exception_flags |= float_flag_inexact; 4607 } 4608 return 0; 4609 } 4610 z = aSig>>( - shiftCount ); 4611 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) { 4612 status->float_exception_flags |= float_flag_inexact; 4613 } 4614 if ( aSign ) z = - z; 4615 return z; 4616 4617 } 4618 4619 /*---------------------------------------------------------------------------- 4620 | Returns the result of converting the extended double-precision floating- 4621 | point value `a' to the single-precision floating-point format. The 4622 | conversion is performed according to the IEC/IEEE Standard for Binary 4623 | Floating-Point Arithmetic. 4624 *----------------------------------------------------------------------------*/ 4625 4626 float32 floatx80_to_float32(floatx80 a, float_status *status) 4627 { 4628 flag aSign; 4629 int32_t aExp; 4630 uint64_t aSig; 4631 4632 if (floatx80_invalid_encoding(a)) { 4633 float_raise(float_flag_invalid, status); 4634 return float32_default_nan(status); 4635 } 4636 aSig = extractFloatx80Frac( a ); 4637 aExp = extractFloatx80Exp( a ); 4638 aSign = extractFloatx80Sign( a ); 4639 if ( aExp == 0x7FFF ) { 4640 if ( (uint64_t) ( aSig<<1 ) ) { 4641 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status); 4642 } 4643 return packFloat32( aSign, 0xFF, 0 ); 4644 } 4645 shift64RightJamming( aSig, 33, &aSig ); 4646 if ( aExp || aSig ) aExp -= 0x3F81; 4647 return roundAndPackFloat32(aSign, aExp, aSig, status); 4648 4649 } 4650 4651 /*---------------------------------------------------------------------------- 4652 | Returns the result of converting the extended double-precision floating- 4653 | point value `a' to the double-precision floating-point format. The 4654 | conversion is performed according to the IEC/IEEE Standard for Binary 4655 | Floating-Point Arithmetic. 4656 *----------------------------------------------------------------------------*/ 4657 4658 float64 floatx80_to_float64(floatx80 a, float_status *status) 4659 { 4660 flag aSign; 4661 int32_t aExp; 4662 uint64_t aSig, zSig; 4663 4664 if (floatx80_invalid_encoding(a)) { 4665 float_raise(float_flag_invalid, status); 4666 return float64_default_nan(status); 4667 } 4668 aSig = extractFloatx80Frac( a ); 4669 aExp = extractFloatx80Exp( a ); 4670 aSign = extractFloatx80Sign( a ); 4671 if ( aExp == 0x7FFF ) { 4672 if ( (uint64_t) ( aSig<<1 ) ) { 4673 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status); 4674 } 4675 return packFloat64( aSign, 0x7FF, 0 ); 4676 } 4677 shift64RightJamming( aSig, 1, &zSig ); 4678 if ( aExp || aSig ) aExp -= 0x3C01; 4679 return roundAndPackFloat64(aSign, aExp, zSig, status); 4680 4681 } 4682 4683 /*---------------------------------------------------------------------------- 4684 | Returns the result of converting the extended double-precision floating- 4685 | point value `a' to the quadruple-precision floating-point format. The 4686 | conversion is performed according to the IEC/IEEE Standard for Binary 4687 | Floating-Point Arithmetic. 4688 *----------------------------------------------------------------------------*/ 4689 4690 float128 floatx80_to_float128(floatx80 a, float_status *status) 4691 { 4692 flag aSign; 4693 int aExp; 4694 uint64_t aSig, zSig0, zSig1; 4695 4696 if (floatx80_invalid_encoding(a)) { 4697 float_raise(float_flag_invalid, status); 4698 return float128_default_nan(status); 4699 } 4700 aSig = extractFloatx80Frac( a ); 4701 aExp = extractFloatx80Exp( a ); 4702 aSign = extractFloatx80Sign( a ); 4703 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) { 4704 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status); 4705 } 4706 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); 4707 return packFloat128( aSign, aExp, zSig0, zSig1 ); 4708 4709 } 4710 4711 /*---------------------------------------------------------------------------- 4712 | Rounds the extended double-precision floating-point value `a' 4713 | to the precision provided by floatx80_rounding_precision and returns the 4714 | result as an extended double-precision floating-point value. 4715 | The operation is performed according to the IEC/IEEE Standard for Binary 4716 | Floating-Point Arithmetic. 4717 *----------------------------------------------------------------------------*/ 4718 4719 floatx80 floatx80_round(floatx80 a, float_status *status) 4720 { 4721 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4722 extractFloatx80Sign(a), 4723 extractFloatx80Exp(a), 4724 extractFloatx80Frac(a), 0, status); 4725 } 4726 4727 /*---------------------------------------------------------------------------- 4728 | Rounds the extended double-precision floating-point value `a' to an integer, 4729 | and returns the result as an extended quadruple-precision floating-point 4730 | value. The operation is performed according to the IEC/IEEE Standard for 4731 | Binary Floating-Point Arithmetic. 4732 *----------------------------------------------------------------------------*/ 4733 4734 floatx80 floatx80_round_to_int(floatx80 a, float_status *status) 4735 { 4736 flag aSign; 4737 int32_t aExp; 4738 uint64_t lastBitMask, roundBitsMask; 4739 floatx80 z; 4740 4741 if (floatx80_invalid_encoding(a)) { 4742 float_raise(float_flag_invalid, status); 4743 return floatx80_default_nan(status); 4744 } 4745 aExp = extractFloatx80Exp( a ); 4746 if ( 0x403E <= aExp ) { 4747 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) { 4748 return propagateFloatx80NaN(a, a, status); 4749 } 4750 return a; 4751 } 4752 if ( aExp < 0x3FFF ) { 4753 if ( ( aExp == 0 ) 4754 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { 4755 return a; 4756 } 4757 status->float_exception_flags |= float_flag_inexact; 4758 aSign = extractFloatx80Sign( a ); 4759 switch (status->float_rounding_mode) { 4760 case float_round_nearest_even: 4761 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) 4762 ) { 4763 return 4764 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4765 } 4766 break; 4767 case float_round_ties_away: 4768 if (aExp == 0x3FFE) { 4769 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000)); 4770 } 4771 break; 4772 case float_round_down: 4773 return 4774 aSign ? 4775 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) 4776 : packFloatx80( 0, 0, 0 ); 4777 case float_round_up: 4778 return 4779 aSign ? packFloatx80( 1, 0, 0 ) 4780 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); 4781 } 4782 return packFloatx80( aSign, 0, 0 ); 4783 } 4784 lastBitMask = 1; 4785 lastBitMask <<= 0x403E - aExp; 4786 roundBitsMask = lastBitMask - 1; 4787 z = a; 4788 switch (status->float_rounding_mode) { 4789 case float_round_nearest_even: 4790 z.low += lastBitMask>>1; 4791 if ((z.low & roundBitsMask) == 0) { 4792 z.low &= ~lastBitMask; 4793 } 4794 break; 4795 case float_round_ties_away: 4796 z.low += lastBitMask >> 1; 4797 break; 4798 case float_round_to_zero: 4799 break; 4800 case float_round_up: 4801 if (!extractFloatx80Sign(z)) { 4802 z.low += roundBitsMask; 4803 } 4804 break; 4805 case float_round_down: 4806 if (extractFloatx80Sign(z)) { 4807 z.low += roundBitsMask; 4808 } 4809 break; 4810 default: 4811 abort(); 4812 } 4813 z.low &= ~ roundBitsMask; 4814 if ( z.low == 0 ) { 4815 ++z.high; 4816 z.low = LIT64( 0x8000000000000000 ); 4817 } 4818 if (z.low != a.low) { 4819 status->float_exception_flags |= float_flag_inexact; 4820 } 4821 return z; 4822 4823 } 4824 4825 /*---------------------------------------------------------------------------- 4826 | Returns the result of adding the absolute values of the extended double- 4827 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is 4828 | negated before being returned. `zSign' is ignored if the result is a NaN. 4829 | The addition is performed according to the IEC/IEEE Standard for Binary 4830 | Floating-Point Arithmetic. 4831 *----------------------------------------------------------------------------*/ 4832 4833 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4834 float_status *status) 4835 { 4836 int32_t aExp, bExp, zExp; 4837 uint64_t aSig, bSig, zSig0, zSig1; 4838 int32_t expDiff; 4839 4840 aSig = extractFloatx80Frac( a ); 4841 aExp = extractFloatx80Exp( a ); 4842 bSig = extractFloatx80Frac( b ); 4843 bExp = extractFloatx80Exp( b ); 4844 expDiff = aExp - bExp; 4845 if ( 0 < expDiff ) { 4846 if ( aExp == 0x7FFF ) { 4847 if ((uint64_t)(aSig << 1)) { 4848 return propagateFloatx80NaN(a, b, status); 4849 } 4850 return a; 4851 } 4852 if ( bExp == 0 ) --expDiff; 4853 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4854 zExp = aExp; 4855 } 4856 else if ( expDiff < 0 ) { 4857 if ( bExp == 0x7FFF ) { 4858 if ((uint64_t)(bSig << 1)) { 4859 return propagateFloatx80NaN(a, b, status); 4860 } 4861 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4862 } 4863 if ( aExp == 0 ) ++expDiff; 4864 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4865 zExp = bExp; 4866 } 4867 else { 4868 if ( aExp == 0x7FFF ) { 4869 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4870 return propagateFloatx80NaN(a, b, status); 4871 } 4872 return a; 4873 } 4874 zSig1 = 0; 4875 zSig0 = aSig + bSig; 4876 if ( aExp == 0 ) { 4877 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); 4878 goto roundAndPack; 4879 } 4880 zExp = aExp; 4881 goto shiftRight1; 4882 } 4883 zSig0 = aSig + bSig; 4884 if ( (int64_t) zSig0 < 0 ) goto roundAndPack; 4885 shiftRight1: 4886 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); 4887 zSig0 |= LIT64( 0x8000000000000000 ); 4888 ++zExp; 4889 roundAndPack: 4890 return roundAndPackFloatx80(status->floatx80_rounding_precision, 4891 zSign, zExp, zSig0, zSig1, status); 4892 } 4893 4894 /*---------------------------------------------------------------------------- 4895 | Returns the result of subtracting the absolute values of the extended 4896 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the 4897 | difference is negated before being returned. `zSign' is ignored if the 4898 | result is a NaN. The subtraction is performed according to the IEC/IEEE 4899 | Standard for Binary Floating-Point Arithmetic. 4900 *----------------------------------------------------------------------------*/ 4901 4902 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign, 4903 float_status *status) 4904 { 4905 int32_t aExp, bExp, zExp; 4906 uint64_t aSig, bSig, zSig0, zSig1; 4907 int32_t expDiff; 4908 4909 aSig = extractFloatx80Frac( a ); 4910 aExp = extractFloatx80Exp( a ); 4911 bSig = extractFloatx80Frac( b ); 4912 bExp = extractFloatx80Exp( b ); 4913 expDiff = aExp - bExp; 4914 if ( 0 < expDiff ) goto aExpBigger; 4915 if ( expDiff < 0 ) goto bExpBigger; 4916 if ( aExp == 0x7FFF ) { 4917 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) { 4918 return propagateFloatx80NaN(a, b, status); 4919 } 4920 float_raise(float_flag_invalid, status); 4921 return floatx80_default_nan(status); 4922 } 4923 if ( aExp == 0 ) { 4924 aExp = 1; 4925 bExp = 1; 4926 } 4927 zSig1 = 0; 4928 if ( bSig < aSig ) goto aBigger; 4929 if ( aSig < bSig ) goto bBigger; 4930 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0); 4931 bExpBigger: 4932 if ( bExp == 0x7FFF ) { 4933 if ((uint64_t)(bSig << 1)) { 4934 return propagateFloatx80NaN(a, b, status); 4935 } 4936 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); 4937 } 4938 if ( aExp == 0 ) ++expDiff; 4939 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); 4940 bBigger: 4941 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); 4942 zExp = bExp; 4943 zSign ^= 1; 4944 goto normalizeRoundAndPack; 4945 aExpBigger: 4946 if ( aExp == 0x7FFF ) { 4947 if ((uint64_t)(aSig << 1)) { 4948 return propagateFloatx80NaN(a, b, status); 4949 } 4950 return a; 4951 } 4952 if ( bExp == 0 ) --expDiff; 4953 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); 4954 aBigger: 4955 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); 4956 zExp = aExp; 4957 normalizeRoundAndPack: 4958 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 4959 zSign, zExp, zSig0, zSig1, status); 4960 } 4961 4962 /*---------------------------------------------------------------------------- 4963 | Returns the result of adding the extended double-precision floating-point 4964 | values `a' and `b'. The operation is performed according to the IEC/IEEE 4965 | Standard for Binary Floating-Point Arithmetic. 4966 *----------------------------------------------------------------------------*/ 4967 4968 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status) 4969 { 4970 flag aSign, bSign; 4971 4972 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4973 float_raise(float_flag_invalid, status); 4974 return floatx80_default_nan(status); 4975 } 4976 aSign = extractFloatx80Sign( a ); 4977 bSign = extractFloatx80Sign( b ); 4978 if ( aSign == bSign ) { 4979 return addFloatx80Sigs(a, b, aSign, status); 4980 } 4981 else { 4982 return subFloatx80Sigs(a, b, aSign, status); 4983 } 4984 4985 } 4986 4987 /*---------------------------------------------------------------------------- 4988 | Returns the result of subtracting the extended double-precision floating- 4989 | point values `a' and `b'. The operation is performed according to the 4990 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 4991 *----------------------------------------------------------------------------*/ 4992 4993 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status) 4994 { 4995 flag aSign, bSign; 4996 4997 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 4998 float_raise(float_flag_invalid, status); 4999 return floatx80_default_nan(status); 5000 } 5001 aSign = extractFloatx80Sign( a ); 5002 bSign = extractFloatx80Sign( b ); 5003 if ( aSign == bSign ) { 5004 return subFloatx80Sigs(a, b, aSign, status); 5005 } 5006 else { 5007 return addFloatx80Sigs(a, b, aSign, status); 5008 } 5009 5010 } 5011 5012 /*---------------------------------------------------------------------------- 5013 | Returns the result of multiplying the extended double-precision floating- 5014 | point values `a' and `b'. The operation is performed according to the 5015 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5016 *----------------------------------------------------------------------------*/ 5017 5018 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status) 5019 { 5020 flag aSign, bSign, zSign; 5021 int32_t aExp, bExp, zExp; 5022 uint64_t aSig, bSig, zSig0, zSig1; 5023 5024 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5025 float_raise(float_flag_invalid, status); 5026 return floatx80_default_nan(status); 5027 } 5028 aSig = extractFloatx80Frac( a ); 5029 aExp = extractFloatx80Exp( a ); 5030 aSign = extractFloatx80Sign( a ); 5031 bSig = extractFloatx80Frac( b ); 5032 bExp = extractFloatx80Exp( b ); 5033 bSign = extractFloatx80Sign( b ); 5034 zSign = aSign ^ bSign; 5035 if ( aExp == 0x7FFF ) { 5036 if ( (uint64_t) ( aSig<<1 ) 5037 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5038 return propagateFloatx80NaN(a, b, status); 5039 } 5040 if ( ( bExp | bSig ) == 0 ) goto invalid; 5041 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5042 } 5043 if ( bExp == 0x7FFF ) { 5044 if ((uint64_t)(bSig << 1)) { 5045 return propagateFloatx80NaN(a, b, status); 5046 } 5047 if ( ( aExp | aSig ) == 0 ) { 5048 invalid: 5049 float_raise(float_flag_invalid, status); 5050 return floatx80_default_nan(status); 5051 } 5052 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5053 } 5054 if ( aExp == 0 ) { 5055 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5056 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5057 } 5058 if ( bExp == 0 ) { 5059 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5060 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5061 } 5062 zExp = aExp + bExp - 0x3FFE; 5063 mul64To128( aSig, bSig, &zSig0, &zSig1 ); 5064 if ( 0 < (int64_t) zSig0 ) { 5065 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); 5066 --zExp; 5067 } 5068 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5069 zSign, zExp, zSig0, zSig1, status); 5070 } 5071 5072 /*---------------------------------------------------------------------------- 5073 | Returns the result of dividing the extended double-precision floating-point 5074 | value `a' by the corresponding value `b'. The operation is performed 5075 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5076 *----------------------------------------------------------------------------*/ 5077 5078 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status) 5079 { 5080 flag aSign, bSign, zSign; 5081 int32_t aExp, bExp, zExp; 5082 uint64_t aSig, bSig, zSig0, zSig1; 5083 uint64_t rem0, rem1, rem2, term0, term1, term2; 5084 5085 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5086 float_raise(float_flag_invalid, status); 5087 return floatx80_default_nan(status); 5088 } 5089 aSig = extractFloatx80Frac( a ); 5090 aExp = extractFloatx80Exp( a ); 5091 aSign = extractFloatx80Sign( a ); 5092 bSig = extractFloatx80Frac( b ); 5093 bExp = extractFloatx80Exp( b ); 5094 bSign = extractFloatx80Sign( b ); 5095 zSign = aSign ^ bSign; 5096 if ( aExp == 0x7FFF ) { 5097 if ((uint64_t)(aSig << 1)) { 5098 return propagateFloatx80NaN(a, b, status); 5099 } 5100 if ( bExp == 0x7FFF ) { 5101 if ((uint64_t)(bSig << 1)) { 5102 return propagateFloatx80NaN(a, b, status); 5103 } 5104 goto invalid; 5105 } 5106 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5107 } 5108 if ( bExp == 0x7FFF ) { 5109 if ((uint64_t)(bSig << 1)) { 5110 return propagateFloatx80NaN(a, b, status); 5111 } 5112 return packFloatx80( zSign, 0, 0 ); 5113 } 5114 if ( bExp == 0 ) { 5115 if ( bSig == 0 ) { 5116 if ( ( aExp | aSig ) == 0 ) { 5117 invalid: 5118 float_raise(float_flag_invalid, status); 5119 return floatx80_default_nan(status); 5120 } 5121 float_raise(float_flag_divbyzero, status); 5122 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5123 } 5124 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5125 } 5126 if ( aExp == 0 ) { 5127 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); 5128 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); 5129 } 5130 zExp = aExp - bExp + 0x3FFE; 5131 rem1 = 0; 5132 if ( bSig <= aSig ) { 5133 shift128Right( aSig, 0, 1, &aSig, &rem1 ); 5134 ++zExp; 5135 } 5136 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); 5137 mul64To128( bSig, zSig0, &term0, &term1 ); 5138 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); 5139 while ( (int64_t) rem0 < 0 ) { 5140 --zSig0; 5141 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); 5142 } 5143 zSig1 = estimateDiv128To64( rem1, 0, bSig ); 5144 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) { 5145 mul64To128( bSig, zSig1, &term1, &term2 ); 5146 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5147 while ( (int64_t) rem1 < 0 ) { 5148 --zSig1; 5149 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); 5150 } 5151 zSig1 |= ( ( rem1 | rem2 ) != 0 ); 5152 } 5153 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5154 zSign, zExp, zSig0, zSig1, status); 5155 } 5156 5157 /*---------------------------------------------------------------------------- 5158 | Returns the remainder of the extended double-precision floating-point value 5159 | `a' with respect to the corresponding value `b'. The operation is performed 5160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5161 *----------------------------------------------------------------------------*/ 5162 5163 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status) 5164 { 5165 flag aSign, zSign; 5166 int32_t aExp, bExp, expDiff; 5167 uint64_t aSig0, aSig1, bSig; 5168 uint64_t q, term0, term1, alternateASig0, alternateASig1; 5169 5170 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5171 float_raise(float_flag_invalid, status); 5172 return floatx80_default_nan(status); 5173 } 5174 aSig0 = extractFloatx80Frac( a ); 5175 aExp = extractFloatx80Exp( a ); 5176 aSign = extractFloatx80Sign( a ); 5177 bSig = extractFloatx80Frac( b ); 5178 bExp = extractFloatx80Exp( b ); 5179 if ( aExp == 0x7FFF ) { 5180 if ( (uint64_t) ( aSig0<<1 ) 5181 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) { 5182 return propagateFloatx80NaN(a, b, status); 5183 } 5184 goto invalid; 5185 } 5186 if ( bExp == 0x7FFF ) { 5187 if ((uint64_t)(bSig << 1)) { 5188 return propagateFloatx80NaN(a, b, status); 5189 } 5190 return a; 5191 } 5192 if ( bExp == 0 ) { 5193 if ( bSig == 0 ) { 5194 invalid: 5195 float_raise(float_flag_invalid, status); 5196 return floatx80_default_nan(status); 5197 } 5198 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); 5199 } 5200 if ( aExp == 0 ) { 5201 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a; 5202 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5203 } 5204 bSig |= LIT64( 0x8000000000000000 ); 5205 zSign = aSign; 5206 expDiff = aExp - bExp; 5207 aSig1 = 0; 5208 if ( expDiff < 0 ) { 5209 if ( expDiff < -1 ) return a; 5210 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); 5211 expDiff = 0; 5212 } 5213 q = ( bSig <= aSig0 ); 5214 if ( q ) aSig0 -= bSig; 5215 expDiff -= 64; 5216 while ( 0 < expDiff ) { 5217 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5218 q = ( 2 < q ) ? q - 2 : 0; 5219 mul64To128( bSig, q, &term0, &term1 ); 5220 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5221 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); 5222 expDiff -= 62; 5223 } 5224 expDiff += 64; 5225 if ( 0 < expDiff ) { 5226 q = estimateDiv128To64( aSig0, aSig1, bSig ); 5227 q = ( 2 < q ) ? q - 2 : 0; 5228 q >>= 64 - expDiff; 5229 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); 5230 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5231 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); 5232 while ( le128( term0, term1, aSig0, aSig1 ) ) { 5233 ++q; 5234 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); 5235 } 5236 } 5237 else { 5238 term1 = 0; 5239 term0 = bSig; 5240 } 5241 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); 5242 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5243 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) 5244 && ( q & 1 ) ) 5245 ) { 5246 aSig0 = alternateASig0; 5247 aSig1 = alternateASig1; 5248 zSign = ! zSign; 5249 } 5250 return 5251 normalizeRoundAndPackFloatx80( 5252 80, zSign, bExp + expDiff, aSig0, aSig1, status); 5253 5254 } 5255 5256 /*---------------------------------------------------------------------------- 5257 | Returns the square root of the extended double-precision floating-point 5258 | value `a'. The operation is performed according to the IEC/IEEE Standard 5259 | for Binary Floating-Point Arithmetic. 5260 *----------------------------------------------------------------------------*/ 5261 5262 floatx80 floatx80_sqrt(floatx80 a, float_status *status) 5263 { 5264 flag aSign; 5265 int32_t aExp, zExp; 5266 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0; 5267 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 5268 5269 if (floatx80_invalid_encoding(a)) { 5270 float_raise(float_flag_invalid, status); 5271 return floatx80_default_nan(status); 5272 } 5273 aSig0 = extractFloatx80Frac( a ); 5274 aExp = extractFloatx80Exp( a ); 5275 aSign = extractFloatx80Sign( a ); 5276 if ( aExp == 0x7FFF ) { 5277 if ((uint64_t)(aSig0 << 1)) { 5278 return propagateFloatx80NaN(a, a, status); 5279 } 5280 if ( ! aSign ) return a; 5281 goto invalid; 5282 } 5283 if ( aSign ) { 5284 if ( ( aExp | aSig0 ) == 0 ) return a; 5285 invalid: 5286 float_raise(float_flag_invalid, status); 5287 return floatx80_default_nan(status); 5288 } 5289 if ( aExp == 0 ) { 5290 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); 5291 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); 5292 } 5293 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; 5294 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); 5295 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); 5296 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 5297 doubleZSig0 = zSig0<<1; 5298 mul64To128( zSig0, zSig0, &term0, &term1 ); 5299 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 5300 while ( (int64_t) rem0 < 0 ) { 5301 --zSig0; 5302 doubleZSig0 -= 2; 5303 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 5304 } 5305 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 5306 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { 5307 if ( zSig1 == 0 ) zSig1 = 1; 5308 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 5309 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 5310 mul64To128( zSig1, zSig1, &term2, &term3 ); 5311 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 5312 while ( (int64_t) rem1 < 0 ) { 5313 --zSig1; 5314 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 5315 term3 |= 1; 5316 term2 |= doubleZSig0; 5317 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 5318 } 5319 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 5320 } 5321 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); 5322 zSig0 |= doubleZSig0; 5323 return roundAndPackFloatx80(status->floatx80_rounding_precision, 5324 0, zExp, zSig0, zSig1, status); 5325 } 5326 5327 /*---------------------------------------------------------------------------- 5328 | Returns 1 if the extended double-precision floating-point value `a' is equal 5329 | to the corresponding value `b', and 0 otherwise. The invalid exception is 5330 | raised if either operand is a NaN. Otherwise, the comparison is performed 5331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5332 *----------------------------------------------------------------------------*/ 5333 5334 int floatx80_eq(floatx80 a, floatx80 b, float_status *status) 5335 { 5336 5337 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5338 || (extractFloatx80Exp(a) == 0x7FFF 5339 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5340 || (extractFloatx80Exp(b) == 0x7FFF 5341 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5342 ) { 5343 float_raise(float_flag_invalid, status); 5344 return 0; 5345 } 5346 return 5347 ( a.low == b.low ) 5348 && ( ( a.high == b.high ) 5349 || ( ( a.low == 0 ) 5350 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5351 ); 5352 5353 } 5354 5355 /*---------------------------------------------------------------------------- 5356 | Returns 1 if the extended double-precision floating-point value `a' is 5357 | less than or equal to the corresponding value `b', and 0 otherwise. The 5358 | invalid exception is raised if either operand is a NaN. The comparison is 5359 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5360 | Arithmetic. 5361 *----------------------------------------------------------------------------*/ 5362 5363 int floatx80_le(floatx80 a, floatx80 b, float_status *status) 5364 { 5365 flag aSign, bSign; 5366 5367 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5368 || (extractFloatx80Exp(a) == 0x7FFF 5369 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5370 || (extractFloatx80Exp(b) == 0x7FFF 5371 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5372 ) { 5373 float_raise(float_flag_invalid, status); 5374 return 0; 5375 } 5376 aSign = extractFloatx80Sign( a ); 5377 bSign = extractFloatx80Sign( b ); 5378 if ( aSign != bSign ) { 5379 return 5380 aSign 5381 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5382 == 0 ); 5383 } 5384 return 5385 aSign ? le128( b.high, b.low, a.high, a.low ) 5386 : le128( a.high, a.low, b.high, b.low ); 5387 5388 } 5389 5390 /*---------------------------------------------------------------------------- 5391 | Returns 1 if the extended double-precision floating-point value `a' is 5392 | less than the corresponding value `b', and 0 otherwise. The invalid 5393 | exception is raised if either operand is a NaN. The comparison is performed 5394 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5395 *----------------------------------------------------------------------------*/ 5396 5397 int floatx80_lt(floatx80 a, floatx80 b, float_status *status) 5398 { 5399 flag aSign, bSign; 5400 5401 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5402 || (extractFloatx80Exp(a) == 0x7FFF 5403 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5404 || (extractFloatx80Exp(b) == 0x7FFF 5405 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5406 ) { 5407 float_raise(float_flag_invalid, status); 5408 return 0; 5409 } 5410 aSign = extractFloatx80Sign( a ); 5411 bSign = extractFloatx80Sign( b ); 5412 if ( aSign != bSign ) { 5413 return 5414 aSign 5415 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5416 != 0 ); 5417 } 5418 return 5419 aSign ? lt128( b.high, b.low, a.high, a.low ) 5420 : lt128( a.high, a.low, b.high, b.low ); 5421 5422 } 5423 5424 /*---------------------------------------------------------------------------- 5425 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5426 | cannot be compared, and 0 otherwise. The invalid exception is raised if 5427 | either operand is a NaN. The comparison is performed according to the 5428 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5429 *----------------------------------------------------------------------------*/ 5430 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status) 5431 { 5432 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b) 5433 || (extractFloatx80Exp(a) == 0x7FFF 5434 && (uint64_t) (extractFloatx80Frac(a) << 1)) 5435 || (extractFloatx80Exp(b) == 0x7FFF 5436 && (uint64_t) (extractFloatx80Frac(b) << 1)) 5437 ) { 5438 float_raise(float_flag_invalid, status); 5439 return 1; 5440 } 5441 return 0; 5442 } 5443 5444 /*---------------------------------------------------------------------------- 5445 | Returns 1 if the extended double-precision floating-point value `a' is 5446 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 5447 | cause an exception. The comparison is performed according to the IEC/IEEE 5448 | Standard for Binary Floating-Point Arithmetic. 5449 *----------------------------------------------------------------------------*/ 5450 5451 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status) 5452 { 5453 5454 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5455 float_raise(float_flag_invalid, status); 5456 return 0; 5457 } 5458 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5459 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5460 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5461 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5462 ) { 5463 if (floatx80_is_signaling_nan(a, status) 5464 || floatx80_is_signaling_nan(b, status)) { 5465 float_raise(float_flag_invalid, status); 5466 } 5467 return 0; 5468 } 5469 return 5470 ( a.low == b.low ) 5471 && ( ( a.high == b.high ) 5472 || ( ( a.low == 0 ) 5473 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 5474 ); 5475 5476 } 5477 5478 /*---------------------------------------------------------------------------- 5479 | Returns 1 if the extended double-precision floating-point value `a' is less 5480 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs 5481 | do not cause an exception. Otherwise, the comparison is performed according 5482 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5483 *----------------------------------------------------------------------------*/ 5484 5485 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status) 5486 { 5487 flag aSign, bSign; 5488 5489 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5490 float_raise(float_flag_invalid, status); 5491 return 0; 5492 } 5493 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5494 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5495 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5496 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5497 ) { 5498 if (floatx80_is_signaling_nan(a, status) 5499 || floatx80_is_signaling_nan(b, status)) { 5500 float_raise(float_flag_invalid, status); 5501 } 5502 return 0; 5503 } 5504 aSign = extractFloatx80Sign( a ); 5505 bSign = extractFloatx80Sign( b ); 5506 if ( aSign != bSign ) { 5507 return 5508 aSign 5509 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5510 == 0 ); 5511 } 5512 return 5513 aSign ? le128( b.high, b.low, a.high, a.low ) 5514 : le128( a.high, a.low, b.high, b.low ); 5515 5516 } 5517 5518 /*---------------------------------------------------------------------------- 5519 | Returns 1 if the extended double-precision floating-point value `a' is less 5520 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause 5521 | an exception. Otherwise, the comparison is performed according to the 5522 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 5523 *----------------------------------------------------------------------------*/ 5524 5525 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status) 5526 { 5527 flag aSign, bSign; 5528 5529 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5530 float_raise(float_flag_invalid, status); 5531 return 0; 5532 } 5533 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5534 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5535 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5536 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5537 ) { 5538 if (floatx80_is_signaling_nan(a, status) 5539 || floatx80_is_signaling_nan(b, status)) { 5540 float_raise(float_flag_invalid, status); 5541 } 5542 return 0; 5543 } 5544 aSign = extractFloatx80Sign( a ); 5545 bSign = extractFloatx80Sign( b ); 5546 if ( aSign != bSign ) { 5547 return 5548 aSign 5549 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 5550 != 0 ); 5551 } 5552 return 5553 aSign ? lt128( b.high, b.low, a.high, a.low ) 5554 : lt128( a.high, a.low, b.high, b.low ); 5555 5556 } 5557 5558 /*---------------------------------------------------------------------------- 5559 | Returns 1 if the extended double-precision floating-point values `a' and `b' 5560 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception. 5561 | The comparison is performed according to the IEC/IEEE Standard for Binary 5562 | Floating-Point Arithmetic. 5563 *----------------------------------------------------------------------------*/ 5564 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status) 5565 { 5566 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 5567 float_raise(float_flag_invalid, status); 5568 return 1; 5569 } 5570 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) 5571 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) 5572 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) 5573 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) ) 5574 ) { 5575 if (floatx80_is_signaling_nan(a, status) 5576 || floatx80_is_signaling_nan(b, status)) { 5577 float_raise(float_flag_invalid, status); 5578 } 5579 return 1; 5580 } 5581 return 0; 5582 } 5583 5584 /*---------------------------------------------------------------------------- 5585 | Returns the result of converting the quadruple-precision floating-point 5586 | value `a' to the 32-bit two's complement integer format. The conversion 5587 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5588 | Arithmetic---which means in particular that the conversion is rounded 5589 | according to the current rounding mode. If `a' is a NaN, the largest 5590 | positive integer is returned. Otherwise, if the conversion overflows, the 5591 | largest integer with the same sign as `a' is returned. 5592 *----------------------------------------------------------------------------*/ 5593 5594 int32_t float128_to_int32(float128 a, float_status *status) 5595 { 5596 flag aSign; 5597 int32_t aExp, shiftCount; 5598 uint64_t aSig0, aSig1; 5599 5600 aSig1 = extractFloat128Frac1( a ); 5601 aSig0 = extractFloat128Frac0( a ); 5602 aExp = extractFloat128Exp( a ); 5603 aSign = extractFloat128Sign( a ); 5604 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; 5605 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5606 aSig0 |= ( aSig1 != 0 ); 5607 shiftCount = 0x4028 - aExp; 5608 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); 5609 return roundAndPackInt32(aSign, aSig0, status); 5610 5611 } 5612 5613 /*---------------------------------------------------------------------------- 5614 | Returns the result of converting the quadruple-precision floating-point 5615 | value `a' to the 32-bit two's complement integer format. The conversion 5616 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5617 | Arithmetic, except that the conversion is always rounded toward zero. If 5618 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the 5619 | conversion overflows, the largest integer with the same sign as `a' is 5620 | returned. 5621 *----------------------------------------------------------------------------*/ 5622 5623 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status) 5624 { 5625 flag aSign; 5626 int32_t aExp, shiftCount; 5627 uint64_t aSig0, aSig1, savedASig; 5628 int32_t z; 5629 5630 aSig1 = extractFloat128Frac1( a ); 5631 aSig0 = extractFloat128Frac0( a ); 5632 aExp = extractFloat128Exp( a ); 5633 aSign = extractFloat128Sign( a ); 5634 aSig0 |= ( aSig1 != 0 ); 5635 if ( 0x401E < aExp ) { 5636 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; 5637 goto invalid; 5638 } 5639 else if ( aExp < 0x3FFF ) { 5640 if (aExp || aSig0) { 5641 status->float_exception_flags |= float_flag_inexact; 5642 } 5643 return 0; 5644 } 5645 aSig0 |= LIT64( 0x0001000000000000 ); 5646 shiftCount = 0x402F - aExp; 5647 savedASig = aSig0; 5648 aSig0 >>= shiftCount; 5649 z = aSig0; 5650 if ( aSign ) z = - z; 5651 if ( ( z < 0 ) ^ aSign ) { 5652 invalid: 5653 float_raise(float_flag_invalid, status); 5654 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF; 5655 } 5656 if ( ( aSig0<<shiftCount ) != savedASig ) { 5657 status->float_exception_flags |= float_flag_inexact; 5658 } 5659 return z; 5660 5661 } 5662 5663 /*---------------------------------------------------------------------------- 5664 | Returns the result of converting the quadruple-precision floating-point 5665 | value `a' to the 64-bit two's complement integer format. The conversion 5666 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5667 | Arithmetic---which means in particular that the conversion is rounded 5668 | according to the current rounding mode. If `a' is a NaN, the largest 5669 | positive integer is returned. Otherwise, if the conversion overflows, the 5670 | largest integer with the same sign as `a' is returned. 5671 *----------------------------------------------------------------------------*/ 5672 5673 int64_t float128_to_int64(float128 a, float_status *status) 5674 { 5675 flag aSign; 5676 int32_t aExp, shiftCount; 5677 uint64_t aSig0, aSig1; 5678 5679 aSig1 = extractFloat128Frac1( a ); 5680 aSig0 = extractFloat128Frac0( a ); 5681 aExp = extractFloat128Exp( a ); 5682 aSign = extractFloat128Sign( a ); 5683 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5684 shiftCount = 0x402F - aExp; 5685 if ( shiftCount <= 0 ) { 5686 if ( 0x403E < aExp ) { 5687 float_raise(float_flag_invalid, status); 5688 if ( ! aSign 5689 || ( ( aExp == 0x7FFF ) 5690 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) 5691 ) 5692 ) { 5693 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5694 } 5695 return (int64_t) LIT64( 0x8000000000000000 ); 5696 } 5697 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); 5698 } 5699 else { 5700 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); 5701 } 5702 return roundAndPackInt64(aSign, aSig0, aSig1, status); 5703 5704 } 5705 5706 /*---------------------------------------------------------------------------- 5707 | Returns the result of converting the quadruple-precision floating-point 5708 | value `a' to the 64-bit two's complement integer format. The conversion 5709 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5710 | Arithmetic, except that the conversion is always rounded toward zero. 5711 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if 5712 | the conversion overflows, the largest integer with the same sign as `a' is 5713 | returned. 5714 *----------------------------------------------------------------------------*/ 5715 5716 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status) 5717 { 5718 flag aSign; 5719 int32_t aExp, shiftCount; 5720 uint64_t aSig0, aSig1; 5721 int64_t z; 5722 5723 aSig1 = extractFloat128Frac1( a ); 5724 aSig0 = extractFloat128Frac0( a ); 5725 aExp = extractFloat128Exp( a ); 5726 aSign = extractFloat128Sign( a ); 5727 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); 5728 shiftCount = aExp - 0x402F; 5729 if ( 0 < shiftCount ) { 5730 if ( 0x403E <= aExp ) { 5731 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); 5732 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) 5733 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { 5734 if (aSig1) { 5735 status->float_exception_flags |= float_flag_inexact; 5736 } 5737 } 5738 else { 5739 float_raise(float_flag_invalid, status); 5740 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { 5741 return LIT64( 0x7FFFFFFFFFFFFFFF ); 5742 } 5743 } 5744 return (int64_t) LIT64( 0x8000000000000000 ); 5745 } 5746 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); 5747 if ( (uint64_t) ( aSig1<<shiftCount ) ) { 5748 status->float_exception_flags |= float_flag_inexact; 5749 } 5750 } 5751 else { 5752 if ( aExp < 0x3FFF ) { 5753 if ( aExp | aSig0 | aSig1 ) { 5754 status->float_exception_flags |= float_flag_inexact; 5755 } 5756 return 0; 5757 } 5758 z = aSig0>>( - shiftCount ); 5759 if ( aSig1 5760 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) { 5761 status->float_exception_flags |= float_flag_inexact; 5762 } 5763 } 5764 if ( aSign ) z = - z; 5765 return z; 5766 5767 } 5768 5769 /*---------------------------------------------------------------------------- 5770 | Returns the result of converting the quadruple-precision floating-point value 5771 | `a' to the 64-bit unsigned integer format. The conversion is 5772 | performed according to the IEC/IEEE Standard for Binary Floating-Point 5773 | Arithmetic---which means in particular that the conversion is rounded 5774 | according to the current rounding mode. If `a' is a NaN, the largest 5775 | positive integer is returned. If the conversion overflows, the 5776 | largest unsigned integer is returned. If 'a' is negative, the value is 5777 | rounded and zero is returned; negative values that do not round to zero 5778 | will raise the inexact exception. 5779 *----------------------------------------------------------------------------*/ 5780 5781 uint64_t float128_to_uint64(float128 a, float_status *status) 5782 { 5783 flag aSign; 5784 int aExp; 5785 int shiftCount; 5786 uint64_t aSig0, aSig1; 5787 5788 aSig0 = extractFloat128Frac0(a); 5789 aSig1 = extractFloat128Frac1(a); 5790 aExp = extractFloat128Exp(a); 5791 aSign = extractFloat128Sign(a); 5792 if (aSign && (aExp > 0x3FFE)) { 5793 float_raise(float_flag_invalid, status); 5794 if (float128_is_any_nan(a)) { 5795 return LIT64(0xFFFFFFFFFFFFFFFF); 5796 } else { 5797 return 0; 5798 } 5799 } 5800 if (aExp) { 5801 aSig0 |= LIT64(0x0001000000000000); 5802 } 5803 shiftCount = 0x402F - aExp; 5804 if (shiftCount <= 0) { 5805 if (0x403E < aExp) { 5806 float_raise(float_flag_invalid, status); 5807 return LIT64(0xFFFFFFFFFFFFFFFF); 5808 } 5809 shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1); 5810 } else { 5811 shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1); 5812 } 5813 return roundAndPackUint64(aSign, aSig0, aSig1, status); 5814 } 5815 5816 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status) 5817 { 5818 uint64_t v; 5819 signed char current_rounding_mode = status->float_rounding_mode; 5820 5821 set_float_rounding_mode(float_round_to_zero, status); 5822 v = float128_to_uint64(a, status); 5823 set_float_rounding_mode(current_rounding_mode, status); 5824 5825 return v; 5826 } 5827 5828 /*---------------------------------------------------------------------------- 5829 | Returns the result of converting the quadruple-precision floating-point 5830 | value `a' to the 32-bit unsigned integer format. The conversion 5831 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5832 | Arithmetic except that the conversion is always rounded toward zero. 5833 | If `a' is a NaN, the largest positive integer is returned. Otherwise, 5834 | if the conversion overflows, the largest unsigned integer is returned. 5835 | If 'a' is negative, the value is rounded and zero is returned; negative 5836 | values that do not round to zero will raise the inexact exception. 5837 *----------------------------------------------------------------------------*/ 5838 5839 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status) 5840 { 5841 uint64_t v; 5842 uint32_t res; 5843 int old_exc_flags = get_float_exception_flags(status); 5844 5845 v = float128_to_uint64_round_to_zero(a, status); 5846 if (v > 0xffffffff) { 5847 res = 0xffffffff; 5848 } else { 5849 return v; 5850 } 5851 set_float_exception_flags(old_exc_flags, status); 5852 float_raise(float_flag_invalid, status); 5853 return res; 5854 } 5855 5856 /*---------------------------------------------------------------------------- 5857 | Returns the result of converting the quadruple-precision floating-point 5858 | value `a' to the single-precision floating-point format. The conversion 5859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5860 | Arithmetic. 5861 *----------------------------------------------------------------------------*/ 5862 5863 float32 float128_to_float32(float128 a, float_status *status) 5864 { 5865 flag aSign; 5866 int32_t aExp; 5867 uint64_t aSig0, aSig1; 5868 uint32_t zSig; 5869 5870 aSig1 = extractFloat128Frac1( a ); 5871 aSig0 = extractFloat128Frac0( a ); 5872 aExp = extractFloat128Exp( a ); 5873 aSign = extractFloat128Sign( a ); 5874 if ( aExp == 0x7FFF ) { 5875 if ( aSig0 | aSig1 ) { 5876 return commonNaNToFloat32(float128ToCommonNaN(a, status), status); 5877 } 5878 return packFloat32( aSign, 0xFF, 0 ); 5879 } 5880 aSig0 |= ( aSig1 != 0 ); 5881 shift64RightJamming( aSig0, 18, &aSig0 ); 5882 zSig = aSig0; 5883 if ( aExp || zSig ) { 5884 zSig |= 0x40000000; 5885 aExp -= 0x3F81; 5886 } 5887 return roundAndPackFloat32(aSign, aExp, zSig, status); 5888 5889 } 5890 5891 /*---------------------------------------------------------------------------- 5892 | Returns the result of converting the quadruple-precision floating-point 5893 | value `a' to the double-precision floating-point format. The conversion 5894 | is performed according to the IEC/IEEE Standard for Binary Floating-Point 5895 | Arithmetic. 5896 *----------------------------------------------------------------------------*/ 5897 5898 float64 float128_to_float64(float128 a, float_status *status) 5899 { 5900 flag aSign; 5901 int32_t aExp; 5902 uint64_t aSig0, aSig1; 5903 5904 aSig1 = extractFloat128Frac1( a ); 5905 aSig0 = extractFloat128Frac0( a ); 5906 aExp = extractFloat128Exp( a ); 5907 aSign = extractFloat128Sign( a ); 5908 if ( aExp == 0x7FFF ) { 5909 if ( aSig0 | aSig1 ) { 5910 return commonNaNToFloat64(float128ToCommonNaN(a, status), status); 5911 } 5912 return packFloat64( aSign, 0x7FF, 0 ); 5913 } 5914 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 5915 aSig0 |= ( aSig1 != 0 ); 5916 if ( aExp || aSig0 ) { 5917 aSig0 |= LIT64( 0x4000000000000000 ); 5918 aExp -= 0x3C01; 5919 } 5920 return roundAndPackFloat64(aSign, aExp, aSig0, status); 5921 5922 } 5923 5924 /*---------------------------------------------------------------------------- 5925 | Returns the result of converting the quadruple-precision floating-point 5926 | value `a' to the extended double-precision floating-point format. The 5927 | conversion is performed according to the IEC/IEEE Standard for Binary 5928 | Floating-Point Arithmetic. 5929 *----------------------------------------------------------------------------*/ 5930 5931 floatx80 float128_to_floatx80(float128 a, float_status *status) 5932 { 5933 flag aSign; 5934 int32_t aExp; 5935 uint64_t aSig0, aSig1; 5936 5937 aSig1 = extractFloat128Frac1( a ); 5938 aSig0 = extractFloat128Frac0( a ); 5939 aExp = extractFloat128Exp( a ); 5940 aSign = extractFloat128Sign( a ); 5941 if ( aExp == 0x7FFF ) { 5942 if ( aSig0 | aSig1 ) { 5943 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status); 5944 } 5945 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); 5946 } 5947 if ( aExp == 0 ) { 5948 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); 5949 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 5950 } 5951 else { 5952 aSig0 |= LIT64( 0x0001000000000000 ); 5953 } 5954 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); 5955 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status); 5956 5957 } 5958 5959 /*---------------------------------------------------------------------------- 5960 | Rounds the quadruple-precision floating-point value `a' to an integer, and 5961 | returns the result as a quadruple-precision floating-point value. The 5962 | operation is performed according to the IEC/IEEE Standard for Binary 5963 | Floating-Point Arithmetic. 5964 *----------------------------------------------------------------------------*/ 5965 5966 float128 float128_round_to_int(float128 a, float_status *status) 5967 { 5968 flag aSign; 5969 int32_t aExp; 5970 uint64_t lastBitMask, roundBitsMask; 5971 float128 z; 5972 5973 aExp = extractFloat128Exp( a ); 5974 if ( 0x402F <= aExp ) { 5975 if ( 0x406F <= aExp ) { 5976 if ( ( aExp == 0x7FFF ) 5977 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) 5978 ) { 5979 return propagateFloat128NaN(a, a, status); 5980 } 5981 return a; 5982 } 5983 lastBitMask = 1; 5984 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; 5985 roundBitsMask = lastBitMask - 1; 5986 z = a; 5987 switch (status->float_rounding_mode) { 5988 case float_round_nearest_even: 5989 if ( lastBitMask ) { 5990 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); 5991 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; 5992 } 5993 else { 5994 if ( (int64_t) z.low < 0 ) { 5995 ++z.high; 5996 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1; 5997 } 5998 } 5999 break; 6000 case float_round_ties_away: 6001 if (lastBitMask) { 6002 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low); 6003 } else { 6004 if ((int64_t) z.low < 0) { 6005 ++z.high; 6006 } 6007 } 6008 break; 6009 case float_round_to_zero: 6010 break; 6011 case float_round_up: 6012 if (!extractFloat128Sign(z)) { 6013 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6014 } 6015 break; 6016 case float_round_down: 6017 if (extractFloat128Sign(z)) { 6018 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low); 6019 } 6020 break; 6021 default: 6022 abort(); 6023 } 6024 z.low &= ~ roundBitsMask; 6025 } 6026 else { 6027 if ( aExp < 0x3FFF ) { 6028 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a; 6029 status->float_exception_flags |= float_flag_inexact; 6030 aSign = extractFloat128Sign( a ); 6031 switch (status->float_rounding_mode) { 6032 case float_round_nearest_even: 6033 if ( ( aExp == 0x3FFE ) 6034 && ( extractFloat128Frac0( a ) 6035 | extractFloat128Frac1( a ) ) 6036 ) { 6037 return packFloat128( aSign, 0x3FFF, 0, 0 ); 6038 } 6039 break; 6040 case float_round_ties_away: 6041 if (aExp == 0x3FFE) { 6042 return packFloat128(aSign, 0x3FFF, 0, 0); 6043 } 6044 break; 6045 case float_round_down: 6046 return 6047 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) 6048 : packFloat128( 0, 0, 0, 0 ); 6049 case float_round_up: 6050 return 6051 aSign ? packFloat128( 1, 0, 0, 0 ) 6052 : packFloat128( 0, 0x3FFF, 0, 0 ); 6053 } 6054 return packFloat128( aSign, 0, 0, 0 ); 6055 } 6056 lastBitMask = 1; 6057 lastBitMask <<= 0x402F - aExp; 6058 roundBitsMask = lastBitMask - 1; 6059 z.low = 0; 6060 z.high = a.high; 6061 switch (status->float_rounding_mode) { 6062 case float_round_nearest_even: 6063 z.high += lastBitMask>>1; 6064 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { 6065 z.high &= ~ lastBitMask; 6066 } 6067 break; 6068 case float_round_ties_away: 6069 z.high += lastBitMask>>1; 6070 break; 6071 case float_round_to_zero: 6072 break; 6073 case float_round_up: 6074 if (!extractFloat128Sign(z)) { 6075 z.high |= ( a.low != 0 ); 6076 z.high += roundBitsMask; 6077 } 6078 break; 6079 case float_round_down: 6080 if (extractFloat128Sign(z)) { 6081 z.high |= (a.low != 0); 6082 z.high += roundBitsMask; 6083 } 6084 break; 6085 default: 6086 abort(); 6087 } 6088 z.high &= ~ roundBitsMask; 6089 } 6090 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { 6091 status->float_exception_flags |= float_flag_inexact; 6092 } 6093 return z; 6094 6095 } 6096 6097 /*---------------------------------------------------------------------------- 6098 | Returns the result of adding the absolute values of the quadruple-precision 6099 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated 6100 | before being returned. `zSign' is ignored if the result is a NaN. 6101 | The addition is performed according to the IEC/IEEE Standard for Binary 6102 | Floating-Point Arithmetic. 6103 *----------------------------------------------------------------------------*/ 6104 6105 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign, 6106 float_status *status) 6107 { 6108 int32_t aExp, bExp, zExp; 6109 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6110 int32_t expDiff; 6111 6112 aSig1 = extractFloat128Frac1( a ); 6113 aSig0 = extractFloat128Frac0( a ); 6114 aExp = extractFloat128Exp( a ); 6115 bSig1 = extractFloat128Frac1( b ); 6116 bSig0 = extractFloat128Frac0( b ); 6117 bExp = extractFloat128Exp( b ); 6118 expDiff = aExp - bExp; 6119 if ( 0 < expDiff ) { 6120 if ( aExp == 0x7FFF ) { 6121 if (aSig0 | aSig1) { 6122 return propagateFloat128NaN(a, b, status); 6123 } 6124 return a; 6125 } 6126 if ( bExp == 0 ) { 6127 --expDiff; 6128 } 6129 else { 6130 bSig0 |= LIT64( 0x0001000000000000 ); 6131 } 6132 shift128ExtraRightJamming( 6133 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); 6134 zExp = aExp; 6135 } 6136 else if ( expDiff < 0 ) { 6137 if ( bExp == 0x7FFF ) { 6138 if (bSig0 | bSig1) { 6139 return propagateFloat128NaN(a, b, status); 6140 } 6141 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6142 } 6143 if ( aExp == 0 ) { 6144 ++expDiff; 6145 } 6146 else { 6147 aSig0 |= LIT64( 0x0001000000000000 ); 6148 } 6149 shift128ExtraRightJamming( 6150 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); 6151 zExp = bExp; 6152 } 6153 else { 6154 if ( aExp == 0x7FFF ) { 6155 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6156 return propagateFloat128NaN(a, b, status); 6157 } 6158 return a; 6159 } 6160 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6161 if ( aExp == 0 ) { 6162 if (status->flush_to_zero) { 6163 if (zSig0 | zSig1) { 6164 float_raise(float_flag_output_denormal, status); 6165 } 6166 return packFloat128(zSign, 0, 0, 0); 6167 } 6168 return packFloat128( zSign, 0, zSig0, zSig1 ); 6169 } 6170 zSig2 = 0; 6171 zSig0 |= LIT64( 0x0002000000000000 ); 6172 zExp = aExp; 6173 goto shiftRight1; 6174 } 6175 aSig0 |= LIT64( 0x0001000000000000 ); 6176 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6177 --zExp; 6178 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; 6179 ++zExp; 6180 shiftRight1: 6181 shift128ExtraRightJamming( 6182 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6183 roundAndPack: 6184 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6185 6186 } 6187 6188 /*---------------------------------------------------------------------------- 6189 | Returns the result of subtracting the absolute values of the quadruple- 6190 | precision floating-point values `a' and `b'. If `zSign' is 1, the 6191 | difference is negated before being returned. `zSign' is ignored if the 6192 | result is a NaN. The subtraction is performed according to the IEC/IEEE 6193 | Standard for Binary Floating-Point Arithmetic. 6194 *----------------------------------------------------------------------------*/ 6195 6196 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign, 6197 float_status *status) 6198 { 6199 int32_t aExp, bExp, zExp; 6200 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; 6201 int32_t expDiff; 6202 6203 aSig1 = extractFloat128Frac1( a ); 6204 aSig0 = extractFloat128Frac0( a ); 6205 aExp = extractFloat128Exp( a ); 6206 bSig1 = extractFloat128Frac1( b ); 6207 bSig0 = extractFloat128Frac0( b ); 6208 bExp = extractFloat128Exp( b ); 6209 expDiff = aExp - bExp; 6210 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); 6211 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); 6212 if ( 0 < expDiff ) goto aExpBigger; 6213 if ( expDiff < 0 ) goto bExpBigger; 6214 if ( aExp == 0x7FFF ) { 6215 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { 6216 return propagateFloat128NaN(a, b, status); 6217 } 6218 float_raise(float_flag_invalid, status); 6219 return float128_default_nan(status); 6220 } 6221 if ( aExp == 0 ) { 6222 aExp = 1; 6223 bExp = 1; 6224 } 6225 if ( bSig0 < aSig0 ) goto aBigger; 6226 if ( aSig0 < bSig0 ) goto bBigger; 6227 if ( bSig1 < aSig1 ) goto aBigger; 6228 if ( aSig1 < bSig1 ) goto bBigger; 6229 return packFloat128(status->float_rounding_mode == float_round_down, 6230 0, 0, 0); 6231 bExpBigger: 6232 if ( bExp == 0x7FFF ) { 6233 if (bSig0 | bSig1) { 6234 return propagateFloat128NaN(a, b, status); 6235 } 6236 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); 6237 } 6238 if ( aExp == 0 ) { 6239 ++expDiff; 6240 } 6241 else { 6242 aSig0 |= LIT64( 0x4000000000000000 ); 6243 } 6244 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6245 bSig0 |= LIT64( 0x4000000000000000 ); 6246 bBigger: 6247 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6248 zExp = bExp; 6249 zSign ^= 1; 6250 goto normalizeRoundAndPack; 6251 aExpBigger: 6252 if ( aExp == 0x7FFF ) { 6253 if (aSig0 | aSig1) { 6254 return propagateFloat128NaN(a, b, status); 6255 } 6256 return a; 6257 } 6258 if ( bExp == 0 ) { 6259 --expDiff; 6260 } 6261 else { 6262 bSig0 |= LIT64( 0x4000000000000000 ); 6263 } 6264 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); 6265 aSig0 |= LIT64( 0x4000000000000000 ); 6266 aBigger: 6267 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); 6268 zExp = aExp; 6269 normalizeRoundAndPack: 6270 --zExp; 6271 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, 6272 status); 6273 6274 } 6275 6276 /*---------------------------------------------------------------------------- 6277 | Returns the result of adding the quadruple-precision floating-point values 6278 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard 6279 | for Binary Floating-Point Arithmetic. 6280 *----------------------------------------------------------------------------*/ 6281 6282 float128 float128_add(float128 a, float128 b, float_status *status) 6283 { 6284 flag aSign, bSign; 6285 6286 aSign = extractFloat128Sign( a ); 6287 bSign = extractFloat128Sign( b ); 6288 if ( aSign == bSign ) { 6289 return addFloat128Sigs(a, b, aSign, status); 6290 } 6291 else { 6292 return subFloat128Sigs(a, b, aSign, status); 6293 } 6294 6295 } 6296 6297 /*---------------------------------------------------------------------------- 6298 | Returns the result of subtracting the quadruple-precision floating-point 6299 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6300 | Standard for Binary Floating-Point Arithmetic. 6301 *----------------------------------------------------------------------------*/ 6302 6303 float128 float128_sub(float128 a, float128 b, float_status *status) 6304 { 6305 flag aSign, bSign; 6306 6307 aSign = extractFloat128Sign( a ); 6308 bSign = extractFloat128Sign( b ); 6309 if ( aSign == bSign ) { 6310 return subFloat128Sigs(a, b, aSign, status); 6311 } 6312 else { 6313 return addFloat128Sigs(a, b, aSign, status); 6314 } 6315 6316 } 6317 6318 /*---------------------------------------------------------------------------- 6319 | Returns the result of multiplying the quadruple-precision floating-point 6320 | values `a' and `b'. The operation is performed according to the IEC/IEEE 6321 | Standard for Binary Floating-Point Arithmetic. 6322 *----------------------------------------------------------------------------*/ 6323 6324 float128 float128_mul(float128 a, float128 b, float_status *status) 6325 { 6326 flag aSign, bSign, zSign; 6327 int32_t aExp, bExp, zExp; 6328 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; 6329 6330 aSig1 = extractFloat128Frac1( a ); 6331 aSig0 = extractFloat128Frac0( a ); 6332 aExp = extractFloat128Exp( a ); 6333 aSign = extractFloat128Sign( a ); 6334 bSig1 = extractFloat128Frac1( b ); 6335 bSig0 = extractFloat128Frac0( b ); 6336 bExp = extractFloat128Exp( b ); 6337 bSign = extractFloat128Sign( b ); 6338 zSign = aSign ^ bSign; 6339 if ( aExp == 0x7FFF ) { 6340 if ( ( aSig0 | aSig1 ) 6341 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6342 return propagateFloat128NaN(a, b, status); 6343 } 6344 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; 6345 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6346 } 6347 if ( bExp == 0x7FFF ) { 6348 if (bSig0 | bSig1) { 6349 return propagateFloat128NaN(a, b, status); 6350 } 6351 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6352 invalid: 6353 float_raise(float_flag_invalid, status); 6354 return float128_default_nan(status); 6355 } 6356 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6357 } 6358 if ( aExp == 0 ) { 6359 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6360 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6361 } 6362 if ( bExp == 0 ) { 6363 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6364 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6365 } 6366 zExp = aExp + bExp - 0x4000; 6367 aSig0 |= LIT64( 0x0001000000000000 ); 6368 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); 6369 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); 6370 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); 6371 zSig2 |= ( zSig3 != 0 ); 6372 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { 6373 shift128ExtraRightJamming( 6374 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); 6375 ++zExp; 6376 } 6377 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6378 6379 } 6380 6381 /*---------------------------------------------------------------------------- 6382 | Returns the result of dividing the quadruple-precision floating-point value 6383 | `a' by the corresponding value `b'. The operation is performed according to 6384 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6385 *----------------------------------------------------------------------------*/ 6386 6387 float128 float128_div(float128 a, float128 b, float_status *status) 6388 { 6389 flag aSign, bSign, zSign; 6390 int32_t aExp, bExp, zExp; 6391 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; 6392 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6393 6394 aSig1 = extractFloat128Frac1( a ); 6395 aSig0 = extractFloat128Frac0( a ); 6396 aExp = extractFloat128Exp( a ); 6397 aSign = extractFloat128Sign( a ); 6398 bSig1 = extractFloat128Frac1( b ); 6399 bSig0 = extractFloat128Frac0( b ); 6400 bExp = extractFloat128Exp( b ); 6401 bSign = extractFloat128Sign( b ); 6402 zSign = aSign ^ bSign; 6403 if ( aExp == 0x7FFF ) { 6404 if (aSig0 | aSig1) { 6405 return propagateFloat128NaN(a, b, status); 6406 } 6407 if ( bExp == 0x7FFF ) { 6408 if (bSig0 | bSig1) { 6409 return propagateFloat128NaN(a, b, status); 6410 } 6411 goto invalid; 6412 } 6413 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6414 } 6415 if ( bExp == 0x7FFF ) { 6416 if (bSig0 | bSig1) { 6417 return propagateFloat128NaN(a, b, status); 6418 } 6419 return packFloat128( zSign, 0, 0, 0 ); 6420 } 6421 if ( bExp == 0 ) { 6422 if ( ( bSig0 | bSig1 ) == 0 ) { 6423 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { 6424 invalid: 6425 float_raise(float_flag_invalid, status); 6426 return float128_default_nan(status); 6427 } 6428 float_raise(float_flag_divbyzero, status); 6429 return packFloat128( zSign, 0x7FFF, 0, 0 ); 6430 } 6431 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6432 } 6433 if ( aExp == 0 ) { 6434 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); 6435 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6436 } 6437 zExp = aExp - bExp + 0x3FFD; 6438 shortShift128Left( 6439 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); 6440 shortShift128Left( 6441 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6442 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { 6443 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); 6444 ++zExp; 6445 } 6446 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6447 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); 6448 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); 6449 while ( (int64_t) rem0 < 0 ) { 6450 --zSig0; 6451 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); 6452 } 6453 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); 6454 if ( ( zSig1 & 0x3FFF ) <= 4 ) { 6455 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); 6456 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); 6457 while ( (int64_t) rem1 < 0 ) { 6458 --zSig1; 6459 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); 6460 } 6461 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6462 } 6463 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); 6464 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status); 6465 6466 } 6467 6468 /*---------------------------------------------------------------------------- 6469 | Returns the remainder of the quadruple-precision floating-point value `a' 6470 | with respect to the corresponding value `b'. The operation is performed 6471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6472 *----------------------------------------------------------------------------*/ 6473 6474 float128 float128_rem(float128 a, float128 b, float_status *status) 6475 { 6476 flag aSign, zSign; 6477 int32_t aExp, bExp, expDiff; 6478 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; 6479 uint64_t allZero, alternateASig0, alternateASig1, sigMean1; 6480 int64_t sigMean0; 6481 6482 aSig1 = extractFloat128Frac1( a ); 6483 aSig0 = extractFloat128Frac0( a ); 6484 aExp = extractFloat128Exp( a ); 6485 aSign = extractFloat128Sign( a ); 6486 bSig1 = extractFloat128Frac1( b ); 6487 bSig0 = extractFloat128Frac0( b ); 6488 bExp = extractFloat128Exp( b ); 6489 if ( aExp == 0x7FFF ) { 6490 if ( ( aSig0 | aSig1 ) 6491 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { 6492 return propagateFloat128NaN(a, b, status); 6493 } 6494 goto invalid; 6495 } 6496 if ( bExp == 0x7FFF ) { 6497 if (bSig0 | bSig1) { 6498 return propagateFloat128NaN(a, b, status); 6499 } 6500 return a; 6501 } 6502 if ( bExp == 0 ) { 6503 if ( ( bSig0 | bSig1 ) == 0 ) { 6504 invalid: 6505 float_raise(float_flag_invalid, status); 6506 return float128_default_nan(status); 6507 } 6508 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); 6509 } 6510 if ( aExp == 0 ) { 6511 if ( ( aSig0 | aSig1 ) == 0 ) return a; 6512 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6513 } 6514 expDiff = aExp - bExp; 6515 if ( expDiff < -1 ) return a; 6516 shortShift128Left( 6517 aSig0 | LIT64( 0x0001000000000000 ), 6518 aSig1, 6519 15 - ( expDiff < 0 ), 6520 &aSig0, 6521 &aSig1 6522 ); 6523 shortShift128Left( 6524 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); 6525 q = le128( bSig0, bSig1, aSig0, aSig1 ); 6526 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6527 expDiff -= 64; 6528 while ( 0 < expDiff ) { 6529 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6530 q = ( 4 < q ) ? q - 4 : 0; 6531 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6532 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); 6533 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); 6534 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); 6535 expDiff -= 61; 6536 } 6537 if ( -64 < expDiff ) { 6538 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); 6539 q = ( 4 < q ) ? q - 4 : 0; 6540 q >>= - expDiff; 6541 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6542 expDiff += 52; 6543 if ( expDiff < 0 ) { 6544 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); 6545 } 6546 else { 6547 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); 6548 } 6549 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); 6550 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); 6551 } 6552 else { 6553 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); 6554 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); 6555 } 6556 do { 6557 alternateASig0 = aSig0; 6558 alternateASig1 = aSig1; 6559 ++q; 6560 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); 6561 } while ( 0 <= (int64_t) aSig0 ); 6562 add128( 6563 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 ); 6564 if ( ( sigMean0 < 0 ) 6565 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { 6566 aSig0 = alternateASig0; 6567 aSig1 = alternateASig1; 6568 } 6569 zSign = ( (int64_t) aSig0 < 0 ); 6570 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); 6571 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1, 6572 status); 6573 } 6574 6575 /*---------------------------------------------------------------------------- 6576 | Returns the square root of the quadruple-precision floating-point value `a'. 6577 | The operation is performed according to the IEC/IEEE Standard for Binary 6578 | Floating-Point Arithmetic. 6579 *----------------------------------------------------------------------------*/ 6580 6581 float128 float128_sqrt(float128 a, float_status *status) 6582 { 6583 flag aSign; 6584 int32_t aExp, zExp; 6585 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; 6586 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3; 6587 6588 aSig1 = extractFloat128Frac1( a ); 6589 aSig0 = extractFloat128Frac0( a ); 6590 aExp = extractFloat128Exp( a ); 6591 aSign = extractFloat128Sign( a ); 6592 if ( aExp == 0x7FFF ) { 6593 if (aSig0 | aSig1) { 6594 return propagateFloat128NaN(a, a, status); 6595 } 6596 if ( ! aSign ) return a; 6597 goto invalid; 6598 } 6599 if ( aSign ) { 6600 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; 6601 invalid: 6602 float_raise(float_flag_invalid, status); 6603 return float128_default_nan(status); 6604 } 6605 if ( aExp == 0 ) { 6606 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); 6607 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); 6608 } 6609 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; 6610 aSig0 |= LIT64( 0x0001000000000000 ); 6611 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); 6612 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); 6613 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); 6614 doubleZSig0 = zSig0<<1; 6615 mul64To128( zSig0, zSig0, &term0, &term1 ); 6616 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); 6617 while ( (int64_t) rem0 < 0 ) { 6618 --zSig0; 6619 doubleZSig0 -= 2; 6620 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); 6621 } 6622 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); 6623 if ( ( zSig1 & 0x1FFF ) <= 5 ) { 6624 if ( zSig1 == 0 ) zSig1 = 1; 6625 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); 6626 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); 6627 mul64To128( zSig1, zSig1, &term2, &term3 ); 6628 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); 6629 while ( (int64_t) rem1 < 0 ) { 6630 --zSig1; 6631 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); 6632 term3 |= 1; 6633 term2 |= doubleZSig0; 6634 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); 6635 } 6636 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); 6637 } 6638 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); 6639 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status); 6640 6641 } 6642 6643 /*---------------------------------------------------------------------------- 6644 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6645 | the corresponding value `b', and 0 otherwise. The invalid exception is 6646 | raised if either operand is a NaN. Otherwise, the comparison is performed 6647 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6648 *----------------------------------------------------------------------------*/ 6649 6650 int float128_eq(float128 a, float128 b, float_status *status) 6651 { 6652 6653 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6654 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6655 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6656 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6657 ) { 6658 float_raise(float_flag_invalid, status); 6659 return 0; 6660 } 6661 return 6662 ( a.low == b.low ) 6663 && ( ( a.high == b.high ) 6664 || ( ( a.low == 0 ) 6665 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6666 ); 6667 6668 } 6669 6670 /*---------------------------------------------------------------------------- 6671 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6672 | or equal to the corresponding value `b', and 0 otherwise. The invalid 6673 | exception is raised if either operand is a NaN. The comparison is performed 6674 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6675 *----------------------------------------------------------------------------*/ 6676 6677 int float128_le(float128 a, float128 b, float_status *status) 6678 { 6679 flag aSign, bSign; 6680 6681 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6682 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6683 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6684 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6685 ) { 6686 float_raise(float_flag_invalid, status); 6687 return 0; 6688 } 6689 aSign = extractFloat128Sign( a ); 6690 bSign = extractFloat128Sign( b ); 6691 if ( aSign != bSign ) { 6692 return 6693 aSign 6694 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6695 == 0 ); 6696 } 6697 return 6698 aSign ? le128( b.high, b.low, a.high, a.low ) 6699 : le128( a.high, a.low, b.high, b.low ); 6700 6701 } 6702 6703 /*---------------------------------------------------------------------------- 6704 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6705 | the corresponding value `b', and 0 otherwise. The invalid exception is 6706 | raised if either operand is a NaN. The comparison is performed according 6707 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6708 *----------------------------------------------------------------------------*/ 6709 6710 int float128_lt(float128 a, float128 b, float_status *status) 6711 { 6712 flag aSign, bSign; 6713 6714 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6715 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6716 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6717 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6718 ) { 6719 float_raise(float_flag_invalid, status); 6720 return 0; 6721 } 6722 aSign = extractFloat128Sign( a ); 6723 bSign = extractFloat128Sign( b ); 6724 if ( aSign != bSign ) { 6725 return 6726 aSign 6727 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6728 != 0 ); 6729 } 6730 return 6731 aSign ? lt128( b.high, b.low, a.high, a.low ) 6732 : lt128( a.high, a.low, b.high, b.low ); 6733 6734 } 6735 6736 /*---------------------------------------------------------------------------- 6737 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6738 | be compared, and 0 otherwise. The invalid exception is raised if either 6739 | operand is a NaN. The comparison is performed according to the IEC/IEEE 6740 | Standard for Binary Floating-Point Arithmetic. 6741 *----------------------------------------------------------------------------*/ 6742 6743 int float128_unordered(float128 a, float128 b, float_status *status) 6744 { 6745 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6746 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6747 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6748 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6749 ) { 6750 float_raise(float_flag_invalid, status); 6751 return 1; 6752 } 6753 return 0; 6754 } 6755 6756 /*---------------------------------------------------------------------------- 6757 | Returns 1 if the quadruple-precision floating-point value `a' is equal to 6758 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6759 | exception. The comparison is performed according to the IEC/IEEE Standard 6760 | for Binary Floating-Point Arithmetic. 6761 *----------------------------------------------------------------------------*/ 6762 6763 int float128_eq_quiet(float128 a, float128 b, float_status *status) 6764 { 6765 6766 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6767 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6768 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6769 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6770 ) { 6771 if (float128_is_signaling_nan(a, status) 6772 || float128_is_signaling_nan(b, status)) { 6773 float_raise(float_flag_invalid, status); 6774 } 6775 return 0; 6776 } 6777 return 6778 ( a.low == b.low ) 6779 && ( ( a.high == b.high ) 6780 || ( ( a.low == 0 ) 6781 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) ) 6782 ); 6783 6784 } 6785 6786 /*---------------------------------------------------------------------------- 6787 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6788 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not 6789 | cause an exception. Otherwise, the comparison is performed according to the 6790 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. 6791 *----------------------------------------------------------------------------*/ 6792 6793 int float128_le_quiet(float128 a, float128 b, float_status *status) 6794 { 6795 flag aSign, bSign; 6796 6797 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6798 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6799 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6800 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6801 ) { 6802 if (float128_is_signaling_nan(a, status) 6803 || float128_is_signaling_nan(b, status)) { 6804 float_raise(float_flag_invalid, status); 6805 } 6806 return 0; 6807 } 6808 aSign = extractFloat128Sign( a ); 6809 bSign = extractFloat128Sign( b ); 6810 if ( aSign != bSign ) { 6811 return 6812 aSign 6813 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6814 == 0 ); 6815 } 6816 return 6817 aSign ? le128( b.high, b.low, a.high, a.low ) 6818 : le128( a.high, a.low, b.high, b.low ); 6819 6820 } 6821 6822 /*---------------------------------------------------------------------------- 6823 | Returns 1 if the quadruple-precision floating-point value `a' is less than 6824 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an 6825 | exception. Otherwise, the comparison is performed according to the IEC/IEEE 6826 | Standard for Binary Floating-Point Arithmetic. 6827 *----------------------------------------------------------------------------*/ 6828 6829 int float128_lt_quiet(float128 a, float128 b, float_status *status) 6830 { 6831 flag aSign, bSign; 6832 6833 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6834 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6835 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6836 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6837 ) { 6838 if (float128_is_signaling_nan(a, status) 6839 || float128_is_signaling_nan(b, status)) { 6840 float_raise(float_flag_invalid, status); 6841 } 6842 return 0; 6843 } 6844 aSign = extractFloat128Sign( a ); 6845 bSign = extractFloat128Sign( b ); 6846 if ( aSign != bSign ) { 6847 return 6848 aSign 6849 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) 6850 != 0 ); 6851 } 6852 return 6853 aSign ? lt128( b.high, b.low, a.high, a.low ) 6854 : lt128( a.high, a.low, b.high, b.low ); 6855 6856 } 6857 6858 /*---------------------------------------------------------------------------- 6859 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot 6860 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The 6861 | comparison is performed according to the IEC/IEEE Standard for Binary 6862 | Floating-Point Arithmetic. 6863 *----------------------------------------------------------------------------*/ 6864 6865 int float128_unordered_quiet(float128 a, float128 b, float_status *status) 6866 { 6867 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) 6868 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) 6869 || ( ( extractFloat128Exp( b ) == 0x7FFF ) 6870 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) 6871 ) { 6872 if (float128_is_signaling_nan(a, status) 6873 || float128_is_signaling_nan(b, status)) { 6874 float_raise(float_flag_invalid, status); 6875 } 6876 return 1; 6877 } 6878 return 0; 6879 } 6880 6881 static inline int floatx80_compare_internal(floatx80 a, floatx80 b, 6882 int is_quiet, float_status *status) 6883 { 6884 flag aSign, bSign; 6885 6886 if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) { 6887 float_raise(float_flag_invalid, status); 6888 return float_relation_unordered; 6889 } 6890 if (( ( extractFloatx80Exp( a ) == 0x7fff ) && 6891 ( extractFloatx80Frac( a )<<1 ) ) || 6892 ( ( extractFloatx80Exp( b ) == 0x7fff ) && 6893 ( extractFloatx80Frac( b )<<1 ) )) { 6894 if (!is_quiet || 6895 floatx80_is_signaling_nan(a, status) || 6896 floatx80_is_signaling_nan(b, status)) { 6897 float_raise(float_flag_invalid, status); 6898 } 6899 return float_relation_unordered; 6900 } 6901 aSign = extractFloatx80Sign( a ); 6902 bSign = extractFloatx80Sign( b ); 6903 if ( aSign != bSign ) { 6904 6905 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) && 6906 ( ( a.low | b.low ) == 0 ) ) { 6907 /* zero case */ 6908 return float_relation_equal; 6909 } else { 6910 return 1 - (2 * aSign); 6911 } 6912 } else { 6913 if (a.low == b.low && a.high == b.high) { 6914 return float_relation_equal; 6915 } else { 6916 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6917 } 6918 } 6919 } 6920 6921 int floatx80_compare(floatx80 a, floatx80 b, float_status *status) 6922 { 6923 return floatx80_compare_internal(a, b, 0, status); 6924 } 6925 6926 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status) 6927 { 6928 return floatx80_compare_internal(a, b, 1, status); 6929 } 6930 6931 static inline int float128_compare_internal(float128 a, float128 b, 6932 int is_quiet, float_status *status) 6933 { 6934 flag aSign, bSign; 6935 6936 if (( ( extractFloat128Exp( a ) == 0x7fff ) && 6937 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || 6938 ( ( extractFloat128Exp( b ) == 0x7fff ) && 6939 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { 6940 if (!is_quiet || 6941 float128_is_signaling_nan(a, status) || 6942 float128_is_signaling_nan(b, status)) { 6943 float_raise(float_flag_invalid, status); 6944 } 6945 return float_relation_unordered; 6946 } 6947 aSign = extractFloat128Sign( a ); 6948 bSign = extractFloat128Sign( b ); 6949 if ( aSign != bSign ) { 6950 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { 6951 /* zero case */ 6952 return float_relation_equal; 6953 } else { 6954 return 1 - (2 * aSign); 6955 } 6956 } else { 6957 if (a.low == b.low && a.high == b.high) { 6958 return float_relation_equal; 6959 } else { 6960 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); 6961 } 6962 } 6963 } 6964 6965 int float128_compare(float128 a, float128 b, float_status *status) 6966 { 6967 return float128_compare_internal(a, b, 0, status); 6968 } 6969 6970 int float128_compare_quiet(float128 a, float128 b, float_status *status) 6971 { 6972 return float128_compare_internal(a, b, 1, status); 6973 } 6974 6975 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status) 6976 { 6977 flag aSign; 6978 int32_t aExp; 6979 uint64_t aSig; 6980 6981 if (floatx80_invalid_encoding(a)) { 6982 float_raise(float_flag_invalid, status); 6983 return floatx80_default_nan(status); 6984 } 6985 aSig = extractFloatx80Frac( a ); 6986 aExp = extractFloatx80Exp( a ); 6987 aSign = extractFloatx80Sign( a ); 6988 6989 if ( aExp == 0x7FFF ) { 6990 if ( aSig<<1 ) { 6991 return propagateFloatx80NaN(a, a, status); 6992 } 6993 return a; 6994 } 6995 6996 if (aExp == 0) { 6997 if (aSig == 0) { 6998 return a; 6999 } 7000 aExp++; 7001 } 7002 7003 if (n > 0x10000) { 7004 n = 0x10000; 7005 } else if (n < -0x10000) { 7006 n = -0x10000; 7007 } 7008 7009 aExp += n; 7010 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision, 7011 aSign, aExp, aSig, 0, status); 7012 } 7013 7014 float128 float128_scalbn(float128 a, int n, float_status *status) 7015 { 7016 flag aSign; 7017 int32_t aExp; 7018 uint64_t aSig0, aSig1; 7019 7020 aSig1 = extractFloat128Frac1( a ); 7021 aSig0 = extractFloat128Frac0( a ); 7022 aExp = extractFloat128Exp( a ); 7023 aSign = extractFloat128Sign( a ); 7024 if ( aExp == 0x7FFF ) { 7025 if ( aSig0 | aSig1 ) { 7026 return propagateFloat128NaN(a, a, status); 7027 } 7028 return a; 7029 } 7030 if (aExp != 0) { 7031 aSig0 |= LIT64( 0x0001000000000000 ); 7032 } else if (aSig0 == 0 && aSig1 == 0) { 7033 return a; 7034 } else { 7035 aExp++; 7036 } 7037 7038 if (n > 0x10000) { 7039 n = 0x10000; 7040 } else if (n < -0x10000) { 7041 n = -0x10000; 7042 } 7043 7044 aExp += n - 1; 7045 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 7046 , status); 7047 7048 } 7049